janmayjay committed
Commit 39a7537 · 1 Parent(s): faad144

Add application file
Files changed (45)
  1. .gitignore +4 -0
  2. app.py +47 -0
  3. checkpoints/assoc_gcn.pt +3 -0
  4. config.json +11 -0
  5. models/model.pth +3 -0
  6. page_transcription.py +100 -0
  7. preprocess.py +21 -0
  8. requirements.txt +140 -0
  9. train.py +15 -0
  10. utils/__pycache__/character_to_character.cpython-311.pyc +0 -0
  11. utils/__pycache__/coco_to_yolo.cpython-311.pyc +0 -0
  12. utils/__pycache__/create_audiobook.cpython-311.pyc +0 -0
  13. utils/__pycache__/create_directories.cpython-311.pyc +0 -0
  14. utils/__pycache__/get_yolo_predictions.cpython-311.pyc +0 -0
  15. utils/__pycache__/identify_speaker.cpython-311.pyc +0 -0
  16. utils/__pycache__/japanese_ocr.cpython-311.pyc +0 -0
  17. utils/__pycache__/json_to_coco.cpython-311.pyc +0 -0
  18. utils/__pycache__/json_to_gcn.cpython-311.pyc +0 -0
  19. utils/__pycache__/panel_ordering.cpython-311.pyc +0 -0
  20. utils/__pycache__/train_speaker.cpython-311.pyc +0 -0
  21. utils/__pycache__/transcript_page.cpython-311.pyc +0 -0
  22. utils/__pycache__/utilities.cpython-311.pyc +0 -0
  23. utils/__pycache__/visual_llm_gender.cpython-311.pyc +0 -0
  24. utils/__pycache__/xml_to_json.cpython-311.pyc +0 -0
  25. utils/character_to_character.py +192 -0
  26. utils/coco_to_yolo.py +52 -0
  27. utils/create_audiobook.py +55 -0
  28. utils/create_directories.py +13 -0
  29. utils/directories.txt +13 -0
  30. utils/get_yolo_predictions.py +11 -0
  31. utils/identify_speaker.py +443 -0
  32. utils/japanese_ocr.py +52 -0
  33. utils/json_to_coco.py +131 -0
  34. utils/json_to_gcn.py +484 -0
  35. utils/panel_ordering.py +258 -0
  36. utils/train_speaker.py +385 -0
  37. utils/transcript_page.py +37 -0
  38. utils/utilities.py +306 -0
  39. utils/visual_llm_gender.py +79 -0
  40. utils/xml_to_json.py +45 -0
  41. yolo11m/runs/detect/train/args.yaml +105 -0
  42. yolo11m/runs/detect/train2/args.yaml +105 -0
  43. yolo11m/runs/detect/train2/results.csv +31 -0
  44. yolo11m/runs/detect/train2/weights/best.pt +3 -0
  45. yolo11m/runs/detect/train2/weights/last.pt +3 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ **/*.jpg
+ **/*.png
+ **/*.jpg
+ **/*.png
app.py ADDED
@@ -0,0 +1,47 @@
+ import gradio as gr
+ import os
+ import shutil
+ import uuid
+ import subprocess
+
+ # Function to handle image upload and transcription
+ def process_image(image):
+     # Save uploaded image to temp folder
+     temp_folder = "./temp_uploads"
+     os.makedirs(temp_folder, exist_ok=True)
+
+     # Generate unique filename
+     image_filename = f"{uuid.uuid4().hex}.jpg"
+     image_path = os.path.join(temp_folder, image_filename)
+
+     # Save image
+     image.save(image_path)
+
+     # Run the transcription script with subprocess
+     try:
+         subprocess.run(["python", "page_transcription.py", f"-img={image_path}"], check=True)
+     except subprocess.CalledProcessError as e:
+         return f"Error during transcription: {e}", None
+
+     # Check if output.wav was generated
+     output_audio_path = "output.wav"
+     if os.path.exists(output_audio_path):
+         return "Transcription complete. Playing audio...", output_audio_path
+     else:
+         return "Failed to generate audio file.", None
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type="pil", label="Upload a Manga Page"),
+     outputs=[
+         gr.Textbox(label="Status"),
+         gr.Audio(label="Generated Audio", type="filepath")
+     ],
+     title="Manga Page Audio Transcription",
+     description="Upload a manga page image, and this tool will transcribe it and play the audio using a backend Python script."
+ )
+
+ # Launch app
+ if __name__ == "__main__":
+     iface.launch()
checkpoints/assoc_gcn.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c09dc23e06af2a1f373cb03748a92f134dcbeab2829543f928a5247e51839fe
+ size 810267
config.json ADDED
@@ -0,0 +1,11 @@
+ {
+     "root" : "./dataset/",
+     "dir_path" : "./utils/directories.txt",
+     "manga109_root_dir" : "./dataset/Manga109/",
+     "root_images" : "./dataset/images/",
+     "root_annotations" : "./dataset/annotations/",
+     "root_labels" : "./dataset/labels/",
+     "root_json" : "./dataset/json/",
+     "yolo_path" : "./yolo11m/runs/detect/train2/weights/best.pt",
+     "speaker_association_path" : "./checkpoints/assoc_gcn.pt"
+ }
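The paths above are read directly by preprocess.py and page_transcription.py below. A minimal sketch of how they are consumed, assuming the scripts are run from the repository root:

import json

with open("config.json") as f:
    config = json.load(f)

yolo_weights = config["yolo_path"]               # detector checkpoint
assoc_ckpt = config["speaker_association_path"]  # speech-to-speaker GCN checkpoint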
models/model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7b8228bcec85fb40eaf2781a63816ccc1e8d6cc43e90a0cd257406e3abc85fc
+ size 102802172
page_transcription.py ADDED
@@ -0,0 +1,100 @@
+ import argparse
+ import json
+ import torch
+ from ultralytics import YOLO
+ from utils.identify_speaker import get_predictions_from_yolo
+ from utils.panel_ordering import panel_ordering
+ from utils.utilities import show_prediction_per_page
+ from utils.utilities import show_panel_order
+ from utils.character_to_character import count_characters_single_page
+ from utils.identify_speaker import identify_speaker, load_trained_speaker_model
+ from utils.utilities import visualize_speaker_results
+ from utils.utilities import process_results
+ from transformers import pipeline
+ from utils.transcript_page import transcript_page
+ from utils.visual_llm_gender import visual_llm_gender
+ from utils.utilities import get_speakers
+ from scipy.io.wavfile import write
+ from utils.create_audiobook import create_audiobook
+ import sounddevice as sd
+ import soundfile as sf
+
+ # def main():
+ #     parser = argparse.ArgumentParser(description="Script to transcribe one single image")
+ #     # parser.add_argument("filename", help="Path to the input file")
+ #     parser.add_argument("-img", "--test_image", type=str, default="./dataset/images/test/AkkeraKanjinchou_083.jpg", help="Test image on which transcription needs to be done")
+ #     parser.add_argument("-vis", "--visualize", type=bool, default=False, help="Visualize predictions")
+ #
+ #     args = parser.parse_args()
+ #     print(f"Test image at : {args.test_image}")
+ #     print(f"Test image at : {args.visualize}")
+ def main():
+     parser = argparse.ArgumentParser(description="Script to transcribe one single image")
+     parser.add_argument(
+         "-img", "--test_image",
+         type=str,
+         default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
+         help="Test image on which transcription needs to be done"
+     )
+     parser.add_argument(
+         "-vis", "--visualize",
+         action="store_true",
+         help="Pass the flag alone to visualize predictions"
+     )
+
+     args = parser.parse_args()
+     print(f"Test image at : {args.test_image}")
+     print(f"Visualize? : {args.visualize}")
+
+     config = {}
+     with open('config.json') as f:
+         config = json.load(f)
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     yolo_model = YOLO(config["yolo_path"])
+     test_image = args.test_image
+
+     predictions = get_predictions_from_yolo(test_image, yolo_model)
+
+     boxOrderEstimator = panel_ordering(test_image, predictions)
+
+     unique_chars, labels, pairs = count_characters_single_page(test_image, predictions, eps=0.3, min_samples=1, device='cuda')
+
+     assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
+     results, predictions = identify_speaker(
+         assoc_model, test_image, yolo_model, 'cuda'
+     )
+     processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)
+
+     # print("Panels predicted present in predictions.panels")
+     # for panel in predictions.panels:
+     #     print(panel)
+     # print("Speech bubbles predicted present in predictions.speech_bubbles")
+     # for panel in predictions.speech_bubbles:
+     #     print(panel)
+     # print("Faces predicted present in predictions.faces")
+     # for panel in predictions.faces:
+     #     print(panel)
+     # print("Bodies predicted present in predictions.bodies")
+     # for panel in predictions.bodies:
+     #     print(panel)
+     translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
+     japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)
+     gender_list = visual_llm_gender(unique_chars, predictions, test_image)
+     speaker_list = get_speakers(gender_list)
+     combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
+     write("output.wav", sample_rate, combined)
+
+     data, fs = sf.read('output.wav')
+     sd.play(data, fs)
+     sd.wait()
+
+     if args.visualize:
+         show_prediction_per_page(test_image, processed_predictions)
+         show_panel_order(test_image, boxOrderEstimator)
+         visualize_speaker_results(test_image, processed_predictions, results)
+
+
+ if __name__ == "__main__":
+     main()
+
+
preprocess.py ADDED
@@ -0,0 +1,21 @@
+ import json
+ from utils.create_directories import create_directories
+ from utils.xml_to_json import xml_to_json
+ from utils.json_to_coco import json_to_coco
+ from utils.coco_to_yolo import coco_to_yolo
+ from utils.json_to_gcn import json_to_gcn
+
+ def preprocess():
+
+     config = {}
+     with open('config.json') as f:
+         config = json.load(f)
+     create_directories(config)
+
+     xml_to_json(config)
+
+     json_to_coco(config)
+
+     coco_to_yolo(config)
+
+     json_to_gcn(config)
requirements.txt ADDED
@@ -0,0 +1,140 @@
+ absl-py==2.3.1
+ accelerate==1.9.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.14
+ aiosignal==1.4.0
+ anyascii==0.3.3
+ asttokens==3.0.0
+ attrs==25.3.0
+ audioread==3.0.1
+ av==15.0.0
+ babel==2.17.0
+ certifi==2025.7.14
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ colorama==0.4.6
+ comm==0.2.3
+ contourpy==1.3.3
+ coqpit-config==0.2.1
+ coqui-tts==0.27.0
+ coqui-tts-trainer==0.3.1
+ cycler==0.12.1
+ Cython==3.1.2
+ dateparser==1.1.8
+ debugpy==1.8.15
+ decorator==5.2.1
+ decord==0.6.0
+ docopt==0.6.2
+ einops==0.8.1
+ encodec==0.1.1
+ executing==2.2.0
+ filelock==3.13.1
+ fire==0.7.0
+ fonttools==4.59.0
+ frozenlist==1.7.0
+ fsspec==2024.6.1
+ fugashi==1.5.1
+ grpcio==1.74.0
+ gruut==2.4.0
+ gruut-ipa==0.13.0
+ gruut_lang_de==2.0.1
+ gruut_lang_en==2.0.1
+ gruut_lang_es==2.0.1
+ gruut_lang_fr==2.0.2
+ hf-xet==1.1.5
+ huggingface-hub==0.34.3
+ idna==3.10
+ inflect==7.5.0
+ ipykernel==6.30.0
+ ipython==9.4.0
+ ipython_pygments_lexers==1.1.1
+ jaconv==0.4.0
+ jedi==0.19.2
+ Jinja2==3.1.4
+ joblib==1.5.1
+ jsonlines==1.2.0
+ jupyter_client==8.6.3
+ jupyter_core==5.8.1
+ kiwisolver==1.4.8
+ lazy_loader==0.4
+ librosa==0.11.0
+ llvmlite==0.44.0
+ loguru==0.7.3
+ manga-ocr==0.1.14
+ manga109api==0.3.1
+ Markdown==3.8.2
+ MarkupSafe==2.1.5
+ matplotlib==3.10.3
+ matplotlib-inline==0.1.7
+ monotonic-alignment-search==0.2.0
+ more-itertools==10.7.0
+ mpmath==1.3.0
+ msgpack==1.1.1
+ multidict==6.6.3
+ nest-asyncio==1.6.0
+ networkx==3.3
+ num2words==0.5.14
+ numba==0.61.2
+ numpy==2.1.2
+ opencv-python==4.12.0.88
+ packaging==25.0
+ pandas==2.3.1
+ parso==0.8.4
+ pillow==11.0.0
+ platformdirs==4.3.8
+ pooch==1.8.2
+ prompt_toolkit==3.0.51
+ propcache==0.3.2
+ protobuf==6.31.1
+ psutil==7.0.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pycparser==2.22
+ Pygments==2.19.2
+ pyparsing==3.2.3
+ pyperclip==1.9.0
+ pysbd==0.3.4
+ python-crfsuite==0.9.11
+ python-dateutil==2.9.0.post0
+ pytz==2025.2
+ pywin32==311
+ PyYAML==6.0.2
+ pyzmq==27.0.0
+ qwen-vl-utils==0.0.8
+ regex==2025.7.34
+ requests==2.32.4
+ safetensors==0.5.3
+ scikit-learn==1.7.1
+ scipy==1.16.0
+ sentencepiece==0.2.0
+ six==1.17.0
+ sounddevice==0.5.2
+ soundfile==0.13.1
+ soxr==0.5.0.post1
+ stack-data==0.6.3
+ sympy==1.13.3
+ tensorboard==2.20.0
+ tensorboard-data-server==0.7.2
+ termcolor==3.1.0
+ threadpoolctl==3.6.0
+ tokenizers==0.21.4
+ torch==2.7.1+cu118
+ torch-geometric==2.6.1
+ torchaudio==2.7.1+cu118
+ torchvision==0.22.1+cu118
+ tornado==6.5.1
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers @ git+https://github.com/huggingface/transformers@7dca2ff8cfd6102430acd3af3fdc3831b9779884
+ typeguard==4.4.4
+ typing_extensions==4.14.1
+ tzdata==2025.2
+ tzlocal==5.3.1
+ ultralytics==8.3.170
+ ultralytics-thop==2.0.14
+ unidic-lite==1.0.8
+ urllib3==2.5.0
+ wcwidth==0.2.13
+ Werkzeug==3.1.3
+ win32_setctime==1.2.0
+ yarl==1.20.1
train.py ADDED
@@ -0,0 +1,15 @@
+ import torch
+ torch.cuda.is_available()
+ from utils.train_speaker import train_speaker
+ from ultralytics import YOLO
+
+ def train(config):
+
+     model = YOLO("yolo11m.pt")
+
+     model.info()
+
+     model.train(data='data.yaml', epochs=10, device='cuda', imgsz=640, batch=16)
+
+
+     train_speaker(config)
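The data.yaml referenced in model.train(...) above is not included in this commit. A plausible sketch of its contents, written here with PyYAML (class ids inferred from CLASSES in utils/get_yolo_predictions.py and the split folders in utils/directories.txt; the actual file may differ):

import yaml  # PyYAML is pinned in requirements.txt

data = {
    "path": "./dataset",
    "train": "images/train",
    "val": "images/val",
    "test": "images/test",
    "names": {0: "body", 1: "face", 2: "frame", 3: "text"},
}
with open("data.yaml", "w") as f:
    yaml.safe_dump(data, f)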
utils/__pycache__/character_to_character.cpython-311.pyc ADDED
Binary file (9.75 kB).
utils/__pycache__/coco_to_yolo.cpython-311.pyc ADDED
Binary file (3.96 kB).
utils/__pycache__/create_audiobook.cpython-311.pyc ADDED
Binary file (2.58 kB).
utils/__pycache__/create_directories.cpython-311.pyc ADDED
Binary file (1.12 kB).
utils/__pycache__/get_yolo_predictions.cpython-311.pyc ADDED
Binary file (1.03 kB).
utils/__pycache__/identify_speaker.cpython-311.pyc ADDED
Binary file (16.4 kB).
utils/__pycache__/japanese_ocr.cpython-311.pyc ADDED
Binary file (4.18 kB).
utils/__pycache__/json_to_coco.cpython-311.pyc ADDED
Binary file (8.37 kB).
utils/__pycache__/json_to_gcn.cpython-311.pyc ADDED
Binary file (23.5 kB).
utils/__pycache__/panel_ordering.cpython-311.pyc ADDED
Binary file (14.2 kB).
utils/__pycache__/train_speaker.cpython-311.pyc ADDED
Binary file (21.7 kB).
utils/__pycache__/transcript_page.cpython-311.pyc ADDED
Binary file (2.67 kB).
utils/__pycache__/utilities.cpython-311.pyc ADDED
Binary file (18.3 kB).
utils/__pycache__/visual_llm_gender.cpython-311.pyc ADDED
Binary file (4.28 kB).
utils/__pycache__/xml_to_json.cpython-311.pyc ADDED
Binary file (3.35 kB).
utils/character_to_character.py ADDED
@@ -0,0 +1,192 @@
+ def compute_iou(boxA, boxB):
+     """
+     Compute the overlap of boxA with boxB as intersection area over boxA's area.
+     Boxes are (x1, y1, x2, y2).
+     """
+     xA = max(boxA[0], boxB[0])
+     yA = max(boxA[1], boxB[1])
+     xB = min(boxA[2], boxB[2])
+     yB = min(boxA[3], boxB[3])
+
+     interW = max(0, xB - xA)
+     interH = max(0, yB - yA)
+     interArea = interW * interH
+
+     boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
+     # boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
+     # unionArea = boxAArea + boxBArea - interArea
+
+     return interArea / boxAArea if boxAArea > 0 else 0.0
+
+ def match_faces_to_bodies(face_boxes, body_boxes, iou_threshold=0.9):
+     """
+     Match each body box to the face box with highest overlap above threshold.
+     Return list of (face_box or None, body_box).
+     If no face meets the threshold for a body, face_box is None.
+     """
+     matches = []
+     used_faces = set()
+
+     for b in body_boxes:
+         best_iou = 0.0
+         best_f = None
+         for i, f in enumerate(face_boxes):
+             if i in used_faces:
+                 continue
+             iou = compute_iou(f, b)
+             if iou > best_iou:
+                 best_iou = iou
+                 best_f = i
+         if best_iou >= iou_threshold:
+             matches.append((face_boxes[best_f], b))
+             used_faces.add(best_f)
+         else:
+             # No face sufficiently inside this body → treat body alone
+             matches.append((None, b))
+
+
+     return matches
+
+ # matches = match_faces_to_bodies(predictions_face,predictions_body)
+ # for face_box, body_box in matches:
+ #     print(face_box, body_box)
+
+ import cv2
+ import numpy as np
+
+ def crop_and_prepare_embeddings(image_path, face_boxes, body_boxes, model, fuse_features):
+     """
+     For each matched (face, body) pair:
+     - If both present: crop both and fuse embeddings.
+     - If only face: crop face and use face embedding alone.
+     - If only body: crop body and use body embedding alone.
+     Returns list of embeddings.
+     """
+     img = cv2.imread(image_path)
+     pairs = match_faces_to_bodies(face_boxes, body_boxes)
+     embeddings = []
+     new_pairs = []
+     for face_box, body_box in pairs:
+         if face_box is not None:
+             x1, y1, x2, y2 = face_box
+             face_crop = img[y1:y2, x1:x2]
+             face_emb = model(face_crop)
+         else:
+             face_emb = None
+
+         if body_box is not None:
+             x1, y1, x2, y2 = body_box
+             body_crop = img[y1:y2, x1:x2]
+             body_emb = model(body_crop)
+         else:
+             body_emb = None
+
+         if face_emb is not None and body_emb is not None:
+             emb = fuse_features(face_emb, body_emb)
+         # elif face_emb is not None:
+         #     emb = face_emb
+         # else:
+         #     emb = body_emb
+
+             embeddings.append(emb)
+             new_pairs.append((face_box, body_box))
+
+     return embeddings, new_pairs
+
+ """
+ Prerequisites
+ -------------
+ pip install torch torchvision scikit-learn opencv-python pillow tqdm
+ Download a backbone checkpoint that outputs 128-D embeddings.
+ Here we assume `reid_backbone.pth` is a ResNet-50 fine-tuned on manga faces/bodies.
+ """
+ import cv2
+ import torch
+ import torch.nn as nn
+ import torchvision.transforms as T
+ import numpy as np
+ from sklearn.cluster import DBSCAN
+ from tqdm import tqdm
+ from pathlib import Path
+
+ # -------------------------------------------------------------------
+ # 1. Detection post-processing: crop_faces() is given; add crop_bodies()
+ # -------------------------------------------------------------------
+ def crop_bodies(image_path, boxes):
+     """Crop body regions given YOLO boxes (x1,y1,x2,y2)."""
+     img = cv2.imread(image_path)
+     crops = []
+     for (x1, y1, x2, y2) in boxes:
+         body = img[y1:y2, x1:x2]
+         crops.append(body)
+     return crops
+ def crop_faces(image_path, boxes):
+     image = cv2.imread(image_path)
+     faces = []
+     for (x1, y1, x2, y2) in boxes:
+         face = image[y1:y2, x1:x2]  # crop the face
+         faces.append(face)
+     return faces
+ # -------------------------------------------------------------------
+ # 2. Embedding network wrapper
+ # -------------------------------------------------------------------
+ class ReIDNet(nn.Module):
+     """Backbone that outputs an L2-normalised embedding."""
+     def __init__(self, ckpt="./models/model.pth", dim=1031):
+         super().__init__()
+         base = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', weights=None)
+         base.fc = nn.Linear(base.fc.in_features, dim)
+         self.backbone = base
+         def add_prefix(state_dict, prefix="backbone."):
+             return {f"{prefix}{k}" if not k.startswith(prefix) else k: v
+                     for k, v in state_dict.items()}
+         raw = torch.load(ckpt, map_location="cpu")
+         patched = add_prefix(raw)
+         self.load_state_dict(patched)
+         self.backbone.eval()
+
+         # Normalisation identical to training
+         self.preprocess = T.Compose([
+             T.ToPILImage(),
+             T.Resize((112, 112)),
+             T.ToTensor(),
+             T.Normalize(mean=[0.5]*3, std=[0.5]*3)
+         ])
+
+     @torch.inference_mode()
+     def forward(self, np_image):
+         # tensor = self.preprocess(np_image).unsqueeze(0)  # 1×3×112×112
+         tensor = self.preprocess(np_image).unsqueeze(0).to(next(self.backbone.parameters()).device)
+         feat = self.backbone(tensor)
+         return nn.functional.normalize(feat, p=2, dim=1).cpu().numpy().ravel()
+
+ # -------------------------------------------------------------------
+ # 3. Utility to fuse face & body features
+ # -------------------------------------------------------------------
+ def fuse_features(face_f, body_f, w_face=0.6):
+     """Simple weighted late fusion."""
+     return np.concatenate([w_face * face_f, (1 - w_face) * body_f])
+
+ # -------------------------------------------------------------------
+ # 4. Main pipeline for a single page
+ # -------------------------------------------------------------------
+ def count_characters_single_page(img_path,
+                                  predictions,
+                                  eps=0.3,
+                                  min_samples=3,
+                                  device='cpu'):
+     model = ReIDNet().to(device)
+     face_boxes = [face['bbox'] for face in predictions.faces]
+     body_boxes = [body['bbox'] for body in predictions.bodies]
+     embeddings, pairs = crop_and_prepare_embeddings(img_path, face_boxes, body_boxes, model, fuse_features)
+     fused_embeds = np.vstack(embeddings)  # N×(2*dim)
+
+     # ----------------------------------------------------------------
+     # 5. DBSCAN clustering
+     # ----------------------------------------------------------------
+     db = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
+     labels = db.fit_predict(fused_embeds)  # -1 => noise
+
+     # Count unique positive cluster IDs
+     unique_chars = len(set(l for l in labels if l != -1))
+     return unique_chars, labels, pairs
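For reference, page_transcription.py above calls this entry point on the YOLO output; a minimal usage sketch, assuming yolo_model has already been loaded from config["yolo_path"]:

from utils.identify_speaker import get_predictions_from_yolo
from utils.character_to_character import count_characters_single_page

img = "./dataset/images/test/AkkeraKanjinchou_083.jpg"
predictions = get_predictions_from_yolo(img, yolo_model)
unique_chars, labels, pairs = count_characters_single_page(img, predictions, eps=0.3, min_samples=1, device='cuda')
# labels[i] is the DBSCAN cluster id of the i-th fused (face, body) pair; -1 marks noise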
utils/coco_to_yolo.py ADDED
@@ -0,0 +1,52 @@
+ def coco_to_yolo(config):
+     import os
+     import json
+     from pathlib import Path
+     from tqdm import tqdm
+
+
+     labels_dir = config["root_labels"]
+     annotations_dir = config["root_annotations"]
+
+     def convert_coco_json_to_yolo(coco_json_path, output_dir):
+         with open(coco_json_path) as f:
+             data = json.load(f)
+
+         images = {img['id']: img for img in data['images']}
+         categories = {cat['id']: cat['name'] for cat in data['categories']}
+         category_id_map = {cat_id: i for i, cat_id in enumerate(sorted(categories.keys()))}
+
+         os.makedirs(output_dir, exist_ok=True)
+         print(len(images))
+         for ann in tqdm(data['annotations']):
+             img = images[ann['image_id']]
+             image_name = Path(img['file_name']).stem
+             img_width = img['width']
+             img_height = img['height']
+             bbox = ann['bbox']  # [x_min, y_min, width, height]
+             x_center = (bbox[0] + bbox[2] / 2) / img_width
+             y_center = (bbox[1] + bbox[3] / 2) / img_height
+             w = bbox[2] / img_width
+             h = bbox[3] / img_height
+
+             class_id = category_id_map[ann['category_id']]
+
+             label_path = os.path.join(output_dir, f"{image_name}.txt")
+             with open(label_path, "a") as f:
+                 f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n")
+
+         print(f"Converted labels saved to: {output_dir}")
+
+     # Example usage
+     convert_coco_json_to_yolo(
+         coco_json_path=annotations_dir + "instances_train.json",
+         output_dir=labels_dir + "train/"
+     )
+     convert_coco_json_to_yolo(
+         coco_json_path=annotations_dir + "instances_val.json",
+         output_dir=labels_dir + "val/"
+     )
+     convert_coco_json_to_yolo(
+         coco_json_path=annotations_dir + "instances_test.json",
+         output_dir=labels_dir + "test/"
+     )
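A worked example of the bbox conversion performed in the loop above, with illustrative numbers (not taken from the dataset):

# COCO bbox [x_min, y_min, width, height] on a 1000x800 page
bbox, img_w, img_h = [100, 50, 200, 100], 1000, 800
x_center = (bbox[0] + bbox[2] / 2) / img_w   # (100 + 100) / 1000 = 0.200000
y_center = (bbox[1] + bbox[3] / 2) / img_h   # (50 + 50) / 800   = 0.125000
w, h = bbox[2] / img_w, bbox[3] / img_h      # 0.200000, 0.125000
# YOLO label line: "<class_id> 0.200000 0.125000 0.200000 0.125000"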
utils/create_audiobook.py ADDED
@@ -0,0 +1,55 @@
+ import torch
+ from TTS.api import TTS
+ import os
+ import numpy as np
+ from scipy.io import wavfile
+
+ # Save the original torch.load function
+ _original_torch_load = torch.load
+
+ # Define a custom function that forces weights_only=False
+ def custom_torch_load(*args, **kwargs):
+     if "weights_only" not in kwargs:
+         kwargs["weights_only"] = False
+     return _original_torch_load(*args, **kwargs)
+
+ # Override torch.load globally
+ torch.load = custom_torch_load
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+
+ # generate_samples()
+ def create_audiobook(gender_list, english_transcript, speaker_list):
+     parent_wav = []
+     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+     for text in english_transcript:
+         print(text['char_id'], text['text'])
+         voice_gender = 'Male'
+         voice_speaker = ''
+         if text['char_id'] != None:
+             voice_gender = gender_list[int(text['char_id'])][0]
+             print("voice person", speaker_list[text['char_id']])
+             voice_speaker = speaker_list[text['char_id']]
+         else:
+             print("voice person", speaker_list[-1])
+             voice_speaker = speaker_list[-1]
+         print("Voice Gender", voice_gender)
+
+         try:
+
+             wav = tts.tts(
+                 text=text['text'],
+                 speaker=voice_speaker.replace('_', ' '),
+                 language="en"
+             )
+             print("✓ Audio Generated")
+             parent_wav.append(wav)
+
+         except Exception as e:
+             print(f"✗ Error with : {e}")
+     combined = np.concatenate(parent_wav)
+     return combined, tts.synthesizer.output_sample_rate
+
+ # parent_wav = generate_audio(gender_list,english_transcript,tts)
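For reference, page_transcription.py writes the returned audio with scipy; a minimal sketch of that step, assuming gender_list, english_transcript, and speaker_list were produced earlier in the pipeline:

from scipy.io.wavfile import write
from utils.create_audiobook import create_audiobook

combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
write("output.wav", sample_rate, combined)  # app.py expects output.wav in the working directory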
utils/create_directories.py ADDED
@@ -0,0 +1,13 @@
+ import os
+ def create_directories(config):
+     dir_path = config["dir_path"]
+     dir_names = ""
+     if os.path.exists(dir_path):
+         with open(dir_path, "r") as f:
+             dir_names = f.readlines()
+         for dir_name in dir_names:
+             dir_name = dir_name.strip()
+             if not os.path.exists(dir_name):
+                 os.mkdir(dir_name)
+     else:
+         print(f"Ensure {dir_path} exists in utils")
utils/directories.txt ADDED
@@ -0,0 +1,13 @@
+ ./dataset
+ ./dataset/images
+ ./dataset/labels
+ ./dataset/json
+ ./dataset/panels
+ ./dataset/panel_data
+ ./dataset/annotations
+ ./dataset/images/train
+ ./dataset/images/val
+ ./dataset/images/test
+ ./dataset/labels/train
+ ./dataset/labels/val
+ ./dataset/labels/test
utils/get_yolo_predictions.py ADDED
@@ -0,0 +1,11 @@
+
+ CLASSES = {0: "body", 1: "face", 2: "frame", 3: "text"}  # confirmed by user
+
+ def get_yolo_predictions(yolo_model, img_path: str):
+     results = yolo_model.predict(source=img_path, device='cuda')
+     dets = {"body": [], "face": [], "frame": [], "text": []}
+     for box in results[0].boxes:
+         c = int(box.cls[0])
+         x1, y1, x2, y2 = map(int, box.xyxy[0])
+         dets[CLASSES[c]].append([x1, y1, x2, y2])
+     return dets
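A minimal usage sketch, assuming the trained weights path from config.json and a test image from the dataset split (a CUDA device is required by the hard-coded device='cuda' above):

from ultralytics import YOLO
from utils.get_yolo_predictions import get_yolo_predictions

model = YOLO("./yolo11m/runs/detect/train2/weights/best.pt")
dets = get_yolo_predictions(model, "./dataset/images/test/AkkeraKanjinchou_083.jpg")
print({k: len(v) for k, v in dets.items()})  # counts of body/face/frame/text boxes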
utils/identify_speaker.py ADDED
@@ -0,0 +1,443 @@
+ #!/usr/bin/env python3
+ """
+ Complete Standalone End-to-End Speech-to-Speaker Association Inference Script
+ Includes all necessary functions without external dependencies on custom files
+ """
+
+ import json
+ import torch
+ import cv2
+ from torch_geometric.data import HeteroData, Batch
+ from typing import Dict, List, Any, Optional, Tuple, Union
+ import os
+ from ultralytics import YOLO
+ from utils.train_speaker import hungarian_matching, AssocGCN, infer_associations, DatasetLoader
+
+
+
+ # ============================================================================
+ # NEW CLASSES FOR UNSEEN IMAGE PROCESSING
+ # ============================================================================
+
+ class DetectionPredictions:
+     """Container for object detection predictions from an unseen image"""
+
+     def __init__(self, image_path: str):
+         self.image_path = image_path
+         self.panels: List[Dict] = []
+         self.speech_bubbles: List[Dict] = []
+         self.faces: List[Dict] = []
+         self.bodies: List[Dict] = []
+         self.image_size: Tuple[int, int] = (0, 0)  # (width, height)
+
+     def add_panel(self, bbox: List[float], confidence: float, panel_id: int):
+         """Add panel detection (frame in YOLO terms)"""
+         self.panels.append({
+             'bbox': bbox, 'confidence': confidence, 'id': panel_id
+         })
+
+     def add_speech_bubble(self, bbox: List[float], confidence: float,
+                           panel_id: int, bubble_id: int):
+         """Add speech bubble detection (text in YOLO terms)"""
+         self.speech_bubbles.append({
+             'bbox': bbox, 'confidence': confidence,
+             'panel_id': panel_id, 'id': bubble_id
+         })
+
+     def add_face(self, bbox: List[float], confidence: float,
+                  panel_id: int, face_id: int):
+         """Add face detection"""
+         self.faces.append({
+             'bbox': bbox, 'confidence': confidence,
+             'panel_id': panel_id, 'id': face_id
+         })
+
+     def add_body(self, bbox: List[float], confidence: float,
+                  panel_id: int, body_id: int):
+         """Add body detection"""
+         self.bodies.append({
+             'bbox': bbox, 'confidence': confidence,
+             'panel_id': panel_id, 'id': body_id
+         })
+
+
+ # ============================================================================
+ # ENHANCED DATA PROCESSING FUNCTIONS
+ # ============================================================================
+
+ def create_panel_dict_from_predictions(predictions: DetectionPredictions,
+                                        panel_id: int) -> Dict:
+     """
+     ADAPTED: Creates a panel dictionary from detection predictions
+     Uses the same format as the original create_hetero_data_from_panel expects
+     """
+     # Filter predictions for this panel
+     panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == panel_id]
+     panel_faces = [f for f in predictions.faces if f['panel_id'] == panel_id]
+
+     # Create bubbles list in expected format
+     bubbles = []
+     for i, bubble in enumerate(panel_bubbles):
+         bubbles.append({
+             'bubble_id': bubble['id'],
+             'bbox': bubble['bbox']
+         })
+
+     # Create faces list in expected format
+     faces = []
+     for i, face in enumerate(panel_faces):
+         faces.append({
+             'face_id': face['id'],
+             'bbox': face['bbox']
+         })
+
+     # Create panel dict in expected format
+     panel_dict = {
+         'panel_id': f"unseen_panel_{panel_id}",
+         'width': predictions.image_size[0],
+         'height': predictions.image_size[1],
+         'bubbles': bubbles,
+         'faces': faces,
+         'links': []  # No ground truth links for unseen images
+     }
+
+     return panel_dict
+
+
+ def create_hetero_data_from_predictions(predictions: DetectionPredictions,
+                                         panel_id: int) -> Optional[HeteroData]:
+     """
+     ENHANCED: Creates HeteroData from detection predictions
+     Leverages existing create_hetero_data_from_panel function
+     """
+     # Convert predictions to panel dict format
+     panel_dict = create_panel_dict_from_predictions(predictions, panel_id)
+
+     # Use existing function to create HeteroData
+     hetero_data = DatasetLoader.create_hetero_data_from_panel(panel_dict)
+
+     if hetero_data is not None:
+         # Remove ground truth labels since we don't have them for unseen images
+         if hasattr(hetero_data['bubble', 'to', 'face'], 'edge_label'):
+             delattr(hetero_data['bubble', 'to', 'face'], 'edge_label')
+
+     return hetero_data
+
+
+ # ============================================================================
+ # YOLO DETECTION INTEGRATION
+ # ============================================================================
+
+ def get_predictions_from_yolo(img_path: str, yolo_model) -> DetectionPredictions:
+     """
+     INTEGRATED: YOLO detection pipeline that maps to our detection categories
+     Maps YOLO classes: {0:"body", 1:"face", 2:"frame", 3:"text"}
+     """
+     CLASSES = {0: "body", 1: "face", 2: "frame", 3: "text"}
+
+     # Create predictions container
+     predictions = DetectionPredictions(img_path)
+
+     # Load image to get dimensions
+     img = cv2.imread(img_path)
+     if img is None:
+         raise ValueError(f"Could not load image: {img_path}")
+     height, width = img.shape[:2]
+     predictions.image_size = (width, height)
+
+     # Get YOLO predictions
+     results = yolo_model.predict(source=img_path, device='cuda', verbose=False)
+
+     # Process detections and organize by type
+     detections_by_type = {"body": [], "face": [], "frame": [], "text": []}
+
+     for box in results[0].boxes:
+         c = int(box.cls[0])
+         x1, y1, x2, y2 = map(int, box.xyxy[0])
+         confidence = float(box.conf[0])
+
+         detection_type = CLASSES[c]
+         detections_by_type[detection_type].append({
+             'bbox': [x1, y1, x2, y2],
+             'confidence': confidence
+         })
+
+     # Process frame detections as panels
+     for i, frame in enumerate(detections_by_type["frame"]):
+         predictions.add_panel(frame['bbox'], frame['confidence'], i)
+
+     # If no frames detected, create a default full-image panel
+     if len(predictions.panels) == 0:
+         predictions.add_panel([0, 0, width, height], 1.0, 0)
+
+     # Process text detections as speech bubbles
+     for i, text in enumerate(detections_by_type["text"]):
+         panel_id = find_containing_panel(text['bbox'], predictions.panels)
+         predictions.add_speech_bubble(text['bbox'], text['confidence'], panel_id, i)
+
+     from collections import defaultdict
+
+     # Group bubbles by panel_id
+     grouped_texts = defaultdict(list)
+     for bubble in predictions.speech_bubbles:
+         grouped_texts[bubble['panel_id']].append(bubble)
+
+     # Assign seq based on right-to-left order (sort by bbox[0] descending)
+     for panel_id, bubbles in grouped_texts.items():
+         sorted_bubbles = sorted(bubbles, key=lambda b: b['bbox'][0], reverse=True)
+         for seq, bubble in enumerate(sorted_bubbles):
+             bubble['seq'] = seq
+
+     # # Optional: print results
+     # for bubble in speech_bubbles:
+     #     print(bubble)
+
+     # Process face detections
+     for i, face in enumerate(detections_by_type["face"]):
+         panel_id = find_containing_panel(face['bbox'], predictions.panels)
+         predictions.add_face(face['bbox'], face['confidence'], panel_id, i)
+
+     # Process body detections (optional - can be used for additional context)
+     for i, body in enumerate(detections_by_type["body"]):
+         panel_id = find_containing_panel(body['bbox'], predictions.panels)
+         predictions.add_body(body['bbox'], body['confidence'], panel_id, i)
+
+     return predictions
+
+
+ def find_containing_panel(bbox: List[float], panels: List[Dict]) -> int:
+     """
+     Helper function to determine which panel contains a detection
+     """
+     bbox_center_x = (bbox[0] + bbox[2]) / 2
+     bbox_center_y = (bbox[1] + bbox[3]) / 2
+
+     for panel in panels:
+         p_bbox = panel['bbox']
+         if (p_bbox[0] <= bbox_center_x <= p_bbox[2] and
+                 p_bbox[1] <= bbox_center_y <= p_bbox[3]):
+             return panel['id']
+
+     # Return first panel if not contained in any
+     return panels[0]['id'] if panels else 0
+
+
+ # ============================================================================
+ # MODEL LOADING WITH ERROR HANDLING
+ # ============================================================================
+
+ def load_trained_speaker_model(model_path: str, device: str = "cuda") -> AssocGCN:
+     """
+     FIXED: Load trained AssocGCN model with proper error handling
+     Handles different checkpoint formats including 'model_state' key
+     """
+     # Create model instance
+     model = AssocGCN().to(device)
+
+     # Load the checkpoint
+     checkpoint = torch.load(model_path, map_location=device)
+
+     # Handle different checkpoint formats
+     if isinstance(checkpoint, dict):
+         # Check for different possible keys where model state is stored
+         if 'model_state_dict' in checkpoint:
+             state_dict = checkpoint['model_state_dict']
+         elif 'model_state' in checkpoint:  # This handles our specific case
+             state_dict = checkpoint['model_state']
+         elif 'state_dict' in checkpoint:
+             state_dict = checkpoint['state_dict']
+         else:
+             # If no specific key, assume the entire checkpoint is the state dict
+             # but filter out non-model keys
+             state_dict = {k: v for k, v in checkpoint.items()
+                           if k not in {'epoch', 'loss', 'optimizer_state_dict'}}
+     else:
+         # Assume checkpoint is directly the state dict
+         state_dict = checkpoint
+
+     # Handle DataParallel module prefix if present
+     if any(key.startswith('module.') for key in state_dict.keys()):
+         new_state_dict = {}
+         for key, value in state_dict.items():
+             new_key = key.replace('module.', '') if key.startswith('module.') else key
+             new_state_dict[new_key] = value
+         state_dict = new_state_dict
+
+     try:
+         # Try to load with strict=True first
+         model.load_state_dict(state_dict, strict=True)
+         print("✅ Model loaded successfully with strict=True")
+     except RuntimeError as e:
+         print(f"⚠️ Warning: {str(e)}")
+         print("Attempting to load with strict=False...")
+         try:
+             # Try with strict=False to ignore missing/unexpected keys
+             missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+             if missing_keys:
+                 print(f"⚠️ Missing keys: {missing_keys}")
+             if unexpected_keys:
+                 print(f"⚠️ Unexpected keys: {unexpected_keys}")
+             print("✅ Model loaded successfully with strict=False")
+         except Exception as e2:
+             print(f"❌ Failed to load model: {str(e2)}")
+             raise e2
+
+     model.eval()
+     return model
+
+
+ # ============================================================================
+ # MAIN INFERENCE PIPELINE
+ # ============================================================================
+
+ @torch.no_grad()
+ def identify_speaker(model: AssocGCN, image_path: str,
+                      yolo_model, device: str = "cuda") -> Dict[str, Dict[int, int]]:
+     """
+     COMPLETE: End-to-end inference pipeline for unseen images
+     """
+
+     try:
+         # Step 1: Run YOLO detection pipeline
+         print(f"🔍 Running YOLO detection on {os.path.basename(image_path)}...")
+         predictions = get_predictions_from_yolo(image_path, yolo_model)
+
+         # print(f"📊 Detection Results:")
+         # print(f" - Panels/Frames: {len(predictions.panels)}")
+         # print(f" - Speech Bubbles/Text: {len(predictions.speech_bubbles)}")
+         # print(f" - Faces: {len(predictions.faces)}")
+         # print(f" - Bodies: {len(predictions.bodies)}")
+
+         # Check if we have the minimum required detections
+         if len(predictions.speech_bubbles) == 0:
+             print("⚠️ No speech bubbles/text detected. Cannot perform association.")
+             return {}
+
+         if len(predictions.faces) == 0:
+             print("⚠️ No faces detected. Cannot perform association.")
+             return {}
+
+         # Step 2: Create HeteroData for each panel
+         panels_data = []
+         panel_ids = list(set([p['id'] for p in predictions.panels]))
+
+         for panel_id in panel_ids:
+             panel_data = create_hetero_data_from_predictions(predictions, panel_id)
+             # print("panel data", panel_data, dir(panel_data))
+             # print(('bubble' in panel_data.node_types, 'face' in panel_data.node_types,
+             #        panel_data['bubble'].x.size(0) > 0, panel_data['face'].x.size(0) > 0))
+             if panel_data is not None:
+                 panel_data = panel_data.to(device)
+                 panels_data.append(panel_data)
+
+         # Step 3: Run model inference
+         results = {}
+
+         for panel_data in panels_data:
+             # print("ppppp", panel_data == None)
+             # if (hasattr(panel_data, 'bubble') and hasattr(panel_data, 'face') and
+             #         panel_data['bubble'].x.size(0) > 0 and panel_data['face'].x.size(0) > 0):
+             if (panel_data != None and 'bubble' in panel_data.node_types and 'face' in panel_data.node_types and
+                     panel_data['bubble'].x.size(0) > 0 and panel_data['face'].x.size(0) > 0):
+
+                 # Run inference using existing function
+                 mapping = infer_associations(model, panel_data)
+
+                 # print(f"\n🖼️ {panel_data.panel_id}:")
+                 # if mapping:
+                 #     for bubble_idx, face_idx in mapping.items():
+                 #         print(f" Text/Bubble {bubble_idx} → Face {face_idx}")
+                 # else:
+                 #     print(" No associations found")
+
+                 results[panel_data.panel_id] = mapping
+             else:
+                 print(f"⚠️ {panel_data.panel_id}: No valid bubbles or faces, skipping...")
+
+
+         total_associations = 0
+         if results:
+             for panel_id, mapping in results.items():
+                 if mapping:
+                     # print(f"\n🖼️ {panel_id}:")
+                     for bubble_id, face_id in mapping.items():
+                         # print(f" Text/Bubble {bubble_id} ← → Face {face_id}")
+                         total_associations += 1
+                 # else:
+                 #     print(f"\n🖼️ {panel_id}: No associations found")
+         else:
+             print("No associations found in the image.")
+
+         print(f"\n✅ Total associations found: {total_associations}")
+
+         return results, predictions
+
+     except Exception as e:
+         print(f"❌ Error during inference: {str(e)}")
+         raise e
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # #!/usr/bin/env python3
+ # """
+ # inference.py ── Run saved AssocGCN on new images / panels.
+ # Generate per-panel mappings and (optionally) aggregate metrics.
+ # """
+ # import argparse
+ # import torch
+ # from pathlib import Path
+ # from torch_geometric.data import Batch
+ # from typing import Dict, List
+
+ # from utils import load_model
+ # from train_speaker import DatasetLoader, infer_associations  # your original file
+
+ # @torch.no_grad()
+ # def evaluate_panels(model,
+ #                     panels: List["HeteroData"],
+ #                     compute_metrics: bool = True) -> None:
+ #     device = next(model.parameters()).device
+ #     tp = fp = fn = 0
+
+ #     for idx, data in enumerate(panels):
+ #         mapping = infer_associations(model, data)
+ #         print(f"\n🖼️ Panel {data.panel_id}:")
+ #         for bub, face in mapping.items():
+ #             print(f" Bubble {bub} → Face {face}")
+
+ #         if compute_metrics and "edge_label" in data["bubble", "to", "face"]:
+ #             gt = {(i.item(), j.item())
+ #                   for i, j, lbl in zip(*data["bubble", "to", "face"].edge_index,
+ #                                        data["bubble", "to", "face"].edge_label)
+ #                   if lbl == 1}
+ #             pred = {(b, f) for b, f in mapping.items()}
+
+ #             tp += len(gt & pred)
+ #             fp += len(pred - gt)
+ #             fn += len(gt - pred)
+
+ #     if compute_metrics:
+ #         prec = tp / (tp + fp) if (tp + fp) else 0
+ #         rec = tp / (tp + fn) if (tp + fn) else 0
+ #         f1 = 2*prec*rec/(prec+rec) if (prec+rec) else 0
+ #         print("\n📊 Aggregated metrics")
+ #         print(f" Precision: {prec:.3f}")
+ #         print(f" Recall : {rec:.3f}")
+ #         print(f" F1 Score : {f1:.3f}")
+
+ # def identify_speaker(config):
+ #     pass
utils/japanese_ocr.py ADDED
@@ -0,0 +1,52 @@
+ import re
+ from pathlib import Path
+
+ import jaconv
+ import torch
+ from PIL import Image
+ from loguru import logger
+ from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel, GenerationMixin
+
+
+ class MangaOcrModel(VisionEncoderDecoderModel, GenerationMixin):
+     pass
+
+ class MangaOcr:
+     def __init__(self, pretrained_model_name_or_path="kha-white/manga-ocr-base", force_cpu=False):
+         logger.info(f"Loading OCR model from {pretrained_model_name_or_path}")
+         self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
+         self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
+         self.model = MangaOcrModel.from_pretrained(pretrained_model_name_or_path)
+
+         if not force_cpu and torch.cuda.is_available():
+             logger.info("Using CUDA")
+             self.model.cuda()
+         elif not force_cpu and torch.backends.mps.is_available():
+             logger.info("Using MPS")
+             self.model.to("mps")
+         else:
+             logger.info("Using CPU")
+         logger.info("OCR ready")
+
+     def __call__(self, img_or_path):
+         img = img_or_path.convert("L").convert("RGB")
+
+         x = self._preprocess(img)
+         x = self.model.generate(x[None].to(self.model.device), max_length=300)[0].cpu()
+         x = self.tokenizer.decode(x, skip_special_tokens=True)
+         x = post_process(x)
+         return x
+
+     def _preprocess(self, img):
+         pixel_values = self.processor(img, return_tensors="pt").pixel_values
+         return pixel_values.squeeze()
+
+
+ def post_process(text):
+     text = "".join(text.split())
+     text = text.replace("…", "...")
+     text = re.sub("[・.]{2,}", lambda x: (x.end() - x.start()) * ".", text)
+     text = jaconv.h2z(text, ascii=True, digit=True)
+
+     return text
+
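A minimal usage sketch of the OCR wrapper above; the __call__ path expects a PIL image, and the crop path below is hypothetical:

from PIL import Image
from utils.japanese_ocr import MangaOcr

ocr = MangaOcr()                        # downloads kha-white/manga-ocr-base on first use
bubble = Image.open("bubble_crop.jpg")  # hypothetical speech-bubble crop
print(ocr(bubble))                      # Japanese text, cleaned by post_process()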
utils/json_to_coco.py ADDED
@@ -0,0 +1,131 @@
+ import json
+ import os
+
+ def json_to_coco(config):
+     base_dir = config["root"]
+     image_dir = config["root_images"]
+     annotations_dir = config["root_annotations"]
+
+     def create_coco(json_dir):
+         # COCO base structure
+         coco = {
+             "images": [],
+             "annotations": [],
+             "categories": []
+         }
+         uniq_filename_mapping = {}
+         # Category mapping
+         category_names = ["body", "face", "frame", "text"]
+         category_map = {name: idx + 1 for idx, name in enumerate(category_names)}
+         coco["categories"] = [{"id": cid, "name": name, "supercategory": "object"} for name, cid in category_map.items()]
+         import copy
+         coco_train = copy.deepcopy(coco)
+         coco_test = copy.deepcopy(coco)
+         coco_val = copy.deepcopy(coco)
+         uniq = 0
+         for book_json in os.listdir(json_dir):
+             book = str(book_json).split(".")[0]
+             data = []
+             with open(base_dir + "json/" + book + ".json", 'r', encoding='utf-8') as file:
+                 data = json.load(file)
+
+             pages = data["page"]
+
+             train_output = []
+             val_output = []
+             test_output = []
+
+             import math
+             import shutil
+
+             page_count = len(pages) - 2
+             div = page_count / 10
+             div = int(math.ceil(div))
+             # train_count = page_count - (2 * div)
+             # val_count = div
+             # test_count = div
+             # print(train_count,val_count,test_count)
+             train_arr = []
+             val_arr = []
+             test_arr = []
+             count = 0
+             for i in range(page_count, 0, -1):
+                 if count < div:
+                     test_arr.append(i)
+                 elif count < 2 * div:
+                     val_arr.append(i)
+                 else:
+                     train_arr.append(i)
+                 count += 1
+
+             # Annotation ID counter
+             ann_id = 1
+
+             # Helper to create COCO bbox annotation
+             def create_annotation(ann, category_name, image_id, ann_id, uniq):
+                 x_min = int(ann["@xmin"])
+                 y_min = int(ann["@ymin"])
+                 x_max = int(ann["@xmax"])
+                 y_max = int(ann["@ymax"])
+                 width = x_max - x_min
+                 height = y_max - y_min
+
+                 return {
+                     "id": ann_id,
+                     "image_id": uniq,
+                     "category_id": category_map[category_name],
+                     "bbox": [x_min, y_min, width, height],
+                     "area": width * height,
+                     "iscrowd": 0
+                 }
+
+             def append_data(coco, ann_id, uniq):
+                 # Add image entry
+                 coco["images"].append({
+                     "id": uniq,
+                     "width": page["@width"],
+                     "height": page["@height"],
+                     "file_name": f"{book}_{str(img_id).zfill(3)}.jpg"
+                 })
+
+
+                 # Loop through each category and add annotations
+                 for category in category_names:
+                     for ann in page.get(category, []):
+                         coco["annotations"].append(
+                             create_annotation(ann, category, img_id, ann_id, uniq)
+                         )
+                         ann_id += 1
+
+
+             for page in pages:
+                 uniq += 1
+                 img_id = int(page["@index"])
+                 if img_id == 0 or img_id == 1:
+                     continue
+
+                 curr_img_path = os.path.join(config["manga109_root_dir"], "images", book, f"{str(img_id).zfill(3)}.jpg")
+                 uniq_filename_mapping[uniq] = book + "_" + (curr_img_path.split("\\")[-1])
+                 if img_id in train_arr:
+                     append_data(coco_train, ann_id, uniq)
+                     train_output.append(coco)
+                     shutil.copyfile(curr_img_path, os.path.join(image_dir, "train/", book + "_" + (curr_img_path.split("\\")[-1])))
+                 if img_id in val_arr:
+                     append_data(coco_val, ann_id, uniq)
+                     val_output.append(coco)
+                     shutil.copyfile(curr_img_path, os.path.join(image_dir, "val/", book + "_" + (curr_img_path.split("\\")[-1])))
+                 if img_id in test_arr:
+                     append_data(coco_test, ann_id, uniq)
+                     test_output.append(coco)
+                     shutil.copyfile(curr_img_path, os.path.join(image_dir, "test/", book + "_" + (curr_img_path.split("\\")[-1])))
+
+         with open(base_dir + "temp_uniq_file_name_mapping.json", "w", encoding="utf-8") as f:
+             json.dump(uniq_filename_mapping, f, ensure_ascii=False, indent=2)
+         with open(annotations_dir + "instances_train.json", "w", encoding="utf-8") as f:
+             json.dump(coco_train, f, ensure_ascii=False, indent=2)
+         with open(annotations_dir + "instances_val.json", "w", encoding="utf-8") as f:
+             json.dump(coco_val, f, ensure_ascii=False, indent=2)
+         with open(annotations_dir + "instances_test.json", "w", encoding="utf-8") as f:
+             json.dump(coco_test, f, ensure_ascii=False, indent=2)
+
+     create_coco(base_dir + "json/")
utils/json_to_gcn.py ADDED
@@ -0,0 +1,484 @@
+ """
+ Enhanced Manga Panel Dataset Converter: Extract frames as panels and prepare GCN-ready dataset.
+ Crops each frame from manga pages and creates a dataset with the elements within each panel.
+ """
+
+ import json
+ import os
+ import argparse
+ from pathlib import Path
+ from typing import Dict, List, Any, Optional, Tuple
+ import difflib
+ import math
+ import cv2
+ import numpy as np
+
+
+ class MangaPanelDatasetConverter:
+     def __init__(self, fuzzy_mapping: bool = True, proximity_threshold: float = 200.0):
+         self.fuzzy_mapping = fuzzy_mapping
+         self.proximity_threshold = proximity_threshold
+         self.character_mapping = {}
+         self.stats = {
+             'total_pages': 0,
+             'total_panels': 0,
+             'total_bubbles': 0,
+             'total_faces': 0,
+             'total_bodies': 0,
+             'successful_links': 0,
+             'failed_links': 0,
+             'unique_characters': set(),
+             'extracted_panels': 0
+         }
+
+     def load_manga_data(self, file_path: str) -> Dict[str, Any]:
+         """Load manga data from JSON file."""
+         with open(file_path, 'r', encoding='utf-8') as f:
+             return json.load(f)
+
+     def build_character_mapping(self, character_list: List[Dict]) -> Dict[str, str]:
+         """Build character ID to name mapping from character list."""
+         mapping = {}
+         for char in character_list:
+             mapping_id = char.get('@id', '')
+             char_name = char.get('@name', '')
+             if mapping_id and char_name:
+                 mapping[mapping_id] = char_name
+         return mapping
+
+     def check_overlap(self, bbox1: List[int], bbox2: List[int]) -> bool:
+         """Check if two bounding boxes overlap."""
+         x1_min, y1_min, x1_max, y1_max = bbox1
+         x2_min, y2_min, x2_max, y2_max = bbox2
+
+         return not (x1_max <= x2_min or x2_max <= x1_min or
+                     y1_max <= y2_min or y2_max <= y1_min)
+
+     def is_element_in_frame(self, element_bbox: List[int], frame_bbox: List[int],
+                             overlap_threshold: float = 0.5) -> bool:
+         """
+         Check if an element (face, body, text) is within or significantly overlaps with a frame.
+
+         Args:
+             element_bbox: [xmin, ymin, xmax, ymax] of the element
+             frame_bbox: [xmin, ymin, xmax, ymax] of the frame
+             overlap_threshold: Minimum overlap ratio to consider element as belonging to frame
+         """
+         if not self.check_overlap(element_bbox, frame_bbox):
+             return False
+
+         # Calculate intersection area
+         x_left = max(element_bbox[0], frame_bbox[0])
+         y_top = max(element_bbox[1], frame_bbox[1])
+         x_right = min(element_bbox[2], frame_bbox[2])
+         y_bottom = min(element_bbox[3], frame_bbox[3])
+
+         if x_right <= x_left or y_bottom <= y_top:
+             return False
+
+         intersection_area = (x_right - x_left) * (y_bottom - y_top)
+         element_area = (element_bbox[2] - element_bbox[0]) * (element_bbox[3] - element_bbox[1])
+
+         # Check if significant portion of element is within frame
+         overlap_ratio = intersection_area / element_area if element_area > 0 else 0
+         return overlap_ratio >= overlap_threshold
+
+     def adjust_coordinates_to_frame(self, element_bbox: List[int], frame_bbox: List[int]) -> List[int]:
+         """Adjust element coordinates to be relative to the cropped frame."""
+         frame_x_min, frame_y_min = frame_bbox[0], frame_bbox[1]
+
+         adjusted_bbox = [
+             element_bbox[0] - frame_x_min,  # xmin
+             element_bbox[1] - frame_y_min,  # ymin
+             element_bbox[2] - frame_x_min,  # xmax
+             element_bbox[3] - frame_y_min   # ymax
+         ]
+
+         # Ensure coordinates are not negative
+         adjusted_bbox = [max(0, coord) for coord in adjusted_bbox]
+
+         return adjusted_bbox
+
+     def extract_panel_from_page(self, image_path: str, frame_data: Dict,
+                                 output_dir: str, page_index: int, frame_index: int) -> Optional[str]:
+         """
+         Extract a single panel (frame) from a manga page image.
+
+         Args:
+             image_path: Path to the manga page image
+             frame_data: Frame annotation with coordinates
+             output_dir: Directory to save extracted panel
+             page_index: Index of the current page
+             frame_index: Index of the frame within the page
+
+         Returns:
+             Path to extracted panel image or None if failed
+         """
+         # print(image_path)
+         # Load the image
+         image = cv2.imread(image_path)
+         if image is None:
+             print(f"Warning: Could not load image {image_path}")
+             return None
+
+         try:
+             # Get frame coordinates
+             xmin = int(frame_data.get('@xmin', 0))
+             ymin = int(frame_data.get('@ymin', 0))
+             xmax = int(frame_data.get('@xmax', image.shape[1]))
+             ymax = int(frame_data.get('@ymax', image.shape[0]))
+
+             # Validate and clip coordinates
+             xmin = max(0, xmin)
+             ymin = max(0, ymin)
+             xmax = min(image.shape[1], xmax)
+             ymax = min(image.shape[0], ymax)
+
+             if xmax <= xmin or ymax <= ymin:
+                 print(f"Warning: Invalid frame coordinates for page {page_index}, frame {frame_index}")
+                 return None
+
+             # Crop the panel
+             cropped_panel = image[ymin:ymax, xmin:xmax]
+
+             # Generate panel filename
+             panel_filename = f"page_{page_index:04d}_panel_{frame_index:03d}.jpg"
+             panel_path = os.path.join(output_dir, panel_filename)
+
+             # Save the cropped panel
+             success = cv2.imwrite(panel_path, cropped_panel)
+             if success:
+                 self.stats['extracted_panels'] += 1
+                 # print(f"Extracted panel: {panel_path}")
+                 return panel_path
+             else:
+                 print(f"Warning: Failed to save panel {panel_path}")
+                 return None
+
+         except Exception as e:
+             print(f"Error extracting panel from page {page_index}, frame {frame_index}: {str(e)}")
+             return None
+
+     def calculate_distance(self, bbox1: List[int], bbox2: List[int]) -> float:
+         """Calculate Euclidean distance between centers of two bounding boxes."""
+         cx1, cy1 = (bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2
+         cx2, cy2 = (bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2
+         return math.sqrt((cx1 - cx2)**2 + (cy1 - cy2)**2)
+
+     def find_character_by_body_id(self, body_id: str, bodies: List[Dict]) -> Optional[str]:
+         """Find character ID associated with a body ID."""
+         for body in bodies:
+             if body.get('@id') == body_id:
+                 return body.get('@character')
+         return None
+
+     def find_face_by_character(self, character_id: str, faces: List[Dict]) -> Optional[int]:
+         """Find face index by character ID."""
+         for i, face in enumerate(faces):
+             if face.get('@character') == character_id:
+                 return i
+         return None
+
+     def find_closest_face(self, bubble_bbox: List[int], faces: List[Dict]) -> Optional[int]:
+         """Find closest face to bubble based on spatial proximity."""
+         if not faces:
+             return None
+
+         min_distance = float('inf')
+         closest_face_idx = None
+
+         for i, face in enumerate(faces):
+             face_bbox = [
+                 face['bbox'][0], face['bbox'][1],
+                 face['bbox'][2], face['bbox'][3]
+             ]
+             distance = self.calculate_distance(bubble_bbox, face_bbox)
+
+             if distance < min_distance and distance <= self.proximity_threshold:
+                 min_distance = distance
+                 closest_face_idx = i
+
+         return closest_face_idx
+
+     def associate_bubble_to_face(self, text_item: Dict, faces: List[Dict],
+                                  bodies: List[Dict], original_bodies: List[Dict]) -> Optional[int]:
+         """Associate a speech bubble to a face using multiple strategies."""
+         bubble_bbox = text_item['bbox']
+
+         # Strategy 1: Direct body ID to character mapping
+         body_id = text_item.get('body_ref')
+         if body_id:
+             character_id = self.find_character_by_body_id(body_id, original_bodies)
+             if character_id:
+                 face_idx = self.find_face_by_character(character_id, faces)
+                 if face_idx is not None:
+                     return face_idx
+
+         # Strategy 2: Spatial proximity - find closest face
1
+ """
2
+ Enhanced Manga Panel Dataset Converter: Extract frames as panels and prepare GCN-ready dataset.
3
+ Crops each frame from manga pages and creates dataset with elements within each panel.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import argparse
9
+ from pathlib import Path
10
+ from typing import Dict, List, Any, Optional, Tuple
11
+ import difflib
12
+ import math
13
+ import cv2
14
+ import numpy as np
15
+
16
+
17
+ class MangaPanelDatasetConverter:
18
+ def __init__(self, fuzzy_mapping: bool = True, proximity_threshold: float = 200.0):
19
+ self.fuzzy_mapping = fuzzy_mapping
20
+ self.proximity_threshold = proximity_threshold
21
+ self.character_mapping = {}
22
+ self.stats = {
23
+ 'total_pages': 0,
24
+ 'total_panels': 0,
25
+ 'total_bubbles': 0,
26
+ 'total_faces': 0,
27
+ 'total_bodies': 0,
28
+ 'successful_links': 0,
29
+ 'failed_links': 0,
30
+ 'unique_characters': set(),
31
+ 'extracted_panels': 0
32
+ }
33
+
34
+ def load_manga_data(self, file_path: str) -> Dict[str, Any]:
35
+ """Load manga data from JSON file."""
36
+ with open(file_path, 'r', encoding='utf-8') as f:
37
+ return json.load(f)
38
+
39
+ def build_character_mapping(self, character_list: List[Dict]) -> Dict[str, str]:
40
+ """Build character ID to name mapping from character list."""
41
+ mapping = {}
42
+ for char in character_list:
43
+ mapping_id = char.get('@id', '')
44
+ char_name = char.get('@name', '')
45
+ if mapping_id and char_name:
46
+ mapping[mapping_id] = char_name
47
+ return mapping
48
+
49
+ def check_overlap(self, bbox1: List[int], bbox2: List[int]) -> bool:
50
+ """Check if two bounding boxes overlap."""
51
+ x1_min, y1_min, x1_max, y1_max = bbox1
52
+ x2_min, y2_min, x2_max, y2_max = bbox2
53
+
54
+ return not (x1_max <= x2_min or x2_max <= x1_min or
55
+ y1_max <= y2_min or y2_max <= y1_min)
56
+
57
+ def is_element_in_frame(self, element_bbox: List[int], frame_bbox: List[int],
58
+ overlap_threshold: float = 0.5) -> bool:
59
+ """
60
+ Check if an element (face, body, text) is within or significantly overlaps with a frame.
61
+
62
+ Args:
63
+ element_bbox: [xmin, ymin, xmax, ymax] of the element
64
+ frame_bbox: [xmin, ymin, xmax, ymax] of the frame
65
+ overlap_threshold: Minimum overlap ratio to consider element as belonging to frame
66
+ """
67
+ if not self.check_overlap(element_bbox, frame_bbox):
68
+ return False
69
+
70
+ # Calculate intersection area
71
+ x_left = max(element_bbox[0], frame_bbox[0])
72
+ y_top = max(element_bbox[1], frame_bbox[1])
73
+ x_right = min(element_bbox[2], frame_bbox[2])
74
+ y_bottom = min(element_bbox[3], frame_bbox[3])
75
+
76
+ if x_right <= x_left or y_bottom <= y_top:
77
+ return False
78
+
79
+ intersection_area = (x_right - x_left) * (y_bottom - y_top)
80
+ element_area = (element_bbox[2] - element_bbox[0]) * (element_bbox[3] - element_bbox[1])
81
+
82
+ # Check if significant portion of element is within frame
83
+ overlap_ratio = intersection_area / element_area if element_area > 0 else 0
84
+ return overlap_ratio >= overlap_threshold
85
+
86
+ def adjust_coordinates_to_frame(self, element_bbox: List[int], frame_bbox: List[int]) -> List[int]:
87
+ """Adjust element coordinates to be relative to the cropped frame."""
88
+ frame_x_min, frame_y_min = frame_bbox[0], frame_bbox[1]
89
+
90
+ adjusted_bbox = [
91
+ element_bbox[0] - frame_x_min, # xmin
92
+ element_bbox[1] - frame_y_min, # ymin
93
+ element_bbox[2] - frame_x_min, # xmax
94
+ element_bbox[3] - frame_y_min # ymax
95
+ ]
96
+
97
+ # Ensure coordinates are not negative
98
+ adjusted_bbox = [max(0, coord) for coord in adjusted_bbox]
99
+
100
+ return adjusted_bbox
101
+
102
+ def extract_panel_from_page(self, image_path: str, frame_data: Dict,
103
+ output_dir: str, page_index: int, frame_index: int) -> Optional[str]:
104
+ """
105
+ Extract a single panel (frame) from manga page image.
106
+
107
+ Args:
108
+ image_path: Path to the manga page image
109
+ frame_data: Frame annotation with coordinates
110
+ output_dir: Directory to save extracted panel
111
+ page_index: Index of the current page
112
+ frame_index: Index of the frame within the page
113
+
114
+ Returns:
115
+ Path to extracted panel image or None if failed
116
+ """
117
+ # print(image_path)
118
+ # Load the image
119
+ image = cv2.imread(image_path)
120
+ if image is None:
121
+ print(f"Warning: Could not load image {image_path}")
122
+ return None
123
+
124
+ try:
125
+ # Get frame coordinates
126
+ xmin = int(frame_data.get('@xmin', 0))
127
+ ymin = int(frame_data.get('@ymin', 0))
128
+ xmax = int(frame_data.get('@xmax', image.shape[1]))
129
+ ymax = int(frame_data.get('@ymax', image.shape[0]))
130
+
131
+ # Validate and clip coordinates
132
+ xmin = max(0, xmin)
133
+ ymin = max(0, ymin)
134
+ xmax = min(image.shape[1], xmax)
135
+ ymax = min(image.shape[0], ymax)
136
+
137
+ if xmax <= xmin or ymax <= ymin:
138
+ print(f"Warning: Invalid frame coordinates for page {page_index}, frame {frame_index}")
139
+ return None
140
+
141
+ # Crop the panel
142
+ cropped_panel = image[ymin:ymax, xmin:xmax]
143
+
144
+ # Generate panel filename
145
+ panel_filename = f"page_{page_index:04d}_panel_{frame_index:03d}.jpg"
146
+ panel_path = os.path.join(output_dir, panel_filename)
147
+
148
+ # Save the cropped panel
149
+ success = cv2.imwrite(panel_path, cropped_panel)
150
+ if success:
151
+ self.stats['extracted_panels'] += 1
152
+ # print(f"Extracted panel: {panel_path}")
153
+ return panel_path
154
+ else:
155
+ print(f"Warning: Failed to save panel {panel_path}")
156
+ return None
157
+
158
+ except Exception as e:
159
+ print(f"Error extracting panel from page {page_index}, frame {frame_index}: {str(e)}")
160
+ return None
161
+
162
+ def calculate_distance(self, bbox1: List[int], bbox2: List[int]) -> float:
163
+ """Calculate Euclidean distance between centers of two bounding boxes."""
164
+ cx1, cy1 = (bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2
165
+ cx2, cy2 = (bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2
166
+ return math.sqrt((cx1 - cx2)**2 + (cy1 - cy2)**2)
167
+
168
+ def find_character_by_body_id(self, body_id: str, bodies: List[Dict]) -> Optional[str]:
169
+ """Find character ID associated with a body ID."""
170
+ for body in bodies:
171
+ if body.get('@id') == body_id:
172
+ return body.get('@character')
173
+ return None
174
+
175
+ def find_face_by_character(self, character_id: str, faces: List[Dict]) -> Optional[int]:
176
+ """Find face index by character ID."""
177
+ for i, face in enumerate(faces):
178
+ if face.get('@character') == character_id:
179
+ return i
180
+ return None
181
+
182
+ def find_closest_face(self, bubble_bbox: List[int], faces: List[Dict]) -> Optional[int]:
183
+ """Find closest face to bubble based on spatial proximity."""
184
+ if not faces:
185
+ return None
186
+
187
+ min_distance = float('inf')
188
+ closest_face_idx = None
189
+
190
+ for i, face in enumerate(faces):
191
+ face_bbox = [
192
+ face['bbox'][0], face['bbox'][1],
193
+ face['bbox'][2], face['bbox'][3]
194
+ ]
195
+ distance = self.calculate_distance(bubble_bbox, face_bbox)
196
+
197
+ if distance < min_distance and distance <= self.proximity_threshold:
198
+ min_distance = distance
199
+ closest_face_idx = i
200
+
201
+ return closest_face_idx
202
+
203
+ def associate_bubble_to_face(self, text_item: Dict, faces: List[Dict],
204
+ bodies: List[Dict], original_bodies: List[Dict]) -> Optional[int]:
205
+ """Associate a speech bubble to a face using multiple strategies."""
206
+ bubble_bbox = text_item['bbox']
207
+
208
+ # Strategy 1: Direct body ID to character mapping
209
+ body_id = text_item.get('body_ref')
210
+ if body_id:
211
+ character_id = self.find_character_by_body_id(body_id, original_bodies)
212
+ if character_id:
213
+ face_idx = self.find_face_by_character(character_id, faces)
214
+ if face_idx is not None:
215
+ return face_idx
216
+
217
+ # Strategy 2: Spatial proximity - find closest face
218
+ closest_face_idx = self.find_closest_face(bubble_bbox, faces)
219
+ if closest_face_idx is not None:
220
+ return closest_face_idx
221
+
222
+ # Strategy 3: Single face fallback
223
+ if len(faces) == 1:
224
+ return 0
225
+
226
+ return None
227
+
228
+ def process_panel(self, page_data: Dict, frame_data: Dict, page_index: int,
229
+ frame_index: int, character_mapping: Dict[str, str],
230
+ image_path: str = None, panels_output_dir: str = None) -> Dict[str, Any]:
231
+ """Process a single panel (frame) and create GCN dataset entry."""
232
+
233
+ # Extract panel image if paths provided
234
+ panel_image_path = None
235
+ if image_path and panels_output_dir:
236
+ panel_image_path = self.extract_panel_from_page(
237
+ image_path, frame_data, panels_output_dir, page_index, frame_index
238
+ )
239
+
240
+ # Get frame bounding box
241
+ frame_bbox = [
242
+ frame_data['@xmin'], frame_data['@ymin'],
243
+ frame_data['@xmax'], frame_data['@ymax']
244
+ ]
245
+
246
+ panel_width = frame_bbox[2] - frame_bbox[0]
247
+ panel_height = frame_bbox[3] - frame_bbox[1]
248
+
249
+ # Find elements within this frame
250
+ panel_faces = []
251
+ panel_bodies = []
252
+ panel_bubbles = []
253
+
254
+ # Process faces
255
+ for face in page_data.get('face', []):
256
+ face_bbox = [face['@xmin'], face['@ymin'], face['@xmax'], face['@ymax']]
257
+ if self.is_element_in_frame(face_bbox, frame_bbox):
258
+ adjusted_bbox = self.adjust_coordinates_to_frame(face_bbox, frame_bbox)
259
+ character_id = face.get('@character', '')
260
+ character_name = character_mapping.get(character_id, character_id)
261
+
262
+ panel_faces.append({
263
+ "face_id": len(panel_faces),
264
+ "bbox": adjusted_bbox,
265
+ "original_id": face.get('@id', ''),
266
+ "attributes": {
267
+ "character_id": character_id,
268
+ "character_name": character_name
269
+ }
270
+ })
271
+
272
+ if character_id:
273
+ self.stats['unique_characters'].add(character_id)
274
+
275
+ # Process bodies
276
+ for body in page_data.get('body', []):
277
+ body_bbox = [body['@xmin'], body['@ymin'], body['@xmax'], body['@ymax']]
278
+ if self.is_element_in_frame(body_bbox, frame_bbox):
279
+ adjusted_bbox = self.adjust_coordinates_to_frame(body_bbox, frame_bbox)
280
+ character_id = body.get('@character', '')
281
+ character_name = character_mapping.get(character_id, character_id)
282
+
283
+ panel_bodies.append({
284
+ "body_id": len(panel_bodies),
285
+ "bbox": adjusted_bbox,
286
+ "original_id": body.get('@id', ''),
287
+ "attributes": {
288
+ "character_id": character_id,
289
+ "character_name": character_name
290
+ }
291
+ })
292
+
293
+ # Process text/speech bubbles
294
+ for text in page_data.get('text', []):
295
+ text_bbox = [text['@xmin'], text['@ymin'], text['@xmax'], text['@ymax']]
296
+ if self.is_element_in_frame(text_bbox, frame_bbox):
297
+ adjusted_bbox = self.adjust_coordinates_to_frame(text_bbox, frame_bbox)
298
+
299
+ panel_bubbles.append({
300
+ "bubble_id": len(panel_bubbles),
301
+ "bbox": adjusted_bbox,
302
+ "text": text.get('#text', ''),
303
+ "original_id": text.get('@id', ''),
304
+ "body_ref": text.get('@bodyid', '')
305
+ })
306
+
307
+ # Create bubble-to-face links
308
+ links = []
309
+ original_bodies = page_data.get('body', [])
310
+
311
+ for bubble in panel_bubbles:
312
+ face_idx = self.associate_bubble_to_face(
313
+ bubble, panel_faces, panel_bodies, original_bodies
314
+ )
315
+ if face_idx is not None:
316
+ links.append({
317
+ "bubble_id": bubble['bubble_id'],
318
+ "face_id": face_idx
319
+ })
320
+ self.stats['successful_links'] += 1
321
+ else:
322
+ self.stats['failed_links'] += 1
323
+
324
+ # Update statistics
325
+ self.stats['total_panels'] += 1
326
+ self.stats['total_bubbles'] += len(panel_bubbles)
327
+ self.stats['total_faces'] += len(panel_faces)
328
+ self.stats['total_bodies'] += len(panel_bodies)
329
+
330
+ # Create panel ID
331
+ panel_id = f"page_{page_index:04d}_panel_{frame_index:03d}"
332
+
333
+ return {
334
+ "panel_id": panel_id,
335
+ "width": panel_width,
336
+ "height": panel_height,
337
+ "panel_image_path": panel_image_path,
338
+ "bubbles": panel_bubbles,
339
+ "faces": panel_faces,
340
+ "bodies": panel_bodies,
341
+ "links": links,
342
+ "metadata": {
343
+ "original_page_index": page_index,
344
+ "original_frame_index": frame_index,
345
+ "original_frame_id": frame_data.get('@id', ''),
346
+ "original_frame_bbox": frame_bbox
347
+ }
348
+ }
349
+
350
+ def convert_dataset(self, input_file: str, output_file: str,
351
+ image_dir: str = None, panels_output_dir: str = None) -> Dict[str, Any]:
352
+ """Convert manga dataset to panel-based GCN format."""
353
+ print(f"Loading manga data from: {input_file}")
354
+ manga_data = self.load_manga_data(input_file)
355
+
356
+ # Extract title and character mapping
357
+ title = manga_data.get('title', 'Unknown')
358
+ character_list = manga_data.get('character', [])
359
+ character_mapping = self.build_character_mapping(character_list)
360
+
361
+ print(f"Title: {title}")
362
+ print(f"Characters found: {len(character_mapping)}")
363
+
364
+ # Create panels output directory if specified
365
+ if panels_output_dir:
366
+ os.makedirs(panels_output_dir, exist_ok=True)
367
+ print(f"Panel extraction enabled. Output directory: {panels_output_dir}")
368
+
369
+ # Process all pages and frames
370
+ pages = manga_data.get('page', [])
371
+ converted_panels = []
372
+
373
+ for page_idx, page_data in enumerate(pages):
374
+ self.stats['total_pages'] += 1
375
+
376
+ # Construct image path if image directory provided
377
+ image_path = os.path.join(image_dir, f"{str(page_data.get('@index', page_idx)).zfill(3)}.jpg") if image_dir else None
378
+
379
+ # Process each frame in the page
380
+ frames = page_data.get('frame', [])
381
+ for frame_idx, frame_data in enumerate(frames):
382
+ panel = self.process_panel(
383
+ page_data, frame_data, page_idx, frame_idx,
384
+ character_mapping, image_path, panels_output_dir
385
+ )
386
+ converted_panels.append(panel)
387
+
388
+ # Create final dataset
389
+ dataset = {
390
+ "title": title,
391
+ "character_mapping": character_mapping,
392
+ "panels": converted_panels,
393
+ "conversion_stats": {
394
+ "total_pages": self.stats['total_pages'],
395
+ "total_panels": self.stats['total_panels'],
396
+ "total_bubbles": self.stats['total_bubbles'],
397
+ "total_faces": self.stats['total_faces'],
398
+ "total_bodies": self.stats['total_bodies'],
399
+ "successful_links": self.stats['successful_links'],
400
+ "failed_links": self.stats['failed_links'],
401
+ "extracted_panels": self.stats['extracted_panels'],
402
+ "unique_characters": len(self.stats['unique_characters']),
403
+ "link_success_rate": (self.stats['successful_links'] /
404
+ max(1, self.stats['total_bubbles'])) * 100
405
+ }
406
+ }
407
+
408
+ # Save converted dataset
409
+ print(f"Saving converted dataset to: {output_file}")
410
+ with open(output_file, 'w', encoding='utf-8') as f:
411
+ json.dump(dataset, f, ensure_ascii=False, indent=2)
412
+
413
+ # Print statistics
414
+ self.print_conversion_stats()
415
+
416
+ return dataset
417
+
418
+ def print_conversion_stats(self):
419
+ """Print conversion statistics."""
420
+ print("\n=== Conversion Statistics ===")
421
+ print(f"Total pages processed: {self.stats['total_pages']}")
422
+ print(f"Total panels extracted: {self.stats['total_panels']}")
423
+ print(f"Total speech bubbles: {self.stats['total_bubbles']}")
424
+ print(f"Total faces: {self.stats['total_faces']}")
425
+ print(f"Total bodies: {self.stats['total_bodies']}")
426
+ print(f"Successful links: {self.stats['successful_links']}")
427
+ print(f"Failed links: {self.stats['failed_links']}")
428
+ print(f"Panel images extracted: {self.stats['extracted_panels']}")
429
+ print(f"Unique characters: {len(self.stats['unique_characters'])}")
430
+ if self.stats['total_bubbles'] > 0:
431
+ success_rate = (self.stats['successful_links'] / self.stats['total_bubbles']) * 100
432
+ print(f"Link success rate: {success_rate:.1f}%")
433
+ print("=" * 30)
434
+
435
+
436
+ def json_to_gcn(config):
437
+ # !python ./frame.py ../json/AkkeraKanjinchou.json ./output2.json --image-dir ../Manga109/images/AkkeraKanjinchou/ --panels-output-dir ./frames1/
438
+ # parser = argparse.ArgumentParser(description='Convert manga dataset to panel-based GCN format')
439
+ # parser.add_argument('input_file', help='Input JSON file with manga annotations')
440
+ # parser.add_argument('output_file', help='Output JSON file for GCN dataset')
441
+ # parser.add_argument('--image-dir', type=str, required=True,
442
+ # help='Directory containing manga page images')
443
+ # parser.add_argument('--panels-output-dir', type=str, required=True,
444
+ # help='Directory to save extracted panel images')
445
+ # parser.add_argument('--fuzzy-mapping', action='store_true', default=True,
446
+ # help='Enable fuzzy ID matching (default: True)')
447
+ # parser.add_argument('--proximity-threshold', type=float, default=200.0,
448
+ # help='Maximum distance for spatial proximity matching (default: 200.0)')
449
+ # parser.add_argument('--overlap-threshold', type=float, default=0.5,
450
+ # help='Minimum overlap ratio for element-frame association (default: 0.5)')
451
+
452
+ # args = parser.parse_args()
453
+ base_dir = config["root"]
454
+ manga109_root_dir = config["manga109_root_dir"]
455
+ root_json = config["root_json"]
456
+
457
+ for book_json in os.listdir(root_json):
458
+ book_name = str(book_json).split(".")[0]
459
+ panel_dir = os.path.join(base_dir,"panels",book_name)
460
+ if not os.path.exists(panel_dir):
461
+ os.makedirs(panel_dir)
462
+
463
+ # Convert dataset
464
+ converter = MangaPanelDatasetConverter(
465
+ # fuzzy_mapping=args.fuzzy_mapping,
466
+ # proximity_threshold=args.proximity_threshold
467
+ )
468
+ input_file = os.path.join(root_json,book_json)
469
+ output_file = os.path.join(base_dir,"panel_data",book_json)
470
+ image_dir = os.path.join(manga109_root_dir,"images",book_name)
471
+ try:
472
+ result = converter.convert_dataset(
473
+ input_file,
474
+ output_file,
475
+ image_dir,
476
+ panel_dir
477
+ )
478
+ print(f"\nConversion completed successfully!")
479
+ print(f"GCN dataset saved to: {output_file}")
480
+ print(f"Panel images saved to: {panel_dir}")
481
+
482
+ except Exception as e:
483
+ print(f"Error during conversion: {str(e)}")
484
+
utils/panel_ordering.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+
3
+ class BoundingBox(object):
4
+ def __init__(self, xmin=None, ymin=None, xmax=None, ymax=None,
5
+ panels=None,
6
+ bbtype=None,
7
+ id_=""):
8
+ self.dict = {
9
+ "@xmin": xmin,
10
+ "@ymin": ymin,
11
+ "@xmax": xmax,
12
+ "@ymax": ymax,
13
+ "@id": id_,
14
+ }
15
+ if panels is None:
16
+ self.panels = [self]
17
+ else:
18
+ self.panels = panels
19
+ self.bbtype = bbtype
20
+
21
+ def init_dict(self, d):
22
+ self.dict = d
23
+ self.dict["@xmin"] = float(self.xmin)
24
+ self.dict["@ymin"] = float(self.ymin)
25
+ self.dict["@xmax"] = float(self.xmax)
26
+ self.dict["@ymax"] = float(self.ymax)
27
+ return self
28
+
29
+ def __getitem__(self, index):
30
+ return self.dict[index]
31
+
32
+ @property
33
+ def xmin(self):
34
+ return self.dict["@xmin"]
35
+
36
+ @property
37
+ def xmax(self):
38
+ return self.dict["@xmax"]
39
+
40
+ @property
41
+ def ymin(self):
42
+ return self.dict["@ymin"]
43
+
44
+ @property
45
+ def ymax(self):
46
+ return self.dict["@ymax"]
47
+
48
+ @property
49
+ def width(self):
50
+ return self.xmax - self.xmin
51
+
52
+ @property
53
+ def height(self):
54
+ return self.ymax - self.ymin
55
+
56
+ @property
57
+ def text(self):
58
+ return self.dict["#text"]
59
+
60
+ @property
61
+ def id(self):
62
+ return self.dict["@id"]
63
+
64
+ @property
65
+ def list(self):
66
+ return [self.xmin, self.ymin, self.xmax, self.ymax]
67
+
68
+ @property
69
+ def is_null(self):
70
+ return self.xmin is None or self.ymin is None or self.xmax is None or self.ymax is None
71
+
72
+ @property
73
+ def area(self):
74
+ if self.xmax is None or self.xmin is None or self.ymax is None or self.ymin is None:
75
+ return 0
76
+ return (self.xmax - self.xmin) * (self.ymax - self.ymin)
77
+
78
+ @property
79
+ def base_panels(self):
80
+ return len(self.panels)
81
+
82
+
83
+ def __getitem__(self, item):
84
+ return self.dict[item]
85
+
86
+ def __add__(self, a):
87
+ assert issubclass(type(a), BoundingBox)
88
+ if a.is_null:
89
+ return self
90
+ elif self.is_null:
91
+ return a
92
+ return BoundingBox(xmin=min(self.xmin, a.xmin),
93
+ ymin=min(self.ymin, a.ymin),
94
+ xmax=max(self.xmax, a.xmax),
95
+ ymax=max(self.ymax, a.ymax),
96
+ panels=self.panels + a.panels)
97
+
98
+ def __mul__(self, a):
99
+ assert issubclass(type(a), BoundingBox)
100
+ bb = BoundingBox(xmin=max(self.xmin, a.xmin),
101
+ ymin=max(self.ymin, a.ymin),
102
+ xmax=min(self.xmax, a.xmax),
103
+ ymax=min(self.ymax, a.ymax),
104
+ panels=self.panels + a.panels)
105
+ if bb.xmin > bb.xmax or bb.ymin > bb.ymax:
106
+ return BoundingBox()
107
+ else:
108
+ return bb
109
+
110
+ def __repr__(self):
111
+ return "<BoundingBox({},{}) {},{},{},{},{}>".format(self.bbtype, self.id, *self.list, self.base_panels)
112
+
113
+
114
+
115
+ def get_pivot_side(zmin, zmax, pivot):
116
+ interception_ratio_threshold = 0.25
117
+ if pivot <= zmin:
118
+ return 1
119
+ elif zmax <= pivot:
120
+ return 0
121
+ else:
122
+ pivot_z_ratio = (pivot - zmin) / (zmax - zmin)
123
+ interception_ratio = min(pivot_z_ratio, 1 - pivot_z_ratio)
124
+
125
+ if interception_ratio > interception_ratio_threshold:
126
+ return -1
127
+ else:
128
+ return 0 if pivot_z_ratio > 0.5 else 1
129
+
130
+
131
+ class BoxSet(set):
132
+ def get_highest_priority_division(self):
133
+ # Horizontal division
134
+ ydivs = sorted([bb.ymin for bb in self] + [bb.ymax for bb in self])
135
+ for pivot in ydivs:
136
+ division = self.get_pivot_division(pivot,
137
+ is_horizontal_division=True)
138
+ if len(division) > 1:
139
+ return division
140
+
141
+ # Vertical division
142
+ xdivs = sorted([bb.xmin for bb in self] + [bb.xmax for bb in self], reverse=True)
143
+ for pivot in xdivs:
144
+ division = self.get_pivot_division(pivot,
145
+ is_horizontal_division=False)
146
+ if len(division) > 1:
147
+ return division
148
+
149
+ # Undividable box set
150
+ return [self]
151
+
152
+ def get_pivot_division(self, pivot, is_horizontal_division):
153
+ divs = [BoxSet(), BoxSet()]
154
+ for bb in self:
155
+ if is_horizontal_division:
156
+ side = get_pivot_side(bb.ymin, bb.ymax, pivot)
157
+ else:
158
+ side = get_pivot_side(-bb.xmax, -bb.xmin, -pivot)
159
+
160
+ if side == -1:
161
+ return [self]
162
+ else:
163
+ divs[side].add(bb)
164
+ if len(divs[0]) == 0 or len(divs[1]) == 0:
165
+ return [self]
166
+ return divs
167
+
168
+ def get_multicut_division(self, cuts):
169
+ curset = self
170
+ cur_division = []
171
+ for cut in cuts:
172
+ pivot, is_horizontal_division = cut
173
+ division = curset.get_pivot_division(pivot, is_horizontal_division)
174
+ if len(division) > 1:
175
+ cur_division.append(division[0])
176
+ curset = division[1]
177
+ if len(cur_division) > 0:
178
+ return cur_division + [curset]
179
+ else:
180
+ return [self]
181
+
182
+ def yield_ordered_bbs(self):
183
+ if len(self) == 0:
184
+ pass
185
+ elif len(self) > 1:
186
+ yield self.sum(), False
187
+ else:
188
+ yield next(iter(self)), True
189
+
190
+ def sum(self):
191
+ if len(self) == 0:
192
+ return BoundingBox()
193
+ else:
194
+ l = list(self)
195
+ return sum(l[1:], l[0])
196
+
197
+ class BoxNode(object):
198
+ def __init__(self, bbset, initial_cuts=None):
199
+ if initial_cuts:
200
+ division = bbset.get_multicut_division(initial_cuts)
201
+ else:
202
+ division = [bbset]
203
+
204
+ if len(division) == 1:
205
+ division = bbset.get_highest_priority_division()
206
+
207
+ isLeaf = len(division) <= 1
208
+ self.division = division if isLeaf else [BoxNode(section) for section in division]
209
+
210
+ def yield_ordered_bbs(self):
211
+ for section in self.division:
212
+ for bb in section.yield_ordered_bbs():
213
+ yield bb
214
+
215
+ class BoxOrderEstimator(object):
216
+ def __init__(self, bbs, pagewidth=None, initial_cut_option=None):
217
+ if initial_cut_option == "two-page-four-panel":
218
+ initial_cuts = [(pagewidth * n / 4, False)
219
+ for n in reversed(range(1, 4))]
220
+ elif initial_cut_option == "two-page":
221
+ initial_cuts = [(pagewidth / 2, False)]
222
+ else:
223
+ initial_cuts = None
224
+
225
+ self.boxnode = BoxNode(BoxSet(bbs), initial_cuts)
226
+ t = tuple(zip(*self.boxnode.yield_ordered_bbs()))
227
+ if len(t) > 0:
228
+ self.ordered_bbs, self.bb_estimation_statuses = t
229
+ else:
230
+ self.ordered_bbs, self.bb_estimation_statuses = (), ()
231
+
232
+
233
+ def panel_ordering(test_image,dets):
234
+ image = cv2.imread(test_image)
235
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for display
236
+ # interception_ratio_threshold = 0.25
237
+
238
+ # print(predictions_frame)
239
+
240
+ panels = set()
241
+ # for panel in dets['frame']:
242
+ for panel in dets.panels:
243
+ panel = panel['bbox']
244
+ panels.add(BoundingBox(panel[0],panel[1],panel[2],panel[3]))
245
+ # print(panels)
246
+
247
+ # image = page.get_image()
248
+ # page width/height are taken from image.shape below (cv2.imread returns a NumPy array)
249
+ pageheight, pagewidth, pagechannels = image.shape
250
+
251
+ # panels = page.get_bbs()["frame"]
252
+ # print(panels)
253
+ boxOrderEstimator = BoxOrderEstimator(
254
+ panels,
255
+ pagewidth=pagewidth,
256
+ initial_cut_option="two-page")
257
+
258
+ return boxOrderEstimator
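A minimal usage sketch of the ordering classes above, assuming this module is importable as utils.panel_ordering; the three panel boxes are invented. Horizontal pivots are tried first and vertical pivots are scanned right to left, which reproduces the usual right-to-left manga reading order.

from utils.panel_ordering import BoundingBox, BoxOrderEstimator

# Three hypothetical panels on an 800-px-wide page: a full-width top row,
# then a bottom row that is read right-to-left.
panels = {
    BoundingBox(0, 0, 800, 300),      # top
    BoundingBox(400, 300, 800, 600),  # bottom-right (read first in the bottom row)
    BoundingBox(0, 300, 400, 600),    # bottom-left
}

est = BoxOrderEstimator(panels, pagewidth=800, initial_cut_option=None)
for order, bb in enumerate(est.ordered_bbs, start=1):
    print(order, bb.list)   # expected order: top, bottom-right, bottom-left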
utils/train_speaker.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Complete GCN Training Pipeline for Speech Bubble to Speaker Association
4
+ Fixed version that handles the dataset format correctly and resolves training issues.
5
+ """
6
+
7
+ import json
8
+ import torch
9
+ import numpy as np
10
+ import random
11
+ from torch_geometric.data import HeteroData, Batch
12
+ import torch.nn as nn
13
+ from scipy.optimize import linear_sum_assignment
14
+ from typing import Dict, List, Any, Optional, Tuple
15
+ # from utils.utilities import save_checkpoint
16
+ import os
17
+
18
+ from pathlib import Path
19
+ CHECKPOINT_DIR = Path("checkpoints")
20
+ CHECKPOINT_DIR.mkdir(exist_ok=True)
21
+
22
+ def save_checkpoint(model: torch.nn.Module,
23
+ epoch: int,
24
+ loss: float,
25
+ path: Path = CHECKPOINT_DIR / "assoc_gcn.pt") -> None:
26
+ """
27
+ Persist full training state so you can resume fine-tuning later.
28
+ """
29
+ path = Path(path)
30
+ torch.save({
31
+ "epoch": epoch,
32
+ "loss": loss,
33
+ "model_state": model.state_dict()
34
+ }, path)
35
+ print(f"✅ Model checkpoint saved to {path.resolve()}")
36
+
37
+ class DatasetLoader:
38
+ """Handles loading and preprocessing of the converted GCN dataset"""
39
+
40
+ @staticmethod
41
+ def load_converted_dataset(json_path: str) -> List[HeteroData]:
42
+ """Load the converted GCN dataset and create PyTorch Geometric HeteroData objects"""
43
+ with open(json_path, 'r', encoding='utf-8') as f:
44
+ data = json.load(f)
45
+
46
+ dataset = []
47
+ panels = data.get('panels', [])
48
+
49
+ for panel in panels:
50
+ het_data = DatasetLoader.create_hetero_data_from_panel(panel)
51
+ if het_data is not None:
52
+ dataset.append(het_data)
53
+
54
+ print(f"Loaded {len(dataset)} panels from {json_path}")
55
+ return dataset
56
+
57
+ @staticmethod
58
+ def create_hetero_data_from_panel(panel: Dict) -> Optional[HeteroData]:
59
+ """Convert a single panel from the converted dataset into HeteroData format"""
60
+ bubbles = panel.get('bubbles', [])
61
+ faces = panel.get('faces', [])
62
+ links = panel.get('links', [])
63
+
64
+ if len(bubbles) == 0 or len(faces) == 0:
65
+ return None
66
+
67
+ W, H = panel['width'], panel['height']
68
+
69
+ # Create node features
70
+ bubble_features = []
71
+ for bubble in bubbles:
72
+ x1, y1, x2, y2 = bubble['bbox']
73
+ cx, cy = (x1 + x2) / (2 * W), (y1 + y2) / (2 * H)
74
+ w, h = (x2 - x1) / W, (y2 - y1) / H
75
+ area = w * h
76
+ aspect = w / h if h > 0 else 1.0
77
+ bubble_features.append([cx, cy, w, h, area, aspect])
78
+
79
+ face_features = []
80
+ for face in faces:
81
+ x1, y1, x2, y2 = face['bbox']
82
+ cx, cy = (x1 + x2) / (2 * W), (y1 + y2) / (2 * H)
83
+ w, h = (x2 - x1) / W, (y2 - y1) / H
84
+ area = w * h
85
+ aspect = w / h if h > 0 else 1.0
86
+ face_features.append([cx, cy, w, h, area, aspect])
87
+
88
+ # Create edge indices and features
89
+ edge_indices, edge_features, edge_labels = [], [], []
90
+
91
+ # Create mappings
92
+ bubble_id_to_idx = {bubble['bubble_id']: i for i, bubble in enumerate(bubbles)}
93
+ face_id_to_idx = {face['face_id']: i for i, face in enumerate(faces)}
94
+
95
+ # Create ground truth mapping
96
+ gt_links = {}
97
+ for link in links:
98
+ if link['bubble_id'] in bubble_id_to_idx and link['face_id'] in face_id_to_idx:
99
+ bubble_idx = bubble_id_to_idx[link['bubble_id']]
100
+ face_idx = face_id_to_idx[link['face_id']]
101
+ gt_links[(bubble_idx, face_idx)] = 1
102
+
103
+ # Create all possible bubble-face edges
104
+ for i, bubble in enumerate(bubbles):
105
+ for j, face in enumerate(faces):
106
+ # Calculate edge features
107
+ b_x1, b_y1, b_x2, b_y2 = bubble['bbox']
108
+ f_x1, f_y1, f_x2, f_y2 = face['bbox']
109
+
110
+ b_cx, b_cy = (b_x1 + b_x2) / (2 * W), (b_y1 + b_y2) / (2 * H)
111
+ f_cx, f_cy = (f_x1 + f_x2) / (2 * W), (f_y1 + f_y2) / (2 * H)
112
+
113
+ dx, dy = b_cx - f_cx, b_cy - f_cy
114
+ dist = (dx**2 + dy**2)**0.5
115
+
116
+ # Calculate IoU
117
+ xx1, yy1 = max(b_x1, f_x1), max(b_y1, f_y1)
118
+ xx2, yy2 = min(b_x2, f_x2), min(b_y2, f_y2)
119
+ inter = max(0, xx2 - xx1) * max(0, yy2 - yy1)
120
+ union = (b_x2 - b_x1) * (b_y2 - b_y1) + (f_x2 - f_x1) * (f_y2 - f_y1) - inter
121
+ iou = inter / union if union > 0 else 0
122
+
123
+ edge_indices.append([i, j])
124
+ edge_features.append([dx, dy, dist, iou])
125
+ edge_labels.append(1.0 if (i, j) in gt_links else 0.0)
126
+
127
+ if len(edge_indices) == 0:
128
+ return None
129
+
130
+ # Create HeteroData object
131
+ data = HeteroData()
132
+ data['bubble'].x = torch.tensor(bubble_features, dtype=torch.float)
133
+ data['face'].x = torch.tensor(face_features, dtype=torch.float)
134
+ data['bubble', 'to', 'face'].edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
135
+ data['bubble', 'to', 'face'].edge_attr = torch.tensor(edge_features, dtype=torch.float)
136
+ data['bubble', 'to', 'face'].edge_label = torch.tensor(edge_labels, dtype=torch.float)
137
+
138
+ # Add metadata
139
+ data.panel_id = panel['panel_id']
140
+ data.width = W
141
+ data.height = H
142
+
143
+ return data
144
+
145
+
146
+ class AssocGCN(nn.Module):
147
+ """Graph Convolutional Network for Speech Bubble to Speaker Association"""
148
+
149
+ def __init__(self, in_feats: int = 6, hid: int = 128):
150
+ super().__init__()
151
+ self.node_encoder = nn.Sequential(
152
+ nn.Linear(in_feats, hid),
153
+ nn.ReLU(),
154
+ nn.Linear(hid, hid)
155
+ )
156
+
157
+ # Message passing layers
158
+ self.conv1 = nn.Sequential(
159
+ nn.Linear(hid * 2 + 4, hid), # node features + edge features
160
+ nn.ReLU(),
161
+ nn.Linear(hid, hid)
162
+ )
163
+
164
+ self.conv2 = nn.Sequential(
165
+ nn.Linear(hid * 2 + 4, hid),
166
+ nn.ReLU(),
167
+ nn.Linear(hid, hid)
168
+ )
169
+
170
+ self.conv3 = nn.Sequential(
171
+ nn.Linear(hid * 2 + 4, hid),
172
+ nn.ReLU(),
173
+ nn.Linear(hid, hid)
174
+ )
175
+
176
+ # Edge classifier
177
+ self.edge_mlp = nn.Sequential(
178
+ nn.Linear(2 * hid + 4, hid),
179
+ nn.ReLU(),
180
+ nn.Dropout(0.1),
181
+ nn.Linear(hid, 1)
182
+ )
183
+
184
+ def forward(self, data):
185
+ bubble_x = self.node_encoder(data['bubble'].x)
186
+ face_x = self.node_encoder(data['face'].x)
187
+
188
+ edge_index = data['bubble', 'to', 'face'].edge_index
189
+ edge_attr = data['bubble', 'to', 'face'].edge_attr
190
+ src_idx, dst_idx = edge_index[0], edge_index[1]
191
+
192
+ # Apply message passing
193
+ for conv in [self.conv1, self.conv2, self.conv3]:
194
+ src_features = bubble_x[src_idx]
195
+ dst_features = face_x[dst_idx]
196
+
197
+ edge_input = torch.cat([src_features, dst_features, edge_attr], dim=1)
198
+ edge_updates = conv(edge_input)
199
+
200
+ # Update node features (simplified aggregation)
201
+ bubble_updates = torch.zeros_like(bubble_x)
202
+ face_updates = torch.zeros_like(face_x)
203
+
204
+ for i in range(len(src_idx)):
205
+ s, d = src_idx[i].item(), dst_idx[i].item()
206
+ bubble_updates[s] += edge_updates[i]
207
+ face_updates[d] += edge_updates[i]
208
+
209
+ # Normalize by degree
210
+ bubble_degrees = torch.bincount(src_idx, minlength=bubble_x.size(0)).float().clamp(min=1)
211
+ face_degrees = torch.bincount(dst_idx, minlength=face_x.size(0)).float().clamp(min=1)
212
+
213
+ bubble_updates = bubble_updates / bubble_degrees.unsqueeze(1)
214
+ face_updates = face_updates / face_degrees.unsqueeze(1)
215
+
216
+ # Residual connection
217
+ bubble_x = bubble_x + bubble_updates
218
+ face_x = face_x + face_updates
219
+
220
+ # Final edge prediction
221
+ src_final = bubble_x[src_idx]
222
+ dst_final = face_x[dst_idx]
223
+ edge_input = torch.cat([src_final, dst_final, edge_attr], dim=1)
224
+ logits = self.edge_mlp(edge_input).squeeze(-1)
225
+
226
+ return logits
227
+
228
+
229
+ def hungarian_matching(scores: torch.Tensor, src_indices, dst_indices):
230
+ """Apply Hungarian algorithm for optimal bipartite matching"""
231
+ if len(scores) == 0:
232
+ return {}
233
+
234
+ num_bubbles = src_indices.max().item() + 1 if len(src_indices) > 0 else 0
235
+ num_faces = dst_indices.max().item() + 1 if len(dst_indices) > 0 else 0
236
+
237
+ cost_matrix = np.full((num_bubbles, num_faces), 1e6, dtype=np.float32)
238
+
239
+ scores_np = scores.detach().cpu().sigmoid().numpy()
240
+ for i, (s, d, score) in enumerate(zip(src_indices.cpu(), dst_indices.cpu(), scores_np)):
241
+ cost_matrix[s, d] = -score # Negative for minimization
242
+
243
+ row_indices, col_indices = linear_sum_assignment(cost_matrix)
244
+
245
+ mapping = {}
246
+ for r, c in zip(row_indices, col_indices):
247
+ if cost_matrix[r, c] < 0: # Valid assignment
248
+ mapping[int(r)] = int(c)
249
+
250
+ return mapping
251
+
252
+
253
+ def train_gcn(dataset: List[HeteroData], epochs: int = 200, batch_size: int = 16, lr: float = 1e-4):
254
+ """Train the GCN model on the dataset"""
255
+ if len(dataset) == 0:
256
+ raise ValueError("Dataset is empty!")
257
+
258
+ print(f"Training on {len(dataset)} panels...")
259
+
260
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
261
+ model = AssocGCN().to(device)
262
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
263
+
264
+ # Calculate class weights
265
+ total_positive = sum(data['bubble', 'to', 'face'].edge_label.sum().item() for data in dataset)
266
+ total_edges = sum(len(data['bubble', 'to', 'face'].edge_label) for data in dataset)
267
+ pos_weight = (total_edges - total_positive) / total_positive if total_positive > 0 else 9.0
268
+
269
+ print(f"Positive edges: {total_positive}/{total_edges} ({100*total_positive/total_edges:.1f}%)")
270
+ print(f"Using pos_weight: {pos_weight:.2f}")
271
+
272
+ loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))
273
+
274
+ model.train()
275
+ best_loss = float("inf")
276
+ for epoch in range(epochs):
277
+ total_correct = 0
278
+ total_samples = 0
279
+ total_tp = 0
280
+ total_fp = 0
281
+ total_fn = 0
282
+ # FIXED: Properly shuffle the dataset (it's a list, not a dict)
283
+ shuffled_dataset = dataset.copy()
284
+ random.shuffle(shuffled_dataset)
285
+
286
+ total_loss = 0.0
287
+ num_batches = (len(shuffled_dataset) + batch_size - 1) // batch_size
288
+
289
+ for batch_idx in range(num_batches):
290
+ start_idx = batch_idx * batch_size
291
+ end_idx = min(start_idx + batch_size, len(shuffled_dataset))
292
+ batch_data = shuffled_dataset[start_idx:end_idx]
293
+
294
+ # Create batch
295
+ batch = Batch.from_data_list(batch_data).to(device)
296
+
297
+ # Forward pass
298
+ logits = model(batch)
299
+ labels = batch['bubble', 'to', 'face'].edge_label
300
+
301
+ # Compute loss
302
+ loss = loss_fn(logits, labels)
303
+
304
+ # Backward pass
305
+ optimizer.zero_grad()
306
+ loss.backward()
307
+ optimizer.step()
308
+
309
+ total_loss += loss.item() * len(batch_data)
310
+
311
+ # Accuracy and confusion counts (TP/FP/FN) for the epoch metrics
312
+ probs = torch.sigmoid(logits)
313
+ preds = (probs > 0.5).float()
314
+ correct = (preds == labels).sum().item()
315
+ total_correct += correct
316
+ total_samples += labels.numel()
317
+ total_tp += ((preds == 1) & (labels == 1)).sum().item()
318
+ total_fp += ((preds == 1) & (labels == 0)).sum().item()
319
+ total_fn += ((preds == 0) & (labels == 1)).sum().item()
320
+
321
+ avg_loss = total_loss / len(shuffled_dataset)
322
+ accuracy = total_correct / total_samples
323
+ if avg_loss < best_loss:
324
+ best_loss = avg_loss
325
+ save_checkpoint(model, epoch+1, best_loss) # epoch is 0-indexed
326
+
327
+ # Compute recall and F1
328
+ recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
329
+ precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
330
+ f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
331
+
332
+ print(f"Epoch {epoch+1:02d}/{epochs}: Loss = {avg_loss:.4f}, Accuracy = {accuracy:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")
333
+
334
+
335
+ print("Training completed!")
336
+ return model
337
+
338
+
339
+ def infer_associations(model, data):
340
+ """Infer speech bubble to speaker associations"""
341
+ device = next(model.parameters()).device
342
+ data = data.to(device)
343
+
344
+ model.eval()
345
+ with torch.no_grad():
346
+ logits = model(data)
347
+ src, dst = data['bubble', 'to', 'face'].edge_index
348
+ mapping = hungarian_matching(logits, src, dst)
349
+
350
+ return mapping
351
+
352
+
353
+ # Example usage and testing
354
+ def train_speaker(config):
355
+ # Test with sample data
356
+ dataset = []
357
+ for panel_data_file in os.listdir(os.path.join(config["root"]+"panel_data/")):
358
+ try:
359
+ print(panel_data_file)
360
+ # Load your converted dataset
361
+ dataset += DatasetLoader.load_converted_dataset(os.path.join(config["root"]+"panel_data/",panel_data_file))
362
+
363
+ # if len(dataset) == 0:
364
+ # print("No valid panels found in dataset!")
365
+ # else:
366
+ # # Train the model
367
+ # model = train_gcn(dataset, epochs=10, batch_size=16) # Reduced epochs for testing
368
+
369
+ # # Test inference on first panel
370
+ # test_data = dataset[0]
371
+ # print(test_data)
372
+ # mapping = infer_associations(model, test_data)
373
+
374
+ # print("\nInference Results:")
375
+ # for bubble_id, face_id in mapping.items():
376
+ # print(f"Bubble {bubble_id} → Face {face_id}")
377
+
378
+ except FileNotFoundError:
379
+ print("Error: ./output.json not found!")
380
+ print("Please ensure your converted dataset file exists.")
381
+ except Exception as e:
382
+ print(f"Error: {e}")
383
+ print("Please check your dataset format and file paths.")
384
+
385
+ model = train_gcn(dataset, epochs=30, batch_size=16) # Reduced epochs for testing
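A minimal sketch of how the pieces above are meant to be driven outside of train_speaker(), assuming a converted panel-level JSON (as written by json_to_gcn) exists at the hypothetical path ./panel_data/Book.json:

from utils.train_speaker import DatasetLoader, train_gcn, infer_associations

panels = DatasetLoader.load_converted_dataset("./panel_data/Book.json")  # hypothetical file
if panels:
    model = train_gcn(panels, epochs=5, batch_size=8)   # short run for illustration
    mapping = infer_associations(model, panels[0])      # {bubble_index: face_index} via Hungarian matching
    for bubble_idx, face_idx in mapping.items():
        print(f"bubble {bubble_idx} -> face {face_idx}")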
utils/transcript_page.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.japanese_ocr import MangaOcr
2
+ import cv2
3
+ from PIL import Image
4
+
5
+ def transcript_page(test_image,translator,predictions):
6
+ image = cv2.imread(test_image)
7
+ mocr = MangaOcr()
8
+ # for bubble in predictions.speech_bubbles:
9
+ # xmin,ymin,xmax,ymax = bubble['bbox']
10
+ # text = mocr(Image.fromarray(cv2.cvtColor(image[ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB)))
11
+ # print(text)
12
+ # result = translator(text)
13
+ # print(result[0]['translation_text'])
14
+
15
+ # for panel in predictions.panels:
16
+ # print(panel)
17
+ japanese_transcript = []
18
+ english_transcript = []
19
+ for i in range(len(predictions.panels)):
20
+ # print("ver",i)
21
+ cur_panel = [b for b in predictions.panels if b['order'] == i]
22
+ # print("ver",cur_panel)
23
+ if(len(cur_panel) > 0):
24
+ # If subpanels exist, the order index range can exceed the actual number of detected panels, so some i have no matching panel
25
+ panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == cur_panel[0]['id']]
26
+ panel_bubbles.sort(key=lambda b: b['seq'])
27
+ for bubble in panel_bubbles:
28
+ xmin,ymin,xmax,ymax = bubble['bbox']
29
+ char_id = bubble['char_id'] if 'char_id' in bubble.keys() else None
30
+ text = mocr(Image.fromarray(cv2.cvtColor(image[ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB)))
31
+ print("Character ",char_id," : ",text)
32
+ japanese_transcript.append({"char_id":char_id,"text":text})
33
+ result = translator(text)
34
+ print("Character ",char_id," : ",result[0]['translation_text'])
35
+ english_transcript.append({"char_id":char_id,"text":result[0]['translation_text']})
36
+
37
+ return japanese_transcript,english_transcript
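transcript_page expects a callable translator that returns a list with a translation_text field, which is what a Hugging Face translation pipeline produces. A minimal sketch, with the model name, the page path, and the detection stand-in all assumed rather than taken from the repo:

from types import SimpleNamespace
from transformers import pipeline
from utils.transcript_page import transcript_page

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en")  # assumed ja->en model

# Stand-in for the detection output: one ordered panel holding a single bubble (coordinates invented).
predictions = SimpleNamespace(
    panels=[{"id": 0, "order": 0, "bbox": [0, 0, 800, 600]}],
    speech_bubbles=[{"panel_id": 0, "seq": 0, "bbox": [50, 40, 200, 160]}],
)

ja, en = transcript_page("page.jpg", translator, predictions)  # page.jpg is a placeholder path
for line in en:
    print(line["char_id"], ":", line["text"])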
utils/utilities.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ✨ utils.py ── reusable helpers
2
+ import torch
3
+ from pathlib import Path
4
+ from typing import List, Dict
5
+ import cv2
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import matplotlib.patches as patches
10
+ import cv2
11
+ from utils.identify_speaker import DetectionPredictions
12
+
13
+
14
+
15
+ def load_model(path: Path,
16
+ device: str = "cpu") -> "AssocGCN":
17
+ """
18
+ Re-create the network and load the saved weights.
19
+ """
20
+ from train_speaker import AssocGCN # import lazily to avoid circular ref
21
+ ckpt = torch.load(path, map_location=device)
22
+ model = AssocGCN().to(device)
23
+ model.load_state_dict(ckpt["model_state"])
24
+ model.eval() # switch to inference mode
25
+ print(f"🔄 Loaded checkpoint from {path} (epoch {ckpt['epoch']})")
26
+ return model
27
+
28
+ def show_prediction_per_page(img_path,dets):
29
+ # %matplotlib inline
30
+ # Load your manga panel (image)
31
+ image = cv2.imread(img_path)
32
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for display
33
+
34
+ # Draw bounding boxes for body
35
+ for box in dets.bodies:
36
+ x_min, y_min, x_max, y_max = box['bbox']
37
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(255, 0, 0), thickness=2)
38
+
39
+ # Draw bounding boxes for face
40
+ for box in dets.faces:
41
+ x_min, y_min, x_max, y_max = box['bbox']
42
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)
43
+
44
+ # Draw bounding boxes for body
45
+ for box in dets.panels:
46
+ x_min, y_min, x_max, y_max = box['bbox']
47
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 0, 255), thickness=2)
48
+
49
+ # Draw bounding boxes for body
50
+ for box in dets.speech_bubbles:
51
+ x_min, y_min, x_max, y_max = box['bbox']
52
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(255, 255, 0), thickness=2)
53
+
54
+ # Display the image
55
+ plt.imshow(image)
56
+ plt.axis('off')
57
+ plt.title('Manga Panel with Bounding Boxes')
58
+ plt.show()
59
+
60
+ def show_panel_order(img_path,boxOrderEstimator):
61
+ image = cv2.imread(img_path)
62
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for display
63
+
64
+ def show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
65
+ fontsize = 12 # reduce fontsize for memory efficiency
66
+ if ax is None:
67
+ ax = plt.gca()
68
+
69
+ rect = patches.Rectangle(
70
+ (panel.xmin, panel.ymin),
71
+ panel.width,
72
+ panel.height,
73
+ linewidth=2,
74
+ linestyle=linestyle,
75
+ edgecolor=edgecolor,
76
+ facecolor="none"
77
+ )
78
+ ax.add_patch(rect)
79
+
80
+ g_sub_bb_x = panel.xmin + panel.width / 2
81
+ g_sub_bb_y = panel.ymin + panel.height / 2
82
+ ax.text(g_sub_bb_x, g_sub_bb_y, labeltext,
83
+ fontsize=fontsize, color=edgecolor,
84
+ ha='center', va='center')
85
+
86
+
87
+ fig, ax = plt.subplots(figsize=(10, 7)) # This avoids multiple axes
88
+ # Show the image
89
+ if len(np.array(image).shape) == 2:
90
+ ax.imshow(image, cmap="gray")
91
+ else:
92
+ ax.imshow(image)
93
+
94
+ # Draw bounding boxes
95
+ for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
96
+ # print(i_panel,panel)
97
+ order = i_panel + 1
98
+ if len(panel.panels) == 1:
99
+ show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
100
+ else:
101
+ for subpanel in panel.panels:
102
+ show_panel(subpanel, edgecolor="orange", linestyle="dotted", labeltext=f"({order})", ax=ax)
103
+
104
+ plt.axis('off') # cleaner output
105
+ plt.tight_layout()
106
+ plt.show()
107
+
108
+
109
+ # ============================================================================
110
+ # VISUALIZATION
111
+ # ============================================================================
112
+
113
+ def visualize_speaker_results(image_path: str, predictions: DetectionPredictions,
114
+ associations: Dict[str, Dict[int, int]], output_path: str = None):
115
+ """
116
+ Visualize detection results and speech-to-speaker associations
117
+ """
118
+ img = cv2.imread(image_path)
119
+ if img is None:
120
+ print(f"❌ Could not load image for visualization: {image_path}")
121
+ return
122
+
123
+ # Colors for different elements (BGR format for OpenCV)
124
+ colors = {
125
+ 'panel': (255, 0, 0), # Blue
126
+ 'bubble': (0, 255, 0), # Green
127
+ 'face': (0, 0, 255), # Red
128
+ 'body': (255, 255, 0), # Cyan
129
+ 'association': (255, 0, 255) # Magenta
130
+ }
131
+
132
+ # Draw panels/frames
133
+ for panel in predictions.panels:
134
+ bbox = [int(x) for x in panel['bbox']]
135
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
136
+ colors['panel'], 2)
137
+ cv2.putText(img, f"Panel {panel['id']}", (bbox[0], bbox[1]-10),
138
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, colors['panel'], 2)
139
+
140
+ # Draw speech bubbles/text
141
+ for bubble in predictions.speech_bubbles:
142
+ bbox = [int(x) for x in bubble['bbox']]
143
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
144
+ colors['bubble'], 2)
145
+ cv2.putText(img, f"T{bubble['id']}", (bbox[0], bbox[1]-5),
146
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['bubble'], 2)
147
+
148
+ # Draw faces
149
+ for face in predictions.faces:
150
+ bbox = [int(x) for x in face['bbox']]
151
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
152
+ colors['face'], 2)
153
+ cv2.putText(img, f"F{face['id']}", (bbox[0], bbox[1]-5),
154
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['face'], 2)
155
+
156
+ # Draw bodies (optional)
157
+ for body in predictions.bodies:
158
+ bbox = [int(x) for x in body['bbox']]
159
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
160
+ colors['body'], 1)
161
+ cv2.putText(img, f"B{body['id']}", (bbox[0], bbox[1]-5),
162
+ cv2.FONT_HERSHEY_SIMPLEX, 0.4, colors['body'], 1)
163
+
164
+ # Draw associations
165
+ for panel_id, mapping in associations.items():
166
+ # Extract panel number from panel_id string
167
+ try:
168
+ panel_num = int(panel_id.split('_')[-1])
169
+ except:
170
+ panel_num = 0
171
+
172
+ panel_bubbles = [b for b in predictions.speech_bubbles
173
+ if b['panel_id'] == panel_num]
174
+ panel_faces = [f for f in predictions.faces
175
+ if f['panel_id'] == panel_num]
176
+
177
+ for bubble_idx, face_idx in mapping.items():
178
+ if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
179
+ b_bbox = panel_bubbles[bubble_idx]['bbox']
180
+ f_bbox = panel_faces[face_idx]['bbox']
181
+
182
+ b_center = (int((b_bbox[0] + b_bbox[2]) / 2),
183
+ int((b_bbox[1] + b_bbox[3]) / 2))
184
+ f_center = (int((f_bbox[0] + f_bbox[2]) / 2),
185
+ int((f_bbox[1] + f_bbox[3]) / 2))
186
+
187
+ # Draw association line
188
+ cv2.line(img, b_center, f_center, colors['association'], 3)
189
+ cv2.circle(img, b_center, 5, colors['association'], -1)
190
+ cv2.circle(img, f_center, 5, colors['association'], -1)
191
+
192
+ # Add association number
193
+ mid_point = (int((b_center[0] + f_center[0]) / 2),
194
+ int((b_center[1] + f_center[1]) / 2))
195
+ cv2.putText(img, f"{bubble_idx}→{face_idx}", mid_point,
196
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['association'], 2)
197
+
198
+ # Save or display
199
+ if output_path:
200
+ cv2.imwrite(output_path, img)
201
+ print(f"📷 Visualization saved to {output_path}")
202
+ else:
203
+ # cv2.imshow("Speech-to-Speaker Associations", img)
204
+ # cv2.waitKey(0)
205
+ # cv2.destroyAllWindows()
206
+ # Display the image
207
+ plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # convert BGR to RGB so matplotlib shows correct colors
208
+ plt.axis('off')
209
+ plt.title('Speech-to-Speaker Associations')
210
+ plt.show()
211
+
212
+ def process_results(predictions,boxOrderEstimator,pairs,labels,results):
213
+ # Assign order to each panel
214
+ for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
215
+ x1 = panel.xmin
216
+ y1 = panel.ymin
217
+ x2 = panel.xmax
218
+ y2 = panel.ymax
219
+ for pred_panel in predictions.panels:
220
+ x3,y3,x4,y4 = pred_panel['bbox']
221
+ # print(x1 == x3 , y1 == y3 , x2 == x4 , y2 == y4,x1,y1,x2,y2,x3,y3,x4,y4)
222
+ if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
223
+ # print("ok")
224
+ pred_panel['order'] = i_panel
225
+
226
+ if len(panel.panels) > 1:
227
+ for subpanel in panel.panels:
228
+ x1 = subpanel.xmin
229
+ y1 = subpanel.ymin
230
+ x2 = subpanel.xmax
231
+ y2 = subpanel.ymax
232
+ for pred_panel in predictions.panels:
233
+ x3,y3,x4,y4 = pred_panel['bbox']
234
+ # print(x1 == x3 , y1 == y3 , x2 == x4 , y2 == y4,x1,y1,x2,y2,x3,y3,x4,y4)
235
+ if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
236
+ # print("ok")
237
+ pred_panel['order'] = i_panel
238
+
239
+ # Assign character id to each face
240
+ for i,(face,body) in enumerate(pairs):
241
+ # print(i,face,body)
242
+ x1,y1,x2,y2 = face
243
+ for pred_panel in predictions.faces:
244
+ x3,y3,x4,y4 = pred_panel['bbox']
245
+ # print(x1 == x3 , y1 == y3 , x2 == x4 , y2 == y4,x1,y1,x2,y2,x3,y3,x4,y4)
246
+ if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
247
+ # print("ok")
248
+ pred_panel['char_id'] = int(labels[i])
249
+
250
+ # Assign character id to each text
251
+ for panel_id, mapping in results.items():
252
+ # print(panel_id,mapping)
253
+ try:
254
+ panel_num = int(panel_id.split('_')[-1])
255
+ except:
256
+ panel_num = 0
257
+
258
+ panel_bubbles = [b for b in predictions.speech_bubbles
259
+ if b['panel_id'] == panel_num]
260
+ panel_faces = [f for f in predictions.faces
261
+ if f['panel_id'] == panel_num]
262
+
263
+ # print("aaa",panel_num,panel_bubbles,panel_faces)
264
+ for bubble_idx, face_idx in mapping.items():
265
+ if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
266
+ bubble_id = panel_bubbles[bubble_idx]['id']
267
+ face_id = panel_faces[face_idx]['id']
268
+ # print("bbb",bubble_id,face_id)
269
+
270
+ # Safely add char_id to the bubble by matching IDs
271
+ # (in case panel_bubbles and predictions.speech_bubbles are different lists)
272
+ for b in predictions.speech_bubbles:
273
+ if b['id'] == bubble_id:
274
+ for f in predictions.faces:
275
+ if f['id'] == face_id:
276
+ b['char_id'] = f.get('char_id', None)
277
+ break # Stop after finding the bubble
278
+
279
+ return predictions
280
+
281
+ def get_speakers(gender_list):
282
+ speakers = [['Alexandra_Hisakawa','f','o'],['Annmarie_Nele','f','o'],['Luis_Moray','m','y'],['Baldur_Sanjin','m','o'],['Ige_Behringer','m','o'],['Eugenio_Mataracı','m','y'],['Camilla_Holmström','f','y'],['Uta_Obando','f','o'],['Alma_María','f','o'],['Gitta_Nikolina','f','o'],['Kazuhiko_Atallah','m','y'],['Tammie_Ema','f','o'],['Barbora_MacLean','f','y'],['Rosemary_Okafor','f','o'],['Craig_Gutsy','m','o'],['Gilberto_Mathias','m','o'],['Suad_Qasim','f','o'],['Chandra_MacFarland','f','o'],['Marcos_Rudaski','m','o'],['Brenda_Stern','f','y'],['Ana_Florence','f','y'],['Royston_Min','m','y'],['Tammy_Grit','f','y'],['Tanja_Adelina','f','y'],['Kumar_Dahl','m','o'],['Abrahan_Mack','m','o'],['Daisy_Studious','f','o'],['Alison_Dietlinde','f','o'],['Ilkin_Urbano','m','o'],['Dionisio_Schuyler','m','o'],['Narelle_Moon','f','o'],['Asya_Anara','f','o'],['Viktor_Eka','m','o'],['Vjollca_Johnnie','f','o'],['Adde_Michal','m','y'],['Viktor_Menelaos','m','y']]
283
+ new_speakers = {"Male":[],"Female":[]}
284
+ for speaker in speakers:
285
+ # print(speaker)
286
+ if speaker[1] == 'm' and speaker[2] == 'y':
287
+ new_speakers['Male'].append(speaker[0])
288
+ elif speaker[1] == 'f' and speaker[2] == 'y':
289
+ new_speakers['Female'].append(speaker[0])
290
+ for speaker in speakers:
291
+ # print(speaker)
292
+ if speaker[1] == 'm' and speaker[2] == 'o':
293
+ new_speakers['Male'].append(speaker[0])
294
+ elif speaker[1] == 'f' and speaker[2] == 'o':
295
+ new_speakers['Female'].append(speaker[0])
296
+
297
+ # print(new_speakers['Male'])
298
+ # print(new_speakers['Female'])
299
+ # return new_speakers
300
+ speaker_list = []
301
+
302
+ for gender in gender_list:
303
+ speaker_list.append(new_speakers[gender[0]].pop(0))
304
+ speaker_list.append(new_speakers['Male'].pop(0))
305
+
306
+ return speaker_list
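A small worked example of get_speakers above: each gender_list entry is expected to look like check_gender's output (a one-element list such as ['Female']), and for every character an extra male voice is also reserved, so two names come back per character. The genders here are invented.

from utils.utilities import get_speakers

gender_list = [["Female"], ["Male"]]       # hypothetical genders for char 0 and char 1
voices = get_speakers(gender_list)
print(voices)                              # four names drawn from the hard-coded speaker table above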
utils/visual_llm_gender.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+
4
+ def load_llm():
5
+
6
+ # default: Load the model on the available device(s)
7
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
8
+ "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
9
+ )
10
+ min_pixels = 256*28*28
11
+ max_pixels = 1280*28*28
12
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
13
+ return model,processor
14
+
15
+ import cv2
16
+ import base64
17
+ from PIL import Image
18
+ from io import BytesIO
19
+
20
+ def check_gender(model,processor,img_list,test_image):
21
+ content = [
22
+ {"type": "text", "text": "Both of the images belong to same person. Identify the gender and respond only Male or Female"},
23
+ ]
24
+ # Load image and crop face
25
+ img = cv2.imread(test_image)
26
+ for image in img_list[:3]:
27
+ x1, y1, x2, y2 = image['bbox']
28
+ face_crop = img[y1:y2, x1:x2]
29
+ # Encode to JPEG format in memory
30
+ _, buffer = cv2.imencode('.jpg', face_crop)
31
+ # Convert to Base64
32
+ face_base64 = base64.b64encode(buffer).decode('utf-8')
33
+ image_bytes = base64.b64decode(face_base64)
34
+ image_64 = Image.open(BytesIO(image_bytes)).convert("RGB")
35
+ content.append({"type": "image", "image": image_64})
36
+ messages = [
37
+ {
38
+ "role": "user",
39
+ "content": content
40
+ }
41
+ ]
42
+
43
+ # Preparation for inference
44
+ text = processor.apply_chat_template(
45
+ messages, tokenize=False, add_generation_prompt=True
46
+ )
47
+ image_inputs, video_inputs = process_vision_info(messages)
48
+ inputs = processor(
49
+ text=[text],
50
+ images=image_inputs,
51
+ videos=video_inputs,
52
+ padding=True,
53
+ return_tensors="pt",
54
+ )
55
+ inputs = inputs.to("cuda")
56
+
57
+ # Inference
58
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
59
+ generated_ids_trimmed = [
60
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
61
+ ]
62
+ output_text = processor.batch_decode(
63
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
64
+ )
65
+ print(output_text)
66
+ return output_text
67
+
68
+ def visual_llm_gender(unique_chars,predictions,test_image):
69
+ model,processor = load_llm()
70
+ # for panel in predictions.faces:
71
+ # print(panel)
72
+ gender_list = []
73
+ for i in range(unique_chars):
74
+ print(i)
75
+ new_panels = [p for p in predictions.faces if p['char_id'] == i]
76
+ print(new_panels)
77
+ gender = check_gender(model,processor,new_panels,test_image)
78
+ gender_list.append(gender)
79
+ return gender_list
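A minimal sketch of calling visual_llm_gender once the detector has tagged faces with char_id; the boxes and page path are invented, and the Qwen2.5-VL model above expects a CUDA device.

from types import SimpleNamespace
from utils.visual_llm_gender import visual_llm_gender

predictions = SimpleNamespace(faces=[
    {"id": 0, "char_id": 0, "bbox": [100, 80, 160, 150]},   # two crops of character 0
    {"id": 1, "char_id": 0, "bbox": [400, 90, 460, 160]},
    {"id": 2, "char_id": 1, "bbox": [250, 300, 310, 370]},  # one crop of character 1
])

genders = visual_llm_gender(2, predictions, "page.jpg")      # e.g. [['Female'], ['Male']], one entry per character id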
utils/xml_to_json.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Extract speaker info and add to parsed-XML json object
2
+ import xml.etree.ElementTree as ET
3
+ import manga109api
4
+ import os
5
+ import json
6
+ def xml_to_json(config):
7
+ manga109_root_dir = config["manga109_root_dir"]
8
+
9
+ def create_json(book):
10
+ def parse_xml(xml_path):
11
+ tree = ET.parse(xml_path)
12
+ return tree.getroot()
13
+
14
+ root_speaker = parse_xml(manga109_root_dir+"annotations_Manga109Dialog/"+book+".xml")
15
+
16
+ print(len(root_speaker.find('pages')))
17
+ fullPage = []
18
+ for page in root_speaker.find('pages'):
19
+ index = page.attrib.get('index')
20
+ width = page.attrib.get('width')
21
+ height = page.attrib.get('height')
22
+ curpage = {}
23
+ for speaker in page.findall('speaker_to_text'):
24
+ id = speaker.attrib.get('id')
25
+ text_id = speaker.attrib.get('text_id')
26
+ speaker_id = speaker.attrib.get('speaker_id')
27
+ curpage[text_id] = speaker_id
28
+ fullPage.append(curpage)
29
+
30
+ p = manga109api.Parser(root_dir=manga109_root_dir)
31
+
32
+ annotation = p.get_annotation(book=book)
33
+
34
+ for page in annotation['page']:
35
+ index = page['@index']
36
+ for obj in page['text']:
37
+ if obj['@id'] in fullPage[index]:
38
+ obj['@bodyid'] = fullPage[index][obj['@id']]
39
+
40
+ with open(config["root"] + "json/"+book+".json","w",encoding='utf-8') as f:
41
+ json.dump(annotation,f,ensure_ascii=False, indent=2)
42
+
43
+ for book in os.listdir(os.path.join(manga109_root_dir,"images")):
44
+ print(book)
45
+ create_json(book)
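The config dict threaded through these helpers carries at least the three keys read by xml_to_json and json_to_gcn; a minimal sketch with placeholder paths (trailing slashes matter because the code concatenates strings directly):

from utils.xml_to_json import xml_to_json

config = {
    "root": "./data/",                    # working directory; json/, panel_data/ and panels/ live under here
    "manga109_root_dir": "./Manga109/",   # must contain images/ and annotations_Manga109Dialog/
    "root_json": "./data/json/",          # per-book JSON written by xml_to_json, read later by json_to_gcn
}

xml_to_json(config)   # writes <root>json/<Book>.json for every book under <manga109_root_dir>images/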
yolo11m/runs/detect/train/args.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detect
2
+ mode: train
3
+ model: yolo11m.pt
4
+ data: ./data.yaml
5
+ epochs: 100
6
+ time: null
7
+ patience: 100
8
+ batch: 16
9
+ imgsz: 640
10
+ save: true
11
+ save_period: -1
12
+ cache: false
13
+ device: '0'
14
+ workers: 8
15
+ project: null
16
+ name: train
17
+ exist_ok: false
18
+ pretrained: true
19
+ optimizer: auto
20
+ verbose: true
21
+ seed: 0
22
+ deterministic: true
23
+ single_cls: false
24
+ rect: false
25
+ cos_lr: false
26
+ close_mosaic: 10
27
+ resume: false
28
+ amp: true
29
+ fraction: 1.0
30
+ profile: false
31
+ freeze: null
32
+ multi_scale: false
33
+ overlap_mask: true
34
+ mask_ratio: 4
35
+ dropout: 0.0
36
+ val: true
37
+ split: val
38
+ save_json: false
39
+ conf: null
40
+ iou: 0.7
41
+ max_det: 300
42
+ half: false
43
+ dnn: false
44
+ plots: true
45
+ source: null
46
+ vid_stride: 1
47
+ stream_buffer: false
48
+ visualize: false
49
+ augment: false
50
+ agnostic_nms: false
51
+ classes: null
52
+ retina_masks: false
53
+ embed: null
54
+ show: false
55
+ save_frames: false
56
+ save_txt: false
57
+ save_conf: false
58
+ save_crop: false
59
+ show_labels: true
60
+ show_conf: true
61
+ show_boxes: true
62
+ line_width: null
63
+ format: torchscript
64
+ keras: false
65
+ optimize: false
66
+ int8: false
67
+ dynamic: false
68
+ simplify: true
69
+ opset: null
70
+ workspace: null
71
+ nms: false
72
+ lr0: 0.01
73
+ lrf: 0.01
74
+ momentum: 0.937
75
+ weight_decay: 0.0005
76
+ warmup_epochs: 3.0
77
+ warmup_momentum: 0.8
78
+ warmup_bias_lr: 0.1
79
+ box: 7.5
80
+ cls: 0.5
81
+ dfl: 1.5
82
+ pose: 12.0
83
+ kobj: 1.0
84
+ nbs: 64
85
+ hsv_h: 0.015
86
+ hsv_s: 0.7
87
+ hsv_v: 0.4
88
+ degrees: 0.0
89
+ translate: 0.1
90
+ scale: 0.5
91
+ shear: 0.0
92
+ perspective: 0.0
93
+ flipud: 0.0
94
+ fliplr: 0.5
95
+ bgr: 0.0
96
+ mosaic: 1.0
97
+ mixup: 0.0
98
+ cutmix: 0.0
99
+ copy_paste: 0.0
100
+ copy_paste_mode: flip
101
+ auto_augment: randaugment
102
+ erasing: 0.4
103
+ cfg: null
104
+ tracker: botsort.yaml
105
+ save_dir: runs/detect/train
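
This args.yaml is the argument dump Ultralytics writes for a training run. A sketch of the kind of launch that would produce a runs/detect/train directory like this, keeping only the non-default values recorded above (everything else is left to the library's defaults):

# Sketch of the training launch behind the args.yaml above; assumes the
# ultralytics package and a data.yaml describing the manga detection classes.
from ultralytics import YOLO

model = YOLO("yolo11m.pt")      # pretrained YOLO11-medium checkpoint
model.train(
    data="./data.yaml",         # dataset definition referenced in args.yaml
    epochs=100,
    imgsz=640,
    batch=16,
    device=0,                   # matches device: '0'
)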
yolo11m/runs/detect/train2/args.yaml ADDED
@@ -0,0 +1,105 @@
+ task: detect
+ mode: train
+ model: yolo11m.pt
+ data: ./data.yaml
+ epochs: 30
+ time: null
+ patience: 100
+ batch: 16
+ imgsz: 640
+ save: true
+ save_period: -1
+ cache: false
+ device: '0'
+ workers: 8
+ project: null
+ name: train2
+ exist_ok: false
+ pretrained: true
+ optimizer: auto
+ verbose: true
+ seed: 0
+ deterministic: true
+ single_cls: false
+ rect: false
+ cos_lr: false
+ close_mosaic: 10
+ resume: false
+ amp: true
+ fraction: 1.0
+ profile: false
+ freeze: null
+ multi_scale: false
+ overlap_mask: true
+ mask_ratio: 4
+ dropout: 0.0
+ val: true
+ split: val
+ save_json: false
+ conf: null
+ iou: 0.7
+ max_det: 300
+ half: false
+ dnn: false
+ plots: true
+ source: null
+ vid_stride: 1
+ stream_buffer: false
+ visualize: false
+ augment: false
+ agnostic_nms: false
+ classes: null
+ retina_masks: false
+ embed: null
+ show: false
+ save_frames: false
+ save_txt: false
+ save_conf: false
+ save_crop: false
+ show_labels: true
+ show_conf: true
+ show_boxes: true
+ line_width: null
+ format: torchscript
+ keras: false
+ optimize: false
+ int8: false
+ dynamic: false
+ simplify: true
+ opset: null
+ workspace: null
+ nms: false
+ lr0: 0.01
+ lrf: 0.01
+ momentum: 0.937
+ weight_decay: 0.0005
+ warmup_epochs: 3.0
+ warmup_momentum: 0.8
+ warmup_bias_lr: 0.1
+ box: 7.5
+ cls: 0.5
+ dfl: 1.5
+ pose: 12.0
+ kobj: 1.0
+ nbs: 64
+ hsv_h: 0.015
+ hsv_s: 0.7
+ hsv_v: 0.4
+ degrees: 0.0
+ translate: 0.1
+ scale: 0.5
+ shear: 0.0
+ perspective: 0.0
+ flipud: 0.0
+ fliplr: 0.5
+ bgr: 0.0
+ mosaic: 1.0
+ mixup: 0.0
+ cutmix: 0.0
+ copy_paste: 0.0
+ copy_paste_mode: flip
+ auto_augment: randaugment
+ erasing: 0.4
+ cfg: null
+ tracker: botsort.yaml
+ save_dir: runs/detect/train2
yolo11m/runs/detect/train2/results.csv ADDED
@@ -0,0 +1,31 @@
+ epoch,time,train/box_loss,train/cls_loss,train/dfl_loss,metrics/precision(B),metrics/recall(B),metrics/mAP50(B),metrics/mAP50-95(B),val/box_loss,val/cls_loss,val/dfl_loss,lr/pg0,lr/pg1,lr/pg2
+ 1,364.701,0.99528,0.88434,1.10876,0.84374,0.81339,0.86346,0.61416,0.9399,0.67931,1.06036,0.000415851,0.000415851,0.000415851
+ 2,717.67,0.93907,0.68799,1.07341,0.8555,0.81756,0.8734,0.6256,0.91886,0.66057,1.05007,0.000805045,0.000805045,0.000805045
+ 3,1070.28,0.9246,0.66399,1.06682,0.82583,0.78783,0.84902,0.58408,1.00997,0.71844,1.10452,0.00116674,0.00116674,0.00116674
+ 4,1415.94,0.89772,0.63638,1.05457,0.87076,0.83342,0.88502,0.6424,0.88072,0.59905,1.0277,0.00112625,0.00112625,0.00112625
+ 5,1761.7,0.87716,0.61356,1.04405,0.88033,0.83517,0.89177,0.6547,0.87207,0.57414,1.0236,0.001085,0.001085,0.001085
+ 6,2110.53,0.86309,0.59526,1.03721,0.88965,0.8458,0.90099,0.66986,0.83239,0.53844,1.01,0.00104375,0.00104375,0.00104375
+ 7,2458.47,0.84727,0.5812,1.02879,0.88126,0.84073,0.89735,0.65739,0.86915,0.56387,1.01918,0.0010025,0.0010025,0.0010025
+ 8,2806.22,0.84261,0.57434,1.02717,0.89677,0.84979,0.90884,0.67838,0.81902,0.52912,0.99507,0.00096125,0.00096125,0.00096125
+ 9,3154.13,0.83402,0.56153,1.02006,0.89845,0.85878,0.91456,0.68555,0.81409,0.51218,0.98894,0.00092,0.00092,0.00092
+ 10,3505.97,0.8254,0.55079,1.01678,0.89708,0.86145,0.91757,0.68555,0.83014,0.51276,0.9874,0.00087875,0.00087875,0.00087875
+ 11,3850.26,0.81667,0.54212,1.01165,0.89666,0.87261,0.92286,0.69549,0.79841,0.49455,0.97811,0.0008375,0.0008375,0.0008375
+ 12,4198.48,0.8088,0.53515,1.0076,0.90241,0.86783,0.92325,0.69524,0.80291,0.49307,0.98062,0.00079625,0.00079625,0.00079625
+ 13,4544.49,0.80596,0.53055,1.0057,0.89774,0.86735,0.92057,0.69194,0.80916,0.50261,0.97499,0.000755,0.000755,0.000755
+ 14,4890.59,0.79576,0.52245,1.00188,0.90424,0.86295,0.92337,0.69801,0.79489,0.49061,0.97536,0.00071375,0.00071375,0.00071375
+ 15,5238.24,0.79264,0.51495,0.99893,0.90668,0.87894,0.92957,0.70712,0.78733,0.47507,0.96654,0.0006725,0.0006725,0.0006725
+ 16,5583.58,0.78354,0.50664,0.99404,0.90396,0.87658,0.9299,0.70605,0.78008,0.47593,0.96723,0.00063125,0.00063125,0.00063125
+ 17,5929.19,0.78022,0.50345,0.99207,0.90272,0.87945,0.93169,0.71304,0.76638,0.46915,0.96346,0.00059,0.00059,0.00059
+ 18,6280.28,0.77585,0.4996,0.98929,0.9089,0.87976,0.93364,0.71495,0.76637,0.46142,0.95967,0.00054875,0.00054875,0.00054875
+ 19,6631.13,0.77146,0.49338,0.98748,0.90486,0.8868,0.9333,0.71573,0.76155,0.45881,0.96285,0.0005075,0.0005075,0.0005075
+ 20,6977.4,0.76437,0.48726,0.98545,0.91121,0.87741,0.93274,0.71797,0.75895,0.45138,0.95755,0.00046625,0.00046625,0.00046625
+ 21,7309.74,0.76783,0.47192,0.98514,0.90617,0.88781,0.93546,0.72133,0.75577,0.45381,0.95819,0.000425,0.000425,0.000425
+ 22,7644.28,0.75947,0.46115,0.97943,0.90908,0.88789,0.9366,0.72428,0.74821,0.44937,0.95353,0.00038375,0.00038375,0.00038375
+ 23,7976.83,0.75231,0.45512,0.97455,0.90643,0.89093,0.93685,0.72362,0.74938,0.4485,0.95206,0.0003425,0.0003425,0.0003425
+ 24,8308.26,0.74468,0.44702,0.97117,0.90852,0.88729,0.93576,0.72268,0.75086,0.44312,0.95287,0.00030125,0.00030125,0.00030125
+ 25,8638.72,0.73652,0.44072,0.96737,0.91305,0.89204,0.94021,0.73021,0.7399,0.43814,0.94657,0.00026,0.00026,0.00026
+ 26,8970.02,0.73089,0.43389,0.96468,0.91071,0.89248,0.93932,0.73064,0.7382,0.43585,0.94573,0.00021875,0.00021875,0.00021875
+ 27,9302.82,0.72381,0.42706,0.96115,0.91403,0.89302,0.94056,0.73294,0.73565,0.42938,0.94517,0.0001775,0.0001775,0.0001775
+ 28,9635.86,0.71664,0.4207,0.95751,0.91457,0.8922,0.94121,0.73333,0.73483,0.42626,0.94493,0.00013625,0.00013625,0.00013625
+ 29,9971.96,0.71077,0.41469,0.95334,0.91221,0.89692,0.94196,0.7358,0.72995,0.42248,0.94099,9.5e-05,9.5e-05,9.5e-05
+ 30,10303.7,0.70721,0.4106,0.95196,0.91199,0.8969,0.94201,0.7353,0.73065,0.42129,0.94193,5.375e-05,5.375e-05,5.375e-05
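
The log shows steady convergence, with validation mAP50-95 climbing from about 0.61 at epoch 1 to about 0.74 by epoch 30. A small sketch for inspecting the log offline; it assumes pandas is installed, and the column names come straight from the CSV header above:

# Quick look at the training log for run train2.
import pandas as pd

df = pd.read_csv("yolo11m/runs/detect/train2/results.csv")
df.columns = df.columns.str.strip()   # defensive: some Ultralytics versions pad headers with spaces

best = df.loc[df["metrics/mAP50-95(B)"].idxmax()]
print(f"best epoch: {int(best['epoch'])}, "
      f"mAP50: {best['metrics/mAP50(B)']:.3f}, "
      f"mAP50-95: {best['metrics/mAP50-95(B)']:.3f}")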
yolo11m/runs/detect/train2/weights/best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:472dcc9fcf237d5d9ede9afa30eb7da4ffbad514d5a1bca537918bcd2aa7120a
+ size 40502700
yolo11m/runs/detect/train2/weights/last.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:755404e647007f3dfb3dac583bea81b176114b2c6c706ae43a177e968a29e98e
+ size 40502700
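
Both weight files are committed as Git LFS pointers; the actual ~40 MB checkpoints are fetched by LFS on clone. A minimal sketch of loading the best checkpoint for inference on a manga page (the image path is a placeholder, and the 0.25 confidence threshold is an assumption, not a project setting):

# Hypothetical inference with the trained detector.
from ultralytics import YOLO

model = YOLO("yolo11m/runs/detect/train2/weights/best.pt")
results = model.predict("page.jpg", imgsz=640, conf=0.25)

# Print each detected class name with its bounding box coordinates
for box in results[0].boxes:
    cls_name = model.names[int(box.cls)]
    print(cls_name, box.xyxy.tolist())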