janmayjay committed
Commit 39a7537 · 1 Parent(s): faad144

Add application file
Files changed (45)
  1. .gitignore +4 -0
  2. app.py +47 -0
  3. checkpoints/assoc_gcn.pt +3 -0
  4. config.json +11 -0
  5. models/model.pth +3 -0
  6. page_transcription.py +100 -0
  7. preprocess.py +21 -0
  8. requirements.txt +140 -0
  9. train.py +15 -0
  10. utils/__pycache__/character_to_character.cpython-311.pyc +0 -0
  11. utils/__pycache__/coco_to_yolo.cpython-311.pyc +0 -0
  12. utils/__pycache__/create_audiobook.cpython-311.pyc +0 -0
  13. utils/__pycache__/create_directories.cpython-311.pyc +0 -0
  14. utils/__pycache__/get_yolo_predictions.cpython-311.pyc +0 -0
  15. utils/__pycache__/identify_speaker.cpython-311.pyc +0 -0
  16. utils/__pycache__/japanese_ocr.cpython-311.pyc +0 -0
  17. utils/__pycache__/json_to_coco.cpython-311.pyc +0 -0
  18. utils/__pycache__/json_to_gcn.cpython-311.pyc +0 -0
  19. utils/__pycache__/panel_ordering.cpython-311.pyc +0 -0
  20. utils/__pycache__/train_speaker.cpython-311.pyc +0 -0
  21. utils/__pycache__/transcript_page.cpython-311.pyc +0 -0
  22. utils/__pycache__/utilities.cpython-311.pyc +0 -0
  23. utils/__pycache__/visual_llm_gender.cpython-311.pyc +0 -0
  24. utils/__pycache__/xml_to_json.cpython-311.pyc +0 -0
  25. utils/character_to_character.py +192 -0
  26. utils/coco_to_yolo.py +52 -0
  27. utils/create_audiobook.py +55 -0
  28. utils/create_directories.py +13 -0
  29. utils/directories.txt +13 -0
  30. utils/get_yolo_predictions.py +11 -0
  31. utils/identify_speaker.py +443 -0
  32. utils/japanese_ocr.py +52 -0
  33. utils/json_to_coco.py +131 -0
  34. utils/json_to_gcn.py +484 -0
  35. utils/panel_ordering.py +258 -0
  36. utils/train_speaker.py +385 -0
  37. utils/transcript_page.py +37 -0
  38. utils/utilities.py +306 -0
  39. utils/visual_llm_gender.py +79 -0
  40. utils/xml_to_json.py +45 -0
  41. yolo11m/runs/detect/train/args.yaml +105 -0
  42. yolo11m/runs/detect/train2/args.yaml +105 -0
  43. yolo11m/runs/detect/train2/results.csv +31 -0
  44. yolo11m/runs/detect/train2/weights/best.pt +3 -0
  45. yolo11m/runs/detect/train2/weights/last.pt +3 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ **/*.jpg
+ **/*.png
+ **/*.jpg
+ **/*.png
app.py ADDED
@@ -0,0 +1,47 @@
+ import gradio as gr
+ import os
+ import shutil
+ import uuid
+ import subprocess
+
+ # Function to handle image upload and transcription
+ def process_image(image):
+     # Save uploaded image to temp folder
+     temp_folder = "./temp_uploads"
+     os.makedirs(temp_folder, exist_ok=True)
+
+     # Generate unique filename
+     image_filename = f"{uuid.uuid4().hex}.jpg"
+     image_path = os.path.join(temp_folder, image_filename)
+
+     # Save image
+     image.save(image_path)
+
+     # Run the transcription script with subprocess
+     try:
+         subprocess.run(["python", "page_transcription.py", f"-img={image_path}"], check=True)
+     except subprocess.CalledProcessError as e:
+         return f"Error during transcription: {e}", None
+
+     # Check if output.wav was generated
+     output_audio_path = "output.wav"
+     if os.path.exists(output_audio_path):
+         return "Transcription complete. Playing audio...", output_audio_path
+     else:
+         return "Failed to generate audio file.", None
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type="pil", label="Upload a Manga Page"),
+     outputs=[
+         gr.Textbox(label="Status"),
+         gr.Audio(label="Generated Audio", type="filepath")
+     ],
+     title="Manga Page Audio Transcription",
+     description="Upload a manga page image, and this tool will transcribe it and play the audio using a backend Python script."
+ )
+
+ # Launch app
+ if __name__ == "__main__":
+     iface.launch()
checkpoints/assoc_gcn.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c09dc23e06af2a1f373cb03748a92f134dcbeab2829543f928a5247e51839fe
+ size 810267
config.json ADDED
@@ -0,0 +1,11 @@
+ {
+     "root" : "./dataset/",
+     "dir_path" : "./utils/directories.txt",
+     "manga109_root_dir" : "./dataset/Manga109/",
+     "root_images" : "./dataset/images/",
+     "root_annotations" : "./dataset/annotations/",
+     "root_labels" : "./dataset/labels/",
+     "root_json" : "./dataset/json/",
+     "yolo_path" : "./yolo11m/runs/detect/train2/weights/best.pt",
+     "speaker_association_path" : "./checkpoints/assoc_gcn.pt"
+ }
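The paths above are read directly by preprocess.py and page_transcription.py below. A minimal sketch of how they are consumed, assuming the scripts are run from the repository root:

import json

with open("config.json") as f:
    config = json.load(f)

yolo_weights = config["yolo_path"]               # detector checkpoint
assoc_ckpt = config["speaker_association_path"]  # speech-to-speaker GCN checkpoint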
models/model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7b8228bcec85fb40eaf2781a63816ccc1e8d6cc43e90a0cd257406e3abc85fc
+ size 102802172
page_transcription.py ADDED
@@ -0,0 +1,100 @@
+ import argparse
+ import json
+ import torch
+ from ultralytics import YOLO
+ from utils.identify_speaker import get_predictions_from_yolo
+ from utils.panel_ordering import panel_ordering
+ from utils.utilities import show_prediction_per_page
+ from utils.utilities import show_panel_order
+ from utils.character_to_character import count_characters_single_page
+ from utils.identify_speaker import identify_speaker, load_trained_speaker_model
+ from utils.utilities import visualize_speaker_results
+ from utils.utilities import process_results
+ from transformers import pipeline
+ from utils.transcript_page import transcript_page
+ from utils.visual_llm_gender import visual_llm_gender
+ from utils.utilities import get_speakers
+ from scipy.io.wavfile import write
+ from utils.create_audiobook import create_audiobook
+ import sounddevice as sd
+ import soundfile as sf
+
+ # def main():
+ #     parser = argparse.ArgumentParser(description="Script to transcribe one single image")
+ #     # parser.add_argument("filename", help="Path to the input file")
+ #     parser.add_argument("-img", "--test_image", type=str, default="./dataset/images/test/AkkeraKanjinchou_083.jpg", help="Test image on which transcription needs to be done")
+ #     parser.add_argument("-vis", "--visualize", type=bool, default=False, help="Visualize predictions")
+ #
+ #     args = parser.parse_args()
+ #     print(f"Test image at : {args.test_image}")
+ #     print(f"Test image at : {args.visualize}")
+ def main():
+     parser = argparse.ArgumentParser(description="Script to transcribe one single image")
+     parser.add_argument(
+         "-img", "--test_image",
+         type=str,
+         default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
+         help="Test image on which transcription needs to be done"
+     )
+     parser.add_argument(
+         "-vis", "--visualize",
+         action="store_true",
+         help="Pass the flag alone to visualize predictions"
+     )
+
+     args = parser.parse_args()
+     print(f"Test image at : {args.test_image}")
+     print(f"Visualize? : {args.visualize}")
+
+     config = {}
+     with open('config.json') as f:
+         config = json.load(f)
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     yolo_model = YOLO(config["yolo_path"])
+     test_image = args.test_image
+
+     predictions = get_predictions_from_yolo(test_image, yolo_model)
+
+     boxOrderEstimator = panel_ordering(test_image, predictions)
+
+     unique_chars, labels, pairs = count_characters_single_page(test_image, predictions, eps=0.3, min_samples=1, device='cuda')
+
+     assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
+     results, predictions = identify_speaker(
+         assoc_model, test_image, yolo_model, 'cuda'
+     )
+     processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)
+
+     # print("Panels predicted present in predictions.panels")
+     # for panel in predictions.panels:
+     #     print(panel)
+     # print("Speech bubbles predicted present in predictions.speech_bubbles")
+     # for panel in predictions.speech_bubbles:
+     #     print(panel)
+     # print("Faces predicted present in predictions.faces")
+     # for panel in predictions.faces:
+     #     print(panel)
+     # print("Bodies predicted present in predictions.bodies")
+     # for panel in predictions.bodies:
+     #     print(panel)
+     translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
+     japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)
+     gender_list = visual_llm_gender(unique_chars, predictions, test_image)
+     speaker_list = get_speakers(gender_list)
+     combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
+     write("output.wav", sample_rate, combined)
+
+     data, fs = sf.read('output.wav')
+     sd.play(data, fs)
+     sd.wait()
+
+     if args.visualize:
+         show_prediction_per_page(test_image, processed_predictions)
+         show_panel_order(test_image, boxOrderEstimator)
+         visualize_speaker_results(test_image, processed_predictions, results)
+
+
+ if __name__ == "__main__":
+     main()
+
+
preprocess.py ADDED
@@ -0,0 +1,21 @@
+ import json
+ from utils.create_directories import create_directories
+ from utils.xml_to_json import xml_to_json
+ from utils.json_to_coco import json_to_coco
+ from utils.coco_to_yolo import coco_to_yolo
+ from utils.json_to_gcn import json_to_gcn
+
+ def preprocess():
+
+     config = {}
+     with open('config.json') as f:
+         config = json.load(f)
+     create_directories(config)
+
+     xml_to_json(config)
+
+     json_to_coco(config)
+
+     coco_to_yolo(config)
+
+     json_to_gcn(config)
requirements.txt ADDED
@@ -0,0 +1,140 @@
+ absl-py==2.3.1
+ accelerate==1.9.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.14
+ aiosignal==1.4.0
+ anyascii==0.3.3
+ asttokens==3.0.0
+ attrs==25.3.0
+ audioread==3.0.1
+ av==15.0.0
+ babel==2.17.0
+ certifi==2025.7.14
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ colorama==0.4.6
+ comm==0.2.3
+ contourpy==1.3.3
+ coqpit-config==0.2.1
+ coqui-tts==0.27.0
+ coqui-tts-trainer==0.3.1
+ cycler==0.12.1
+ Cython==3.1.2
+ dateparser==1.1.8
+ debugpy==1.8.15
+ decorator==5.2.1
+ decord==0.6.0
+ docopt==0.6.2
+ einops==0.8.1
+ encodec==0.1.1
+ executing==2.2.0
+ filelock==3.13.1
+ fire==0.7.0
+ fonttools==4.59.0
+ frozenlist==1.7.0
+ fsspec==2024.6.1
+ fugashi==1.5.1
+ grpcio==1.74.0
+ gruut==2.4.0
+ gruut-ipa==0.13.0
+ gruut_lang_de==2.0.1
+ gruut_lang_en==2.0.1
+ gruut_lang_es==2.0.1
+ gruut_lang_fr==2.0.2
+ hf-xet==1.1.5
+ huggingface-hub==0.34.3
+ idna==3.10
+ inflect==7.5.0
+ ipykernel==6.30.0
+ ipython==9.4.0
+ ipython_pygments_lexers==1.1.1
+ jaconv==0.4.0
+ jedi==0.19.2
+ Jinja2==3.1.4
+ joblib==1.5.1
+ jsonlines==1.2.0
+ jupyter_client==8.6.3
+ jupyter_core==5.8.1
+ kiwisolver==1.4.8
+ lazy_loader==0.4
+ librosa==0.11.0
+ llvmlite==0.44.0
+ loguru==0.7.3
+ manga-ocr==0.1.14
+ manga109api==0.3.1
+ Markdown==3.8.2
+ MarkupSafe==2.1.5
+ matplotlib==3.10.3
+ matplotlib-inline==0.1.7
+ monotonic-alignment-search==0.2.0
+ more-itertools==10.7.0
+ mpmath==1.3.0
+ msgpack==1.1.1
+ multidict==6.6.3
+ nest-asyncio==1.6.0
+ networkx==3.3
+ num2words==0.5.14
+ numba==0.61.2
+ numpy==2.1.2
+ opencv-python==4.12.0.88
+ packaging==25.0
+ pandas==2.3.1
+ parso==0.8.4
+ pillow==11.0.0
+ platformdirs==4.3.8
+ pooch==1.8.2
+ prompt_toolkit==3.0.51
+ propcache==0.3.2
+ protobuf==6.31.1
+ psutil==7.0.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pycparser==2.22
+ Pygments==2.19.2
+ pyparsing==3.2.3
+ pyperclip==1.9.0
+ pysbd==0.3.4
+ python-crfsuite==0.9.11
+ python-dateutil==2.9.0.post0
+ pytz==2025.2
+ pywin32==311
+ PyYAML==6.0.2
+ pyzmq==27.0.0
+ qwen-vl-utils==0.0.8
+ regex==2025.7.34
+ requests==2.32.4
+ safetensors==0.5.3
+ scikit-learn==1.7.1
+ scipy==1.16.0
+ sentencepiece==0.2.0
+ six==1.17.0
+ sounddevice==0.5.2
+ soundfile==0.13.1
+ soxr==0.5.0.post1
+ stack-data==0.6.3
+ sympy==1.13.3
+ tensorboard==2.20.0
+ tensorboard-data-server==0.7.2
+ termcolor==3.1.0
+ threadpoolctl==3.6.0
+ tokenizers==0.21.4
+ torch==2.7.1+cu118
+ torch-geometric==2.6.1
+ torchaudio==2.7.1+cu118
+ torchvision==0.22.1+cu118
+ tornado==6.5.1
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers @ git+https://github.com/huggingface/transformers@7dca2ff8cfd6102430acd3af3fdc3831b9779884
+ typeguard==4.4.4
+ typing_extensions==4.14.1
+ tzdata==2025.2
+ tzlocal==5.3.1
+ ultralytics==8.3.170
+ ultralytics-thop==2.0.14
+ unidic-lite==1.0.8
+ urllib3==2.5.0
+ wcwidth==0.2.13
+ Werkzeug==3.1.3
+ win32_setctime==1.2.0
+ yarl==1.20.1
train.py ADDED
@@ -0,0 +1,15 @@
+ import torch
+ torch.cuda.is_available()
+ from utils.train_speaker import train_speaker
+ from ultralytics import YOLO
+
+ def train(config):
+
+     model = YOLO("yolo11m.pt")
+
+     model.info()
+
+     model.train(data='data.yaml', epochs=10, device='cuda', imgsz=640, batch=16)
+
+
+     train_speaker(config)
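The data.yaml referenced in model.train(...) above is not included in this commit. A plausible sketch of its contents, written here with PyYAML (class ids inferred from CLASSES in utils/get_yolo_predictions.py and the split folders in utils/directories.txt; the actual file may differ):

import yaml  # PyYAML is pinned in requirements.txt

data = {
    "path": "./dataset",
    "train": "images/train",
    "val": "images/val",
    "test": "images/test",
    "names": {0: "body", 1: "face", 2: "frame", 3: "text"},
}
with open("data.yaml", "w") as f:
    yaml.safe_dump(data, f)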
utils/__pycache__/character_to_character.cpython-311.pyc ADDED
Binary file (9.75 kB).
utils/__pycache__/coco_to_yolo.cpython-311.pyc ADDED
Binary file (3.96 kB).
utils/__pycache__/create_audiobook.cpython-311.pyc ADDED
Binary file (2.58 kB).
utils/__pycache__/create_directories.cpython-311.pyc ADDED
Binary file (1.12 kB).
utils/__pycache__/get_yolo_predictions.cpython-311.pyc ADDED
Binary file (1.03 kB).
utils/__pycache__/identify_speaker.cpython-311.pyc ADDED
Binary file (16.4 kB).
utils/__pycache__/japanese_ocr.cpython-311.pyc ADDED
Binary file (4.18 kB).
utils/__pycache__/json_to_coco.cpython-311.pyc ADDED
Binary file (8.37 kB).
utils/__pycache__/json_to_gcn.cpython-311.pyc ADDED
Binary file (23.5 kB).
utils/__pycache__/panel_ordering.cpython-311.pyc ADDED
Binary file (14.2 kB).
utils/__pycache__/train_speaker.cpython-311.pyc ADDED
Binary file (21.7 kB).
utils/__pycache__/transcript_page.cpython-311.pyc ADDED
Binary file (2.67 kB).
utils/__pycache__/utilities.cpython-311.pyc ADDED
Binary file (18.3 kB).
utils/__pycache__/visual_llm_gender.cpython-311.pyc ADDED
Binary file (4.28 kB).
utils/__pycache__/xml_to_json.cpython-311.pyc ADDED
Binary file (3.35 kB).
utils/character_to_character.py ADDED
@@ -0,0 +1,192 @@
+ def compute_iou(boxA, boxB):
+     """
+     Compute the overlap of boxA with boxB as intersection area over boxA's area.
+     Boxes are (x1, y1, x2, y2).
+     """
+     xA = max(boxA[0], boxB[0])
+     yA = max(boxA[1], boxB[1])
+     xB = min(boxA[2], boxB[2])
+     yB = min(boxA[3], boxB[3])
+
+     interW = max(0, xB - xA)
+     interH = max(0, yB - yA)
+     interArea = interW * interH
+
+     boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
+     # boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
+     # unionArea = boxAArea + boxBArea - interArea
+
+     return interArea / boxAArea if boxAArea > 0 else 0.0
+
+ def match_faces_to_bodies(face_boxes, body_boxes, iou_threshold=0.9):
+     """
+     Match each body box to the face box with highest overlap above threshold.
+     Return list of (face_box or None, body_box).
+     If no face meets the threshold for a body, face_box is None.
+     """
+     matches = []
+     used_faces = set()
+
+     for b in body_boxes:
+         best_iou = 0.0
+         best_f = None
+         for i, f in enumerate(face_boxes):
+             if i in used_faces:
+                 continue
+             iou = compute_iou(f, b)
+             if iou > best_iou:
+                 best_iou = iou
+                 best_f = i
+         if best_iou >= iou_threshold:
+             matches.append((face_boxes[best_f], b))
+             used_faces.add(best_f)
+         else:
+             # No face sufficiently inside this body → treat body alone
+             matches.append((None, b))
+
+
+     return matches
+
+ # matches = match_faces_to_bodies(predictions_face,predictions_body)
+ # for face_box, body_box in matches:
+ #     print(face_box, body_box)
+
+ import cv2
+ import numpy as np
+
+ def crop_and_prepare_embeddings(image_path, face_boxes, body_boxes, model, fuse_features):
+     """
+     For each matched (face, body) pair:
+     - If both present: crop both and fuse embeddings.
+     - If only face: crop face and use face embedding alone.
+     - If only body: crop body and use body embedding alone.
+     Returns list of embeddings.
+     """
+     img = cv2.imread(image_path)
+     pairs = match_faces_to_bodies(face_boxes, body_boxes)
+     embeddings = []
+     new_pairs = []
+     for face_box, body_box in pairs:
+         if face_box is not None:
+             x1, y1, x2, y2 = face_box
+             face_crop = img[y1:y2, x1:x2]
+             face_emb = model(face_crop)
+         else:
+             face_emb = None
+
+         if body_box is not None:
+             x1, y1, x2, y2 = body_box
+             body_crop = img[y1:y2, x1:x2]
+             body_emb = model(body_crop)
+         else:
+             body_emb = None
+
+         if face_emb is not None and body_emb is not None:
+             emb = fuse_features(face_emb, body_emb)
+         # elif face_emb is not None:
+         #     emb = face_emb
+         # else:
+         #     emb = body_emb
+
+             embeddings.append(emb)
+             new_pairs.append((face_box, body_box))
+
+     return embeddings, new_pairs
+
+ """
+ Prerequisites
+ -------------
+ pip install torch torchvision scikit-learn opencv-python pillow tqdm
+ Download a backbone checkpoint that outputs 128-D embeddings.
+ Here we assume `reid_backbone.pth` is a ResNet-50 fine-tuned on manga faces/bodies.
+ """
+ import cv2
+ import torch
+ import torch.nn as nn
+ import torchvision.transforms as T
+ import numpy as np
+ from sklearn.cluster import DBSCAN
+ from tqdm import tqdm
+ from pathlib import Path
+
+ # -------------------------------------------------------------------
+ # 1. Detection post-processing: crop_faces() is given; add crop_bodies()
+ # -------------------------------------------------------------------
+ def crop_bodies(image_path, boxes):
+     """Crop body regions given YOLO boxes (x1,y1,x2,y2)."""
+     img = cv2.imread(image_path)
+     crops = []
+     for (x1, y1, x2, y2) in boxes:
+         body = img[y1:y2, x1:x2]
+         crops.append(body)
+     return crops
+ def crop_faces(image_path, boxes):
+     image = cv2.imread(image_path)
+     faces = []
+     for (x1, y1, x2, y2) in boxes:
+         face = image[y1:y2, x1:x2]  # crop the face
+         faces.append(face)
+     return faces
+ # -------------------------------------------------------------------
+ # 2. Embedding network wrapper
+ # -------------------------------------------------------------------
+ class ReIDNet(nn.Module):
+     """Backbone that outputs an L2-normalised embedding."""
+     def __init__(self, ckpt="./models/model.pth", dim=1031):
+         super().__init__()
+         base = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', weights=None)
+         base.fc = nn.Linear(base.fc.in_features, dim)
+         self.backbone = base
+         def add_prefix(state_dict, prefix="backbone."):
+             return {f"{prefix}{k}" if not k.startswith(prefix) else k: v
+                     for k, v in state_dict.items()}
+         raw = torch.load(ckpt, map_location="cpu")
+         patched = add_prefix(raw)
+         self.load_state_dict(patched)
+         self.backbone.eval()
+
+         # Normalisation identical to training
+         self.preprocess = T.Compose([
+             T.ToPILImage(),
+             T.Resize((112, 112)),
+             T.ToTensor(),
+             T.Normalize(mean=[0.5]*3, std=[0.5]*3)
+         ])
+
+     @torch.inference_mode()
+     def forward(self, np_image):
+         # tensor = self.preprocess(np_image).unsqueeze(0)  # 1×3×112×112
+         tensor = self.preprocess(np_image).unsqueeze(0).to(next(self.backbone.parameters()).device)
+         feat = self.backbone(tensor)
+         return nn.functional.normalize(feat, p=2, dim=1).cpu().numpy().ravel()
+
+ # -------------------------------------------------------------------
+ # 3. Utility to fuse face & body features
+ # -------------------------------------------------------------------
+ def fuse_features(face_f, body_f, w_face=0.6):
+     """Simple weighted late fusion."""
+     return np.concatenate([w_face * face_f, (1 - w_face) * body_f])
+
+ # -------------------------------------------------------------------
+ # 4. Main pipeline for a single page
+ # -------------------------------------------------------------------
+ def count_characters_single_page(img_path,
+                                  predictions,
+                                  eps=0.3,
+                                  min_samples=3,
+                                  device='cpu'):
+     model = ReIDNet().to(device)
+     face_boxes = [face['bbox'] for face in predictions.faces]
+     body_boxes = [body['bbox'] for body in predictions.bodies]
+     embeddings, pairs = crop_and_prepare_embeddings(img_path, face_boxes, body_boxes, model, fuse_features)
+     fused_embeds = np.vstack(embeddings)  # N×(2*dim)
+
+     # ----------------------------------------------------------------
+     # 5. DBSCAN clustering
+     # ----------------------------------------------------------------
+     db = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
+     labels = db.fit_predict(fused_embeds)  # -1 => noise
+
+     # Count unique positive cluster IDs
+     unique_chars = len(set(l for l in labels if l != -1))
+     return unique_chars, labels, pairs
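For reference, page_transcription.py above calls this entry point on the YOLO output; a minimal usage sketch, assuming yolo_model has already been loaded from config["yolo_path"]:

from utils.identify_speaker import get_predictions_from_yolo
from utils.character_to_character import count_characters_single_page

img = "./dataset/images/test/AkkeraKanjinchou_083.jpg"
predictions = get_predictions_from_yolo(img, yolo_model)
unique_chars, labels, pairs = count_characters_single_page(img, predictions, eps=0.3, min_samples=1, device='cuda')
# labels[i] is the DBSCAN cluster id of the i-th fused (face, body) pair; -1 marks noise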
utils/coco_to_yolo.py ADDED
@@ -0,0 +1,52 @@
+ def coco_to_yolo(config):
+     import os
+     import json
+     from pathlib import Path
+     from tqdm import tqdm
+
+
+     labels_dir = config["root_labels"]
+     annotations_dir = config["root_annotations"]
+
+     def convert_coco_json_to_yolo(coco_json_path, output_dir):
+         with open(coco_json_path) as f:
+             data = json.load(f)
+
+         images = {img['id']: img for img in data['images']}
+         categories = {cat['id']: cat['name'] for cat in data['categories']}
+         category_id_map = {cat_id: i for i, cat_id in enumerate(sorted(categories.keys()))}
+
+         os.makedirs(output_dir, exist_ok=True)
+         print(len(images))
+         for ann in tqdm(data['annotations']):
+             img = images[ann['image_id']]
+             image_name = Path(img['file_name']).stem
+             img_width = img['width']
+             img_height = img['height']
+             bbox = ann['bbox']  # [x_min, y_min, width, height]
+             x_center = (bbox[0] + bbox[2] / 2) / img_width
+             y_center = (bbox[1] + bbox[3] / 2) / img_height
+             w = bbox[2] / img_width
+             h = bbox[3] / img_height
+
+             class_id = category_id_map[ann['category_id']]
+
+             label_path = os.path.join(output_dir, f"{image_name}.txt")
+             with open(label_path, "a") as f:
+                 f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n")
+
+         print(f"Converted labels saved to: {output_dir}")
+
+     # Example usage
+     convert_coco_json_to_yolo(
+         coco_json_path=annotations_dir + "instances_train.json",
+         output_dir=labels_dir + "train/"
+     )
+     convert_coco_json_to_yolo(
+         coco_json_path=annotations_dir + "instances_val.json",
+         output_dir=labels_dir + "val/"
+     )
+     convert_coco_json_to_yolo(
+         coco_json_path=annotations_dir + "instances_test.json",
+         output_dir=labels_dir + "test/"
+     )
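A worked example of the bbox conversion performed in the loop above, with illustrative numbers (not taken from the dataset):

# COCO bbox [x_min, y_min, width, height] on a 1000x800 page
bbox, img_w, img_h = [100, 50, 200, 100], 1000, 800
x_center = (bbox[0] + bbox[2] / 2) / img_w   # (100 + 100) / 1000 = 0.200000
y_center = (bbox[1] + bbox[3] / 2) / img_h   # (50 + 50) / 800   = 0.125000
w, h = bbox[2] / img_w, bbox[3] / img_h      # 0.200000, 0.125000
# YOLO label line: "<class_id> 0.200000 0.125000 0.200000 0.125000"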
utils/create_audiobook.py ADDED
@@ -0,0 +1,55 @@
+ import torch
+ from TTS.api import TTS
+ import os
+ import numpy as np
+ from scipy.io import wavfile
+
+ # Save the original torch.load function
+ _original_torch_load = torch.load
+
+ # Define a custom function that forces weights_only=False
+ def custom_torch_load(*args, **kwargs):
+     if "weights_only" not in kwargs:
+         kwargs["weights_only"] = False
+     return _original_torch_load(*args, **kwargs)
+
+ # Override torch.load globally
+ torch.load = custom_torch_load
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+
+ # generate_samples()
+ def create_audiobook(gender_list, english_transcript, speaker_list):
+     parent_wav = []
+     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+     for text in english_transcript:
+         print(text['char_id'], text['text'])
+         voice_gender = 'Male'
+         voice_speaker = ''
+         if text['char_id'] != None:
+             voice_gender = gender_list[int(text['char_id'])][0]
+             print("voice person", speaker_list[text['char_id']])
+             voice_speaker = speaker_list[text['char_id']]
+         else:
+             print("voice person", speaker_list[-1])
+             voice_speaker = speaker_list[-1]
+         print("Voice Gender", voice_gender)
+
+         try:
+
+             wav = tts.tts(
+                 text=text['text'],
+                 speaker=voice_speaker.replace('_', ' '),
+                 language="en"
+             )
+             print("✓ Audio Generated")
+             parent_wav.append(wav)
+
+         except Exception as e:
+             print(f"✗ Error with : {e}")
+     combined = np.concatenate(parent_wav)
+     return combined, tts.synthesizer.output_sample_rate
+
+ # parent_wav = generate_audio(gender_list,english_transcript,tts)
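For reference, page_transcription.py writes the returned audio with scipy; a minimal sketch of that step, assuming gender_list, english_transcript, and speaker_list were produced earlier in the pipeline:

from scipy.io.wavfile import write
from utils.create_audiobook import create_audiobook

combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
write("output.wav", sample_rate, combined)  # app.py expects output.wav in the working directory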
utils/create_directories.py ADDED
@@ -0,0 +1,13 @@
+ import os
+ def create_directories(config):
+     dir_path = config["dir_path"]
+     dir_names = ""
+     if os.path.exists(dir_path):
+         with open(dir_path, "r") as f:
+             dir_names = f.readlines()
+         for dir_name in dir_names:
+             dir_name = dir_name.strip()
+             if not os.path.exists(dir_name):
+                 os.mkdir(dir_name)
+     else:
+         print(f"Ensure {dir_path} exists in utils")
utils/directories.txt ADDED
@@ -0,0 +1,13 @@
+ ./dataset
+ ./dataset/images
+ ./dataset/labels
+ ./dataset/json
+ ./dataset/panels
+ ./dataset/panel_data
+ ./dataset/annotations
+ ./dataset/images/train
+ ./dataset/images/val
+ ./dataset/images/test
+ ./dataset/labels/train
+ ./dataset/labels/val
+ ./dataset/labels/test
utils/get_yolo_predictions.py ADDED
@@ -0,0 +1,11 @@
+
+ CLASSES = {0: "body", 1: "face", 2: "frame", 3: "text"}  # confirmed by user
+
+ def get_yolo_predictions(yolo_model, img_path: str):
+     results = yolo_model.predict(source=img_path, device='cuda')
+     dets = {"body": [], "face": [], "frame": [], "text": []}
+     for box in results[0].boxes:
+         c = int(box.cls[0])
+         x1, y1, x2, y2 = map(int, box.xyxy[0])
+         dets[CLASSES[c]].append([x1, y1, x2, y2])
+     return dets
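A minimal usage sketch, assuming the trained weights path from config.json and a test image from the dataset split (a CUDA device is required by the hard-coded device='cuda' above):

from ultralytics import YOLO
from utils.get_yolo_predictions import get_yolo_predictions

model = YOLO("./yolo11m/runs/detect/train2/weights/best.pt")
dets = get_yolo_predictions(model, "./dataset/images/test/AkkeraKanjinchou_083.jpg")
print({k: len(v) for k, v in dets.items()})  # counts of body/face/frame/text boxes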
utils/identify_speaker.py ADDED
@@ -0,0 +1,443 @@
+ #!/usr/bin/env python3
+ """
+ Complete Standalone End-to-End Speech-to-Speaker Association Inference Script
+ Includes all necessary functions without external dependencies on custom files
+ """
+
+ import json
+ import torch
+ import cv2
+ from torch_geometric.data import HeteroData, Batch
+ from typing import Dict, List, Any, Optional, Tuple, Union
+ import os
+ from ultralytics import YOLO
+ from utils.train_speaker import hungarian_matching, AssocGCN, infer_associations, DatasetLoader
+
+
+
+ # ============================================================================
+ # NEW CLASSES FOR UNSEEN IMAGE PROCESSING
+ # ============================================================================
+
+ class DetectionPredictions:
+     """Container for object detection predictions from an unseen image"""
+
+     def __init__(self, image_path: str):
+         self.image_path = image_path
+         self.panels: List[Dict] = []
+         self.speech_bubbles: List[Dict] = []
+         self.faces: List[Dict] = []
+         self.bodies: List[Dict] = []
+         self.image_size: Tuple[int, int] = (0, 0)  # (width, height)
+
+     def add_panel(self, bbox: List[float], confidence: float, panel_id: int):
+         """Add panel detection (frame in YOLO terms)"""
+         self.panels.append({
+             'bbox': bbox, 'confidence': confidence, 'id': panel_id
+         })
+
+     def add_speech_bubble(self, bbox: List[float], confidence: float,
+                           panel_id: int, bubble_id: int):
+         """Add speech bubble detection (text in YOLO terms)"""
+         self.speech_bubbles.append({
+             'bbox': bbox, 'confidence': confidence,
+             'panel_id': panel_id, 'id': bubble_id
+         })
+
+     def add_face(self, bbox: List[float], confidence: float,
+                  panel_id: int, face_id: int):
+         """Add face detection"""
+         self.faces.append({
+             'bbox': bbox, 'confidence': confidence,
+             'panel_id': panel_id, 'id': face_id
+         })
+
+     def add_body(self, bbox: List[float], confidence: float,
+                  panel_id: int, body_id: int):
+         """Add body detection"""
+         self.bodies.append({
+             'bbox': bbox, 'confidence': confidence,
+             'panel_id': panel_id, 'id': body_id
+         })
+
+
+ # ============================================================================
+ # ENHANCED DATA PROCESSING FUNCTIONS
+ # ============================================================================
+
+ def create_panel_dict_from_predictions(predictions: DetectionPredictions,
+                                        panel_id: int) -> Dict:
+     """
+     ADAPTED: Creates a panel dictionary from detection predictions
+     Uses the same format as the original create_hetero_data_from_panel expects
+     """
+     # Filter predictions for this panel
+     panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == panel_id]
+     panel_faces = [f for f in predictions.faces if f['panel_id'] == panel_id]
+
+     # Create bubbles list in expected format
+     bubbles = []
+     for i, bubble in enumerate(panel_bubbles):
+         bubbles.append({
+             'bubble_id': bubble['id'],
+             'bbox': bubble['bbox']
+         })
+
+     # Create faces list in expected format
+     faces = []
+     for i, face in enumerate(panel_faces):
+         faces.append({
+             'face_id': face['id'],
+             'bbox': face['bbox']
+         })
+
+     # Create panel dict in expected format
+     panel_dict = {
+         'panel_id': f"unseen_panel_{panel_id}",
+         'width': predictions.image_size[0],
+         'height': predictions.image_size[1],
+         'bubbles': bubbles,
+         'faces': faces,
+         'links': []  # No ground truth links for unseen images
+     }
+
+     return panel_dict
+
+
+ def create_hetero_data_from_predictions(predictions: DetectionPredictions,
+                                         panel_id: int) -> Optional[HeteroData]:
+     """
+     ENHANCED: Creates HeteroData from detection predictions
+     Leverages existing create_hetero_data_from_panel function
+     """
+     # Convert predictions to panel dict format
+     panel_dict = create_panel_dict_from_predictions(predictions, panel_id)
+
+     # Use existing function to create HeteroData
+     hetero_data = DatasetLoader.create_hetero_data_from_panel(panel_dict)
+
+     if hetero_data is not None:
+         # Remove ground truth labels since we don't have them for unseen images
+         if hasattr(hetero_data['bubble', 'to', 'face'], 'edge_label'):
+             delattr(hetero_data['bubble', 'to', 'face'], 'edge_label')
+
+     return hetero_data
+
+
+ # ============================================================================
+ # YOLO DETECTION INTEGRATION
+ # ============================================================================
+
+ def get_predictions_from_yolo(img_path: str, yolo_model) -> DetectionPredictions:
+     """
+     INTEGRATED: YOLO detection pipeline that maps to our detection categories
+     Maps YOLO classes: {0:"body", 1:"face", 2:"frame", 3:"text"}
+     """
+     CLASSES = {0: "body", 1: "face", 2: "frame", 3: "text"}
+
+     # Create predictions container
+     predictions = DetectionPredictions(img_path)
+
+     # Load image to get dimensions
+     img = cv2.imread(img_path)
+     if img is None:
+         raise ValueError(f"Could not load image: {img_path}")
+     height, width = img.shape[:2]
+     predictions.image_size = (width, height)
+
+     # Get YOLO predictions
+     results = yolo_model.predict(source=img_path, device='cuda', verbose=False)
+
+     # Process detections and organize by type
+     detections_by_type = {"body": [], "face": [], "frame": [], "text": []}
+
+     for box in results[0].boxes:
+         c = int(box.cls[0])
+         x1, y1, x2, y2 = map(int, box.xyxy[0])
+         confidence = float(box.conf[0])
+
+         detection_type = CLASSES[c]
+         detections_by_type[detection_type].append({
+             'bbox': [x1, y1, x2, y2],
+             'confidence': confidence
+         })
+
+     # Process frame detections as panels
+     for i, frame in enumerate(detections_by_type["frame"]):
+         predictions.add_panel(frame['bbox'], frame['confidence'], i)
+
+     # If no frames detected, create a default full-image panel
+     if len(predictions.panels) == 0:
+         predictions.add_panel([0, 0, width, height], 1.0, 0)
+
+     # Process text detections as speech bubbles
+     for i, text in enumerate(detections_by_type["text"]):
+         panel_id = find_containing_panel(text['bbox'], predictions.panels)
+         predictions.add_speech_bubble(text['bbox'], text['confidence'], panel_id, i)
+
+     from collections import defaultdict
+
+     # Group bubbles by panel_id
+     grouped_texts = defaultdict(list)
+     for bubble in predictions.speech_bubbles:
+         grouped_texts[bubble['panel_id']].append(bubble)
+
+     # Assign seq based on right-to-left order (sort by bbox[0] descending)
+     for panel_id, bubbles in grouped_texts.items():
+         sorted_bubbles = sorted(bubbles, key=lambda b: b['bbox'][0], reverse=True)
+         for seq, bubble in enumerate(sorted_bubbles):
+             bubble['seq'] = seq
+
+     # # Optional: print results
+     # for bubble in speech_bubbles:
+     #     print(bubble)
+
+     # Process face detections
+     for i, face in enumerate(detections_by_type["face"]):
+         panel_id = find_containing_panel(face['bbox'], predictions.panels)
+         predictions.add_face(face['bbox'], face['confidence'], panel_id, i)
+
+     # Process body detections (optional - can be used for additional context)
+     for i, body in enumerate(detections_by_type["body"]):
+         panel_id = find_containing_panel(body['bbox'], predictions.panels)
+         predictions.add_body(body['bbox'], body['confidence'], panel_id, i)
+
+     return predictions
+
+
+ def find_containing_panel(bbox: List[float], panels: List[Dict]) -> int:
+     """
+     Helper function to determine which panel contains a detection
+     """
+     bbox_center_x = (bbox[0] + bbox[2]) / 2
+     bbox_center_y = (bbox[1] + bbox[3]) / 2
+
+     for panel in panels:
+         p_bbox = panel['bbox']
+         if (p_bbox[0] <= bbox_center_x <= p_bbox[2] and
+                 p_bbox[1] <= bbox_center_y <= p_bbox[3]):
+             return panel['id']
+
+     # Return first panel if not contained in any
+     return panels[0]['id'] if panels else 0
+
+
+ # ============================================================================
+ # MODEL LOADING WITH ERROR HANDLING
+ # ============================================================================
+
+ def load_trained_speaker_model(model_path: str, device: str = "cuda") -> AssocGCN:
+     """
+     FIXED: Load trained AssocGCN model with proper error handling
+     Handles different checkpoint formats including 'model_state' key
+     """
+     # Create model instance
+     model = AssocGCN().to(device)
+
+     # Load the checkpoint
+     checkpoint = torch.load(model_path, map_location=device)
+
+     # Handle different checkpoint formats
+     if isinstance(checkpoint, dict):
+         # Check for different possible keys where model state is stored
+         if 'model_state_dict' in checkpoint:
+             state_dict = checkpoint['model_state_dict']
+         elif 'model_state' in checkpoint:  # This handles our specific case
+             state_dict = checkpoint['model_state']
+         elif 'state_dict' in checkpoint:
+             state_dict = checkpoint['state_dict']
+         else:
+             # If no specific key, assume the entire checkpoint is the state dict
+             # but filter out non-model keys
+             state_dict = {k: v for k, v in checkpoint.items()
+                           if k not in {'epoch', 'loss', 'optimizer_state_dict'}}
+     else:
+         # Assume checkpoint is directly the state dict
+         state_dict = checkpoint
+
+     # Handle DataParallel module prefix if present
+     if any(key.startswith('module.') for key in state_dict.keys()):
+         new_state_dict = {}
+         for key, value in state_dict.items():
+             new_key = key.replace('module.', '') if key.startswith('module.') else key
+             new_state_dict[new_key] = value
+         state_dict = new_state_dict
+
+     try:
+         # Try to load with strict=True first
+         model.load_state_dict(state_dict, strict=True)
+         print("✅ Model loaded successfully with strict=True")
+     except RuntimeError as e:
+         print(f"⚠️ Warning: {str(e)}")
+         print("Attempting to load with strict=False...")
+         try:
+             # Try with strict=False to ignore missing/unexpected keys
+             missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+             if missing_keys:
+                 print(f"⚠️ Missing keys: {missing_keys}")
+             if unexpected_keys:
+                 print(f"⚠️ Unexpected keys: {unexpected_keys}")
+             print("✅ Model loaded successfully with strict=False")
+         except Exception as e2:
+             print(f"❌ Failed to load model: {str(e2)}")
+             raise e2
+
+     model.eval()
+     return model
+
+
+ # ============================================================================
+ # MAIN INFERENCE PIPELINE
+ # ============================================================================
+
+ @torch.no_grad()
+ def identify_speaker(model: AssocGCN, image_path: str,
+                      yolo_model, device: str = "cuda") -> Dict[str, Dict[int, int]]:
+     """
+     COMPLETE: End-to-end inference pipeline for unseen images
+     """
+
+     try:
+         # Step 1: Run YOLO detection pipeline
+         print(f"🔍 Running YOLO detection on {os.path.basename(image_path)}...")
+         predictions = get_predictions_from_yolo(image_path, yolo_model)
+
+         # print(f"📊 Detection Results:")
+         # print(f" - Panels/Frames: {len(predictions.panels)}")
+         # print(f" - Speech Bubbles/Text: {len(predictions.speech_bubbles)}")
+         # print(f" - Faces: {len(predictions.faces)}")
+         # print(f" - Bodies: {len(predictions.bodies)}")
+
+         # Check if we have the minimum required detections
+         if len(predictions.speech_bubbles) == 0:
+             print("⚠️ No speech bubbles/text detected. Cannot perform association.")
+             return {}
+
+         if len(predictions.faces) == 0:
+             print("⚠️ No faces detected. Cannot perform association.")
+             return {}
+
+         # Step 2: Create HeteroData for each panel
+         panels_data = []
+         panel_ids = list(set([p['id'] for p in predictions.panels]))
+
+         for panel_id in panel_ids:
+             panel_data = create_hetero_data_from_predictions(predictions, panel_id)
+             # print("panel data", panel_data, dir(panel_data))
+             # print(('bubble' in panel_data.node_types, 'face' in panel_data.node_types,
+             #        panel_data['bubble'].x.size(0) > 0, panel_data['face'].x.size(0) > 0))
+             if panel_data is not None:
+                 panel_data = panel_data.to(device)
+                 panels_data.append(panel_data)
+
+         # Step 3: Run model inference
+         results = {}
+
+         for panel_data in panels_data:
+             # print("ppppp", panel_data == None)
+             # if (hasattr(panel_data, 'bubble') and hasattr(panel_data, 'face') and
+             #         panel_data['bubble'].x.size(0) > 0 and panel_data['face'].x.size(0) > 0):
+             if (panel_data != None and 'bubble' in panel_data.node_types and 'face' in panel_data.node_types and
+                     panel_data['bubble'].x.size(0) > 0 and panel_data['face'].x.size(0) > 0):
+
+                 # Run inference using existing function
+                 mapping = infer_associations(model, panel_data)
+
+                 # print(f"\n🖼️ {panel_data.panel_id}:")
+                 # if mapping:
+                 #     for bubble_idx, face_idx in mapping.items():
+                 #         print(f" Text/Bubble {bubble_idx} → Face {face_idx}")
+                 # else:
+                 #     print(" No associations found")
+
+                 results[panel_data.panel_id] = mapping
+             else:
+                 print(f"⚠️ {panel_data.panel_id}: No valid bubbles or faces, skipping...")
+
+
+         total_associations = 0
+         if results:
+             for panel_id, mapping in results.items():
+                 if mapping:
+                     # print(f"\n🖼️ {panel_id}:")
+                     for bubble_id, face_id in mapping.items():
+                         # print(f" Text/Bubble {bubble_id} ← → Face {face_id}")
+                         total_associations += 1
+                 # else:
+                 #     print(f"\n🖼️ {panel_id}: No associations found")
+         else:
+             print("No associations found in the image.")
+
+         print(f"\n✅ Total associations found: {total_associations}")
+
+         return results, predictions
+
+     except Exception as e:
+         print(f"❌ Error during inference: {str(e)}")
+         raise e
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # #!/usr/bin/env python3
+ # """
+ # inference.py ── Run saved AssocGCN on new images / panels.
+ # Generate per-panel mappings and (optionally) aggregate metrics.
+ # """
+ # import argparse
+ # import torch
+ # from pathlib import Path
+ # from torch_geometric.data import Batch
+ # from typing import Dict, List
+
+ # from utils import load_model
+ # from train_speaker import DatasetLoader, infer_associations  # your original file
+
+ # @torch.no_grad()
+ # def evaluate_panels(model,
+ #                     panels: List["HeteroData"],
+ #                     compute_metrics: bool = True) -> None:
+ #     device = next(model.parameters()).device
+ #     tp = fp = fn = 0
+
+ #     for idx, data in enumerate(panels):
+ #         mapping = infer_associations(model, data)
+ #         print(f"\n🖼️ Panel {data.panel_id}:")
+ #         for bub, face in mapping.items():
+ #             print(f" Bubble {bub} → Face {face}")
+
+ #         if compute_metrics and "edge_label" in data["bubble", "to", "face"]:
+ #             gt = {(i.item(), j.item())
+ #                   for i, j, lbl in zip(*data["bubble", "to", "face"].edge_index,
+ #                                        data["bubble", "to", "face"].edge_label)
+ #                   if lbl == 1}
+ #             pred = {(b, f) for b, f in mapping.items()}
+
+ #             tp += len(gt & pred)
+ #             fp += len(pred - gt)
+ #             fn += len(gt - pred)
+
+ #     if compute_metrics:
+ #         prec = tp / (tp + fp) if (tp + fp) else 0
+ #         rec = tp / (tp + fn) if (tp + fn) else 0
+ #         f1 = 2*prec*rec/(prec+rec) if (prec+rec) else 0
+ #         print("\n📊 Aggregated metrics")
+ #         print(f" Precision: {prec:.3f}")
+ #         print(f" Recall : {rec:.3f}")
+ #         print(f" F1 Score : {f1:.3f}")
+
+ # def identify_speaker(config):
+ #     pass
utils/japanese_ocr.py ADDED
@@ -0,0 +1,52 @@
+ import re
+ from pathlib import Path
+
+ import jaconv
+ import torch
+ from PIL import Image
+ from loguru import logger
+ from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel, GenerationMixin
+
+
+ class MangaOcrModel(VisionEncoderDecoderModel, GenerationMixin):
+     pass
+
+ class MangaOcr:
+     def __init__(self, pretrained_model_name_or_path="kha-white/manga-ocr-base", force_cpu=False):
+         logger.info(f"Loading OCR model from {pretrained_model_name_or_path}")
+         self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
+         self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
+         self.model = MangaOcrModel.from_pretrained(pretrained_model_name_or_path)
+
+         if not force_cpu and torch.cuda.is_available():
+             logger.info("Using CUDA")
+             self.model.cuda()
+         elif not force_cpu and torch.backends.mps.is_available():
+             logger.info("Using MPS")
+             self.model.to("mps")
+         else:
+             logger.info("Using CPU")
+         logger.info("OCR ready")
+
+     def __call__(self, img_or_path):
+         img = img_or_path.convert("L").convert("RGB")
+
+         x = self._preprocess(img)
+         x = self.model.generate(x[None].to(self.model.device), max_length=300)[0].cpu()
+         x = self.tokenizer.decode(x, skip_special_tokens=True)
+         x = post_process(x)
+         return x
+
+     def _preprocess(self, img):
+         pixel_values = self.processor(img, return_tensors="pt").pixel_values
+         return pixel_values.squeeze()
+
+
+ def post_process(text):
+     text = "".join(text.split())
+     text = text.replace("…", "...")
+     text = re.sub("[・.]{2,}", lambda x: (x.end() - x.start()) * ".", text)
+     text = jaconv.h2z(text, ascii=True, digit=True)
+
+     return text
+
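A minimal usage sketch of the OCR wrapper above; the __call__ path expects a PIL image, and the crop path below is hypothetical:

from PIL import Image
from utils.japanese_ocr import MangaOcr

ocr = MangaOcr()                        # downloads kha-white/manga-ocr-base on first use
bubble = Image.open("bubble_crop.jpg")  # hypothetical speech-bubble crop
print(ocr(bubble))                      # Japanese text, cleaned by post_process()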
utils/json_to_coco.py ADDED
@@ -0,0 +1,131 @@
+ import json
+ import os
+
+ def json_to_coco(config):
+     base_dir = config["root"]
+     image_dir = config["root_images"]
+     annotations_dir = config["root_annotations"]
+
+     def create_coco(json_dir):
+         # COCO base structure
+         coco = {
+             "images": [],
+             "annotations": [],
+             "categories": []
+         }
+         uniq_filename_mapping = {}
+         # Category mapping
+         category_names = ["body", "face", "frame", "text"]
+         category_map = {name: idx + 1 for idx, name in enumerate(category_names)}
+         coco["categories"] = [{"id": cid, "name": name, "supercategory": "object"} for name, cid in category_map.items()]
+         import copy
+         coco_train = copy.deepcopy(coco)
+         coco_test = copy.deepcopy(coco)
+         coco_val = copy.deepcopy(coco)
+         uniq = 0
+         for book_json in os.listdir(json_dir):
+             book = str(book_json).split(".")[0]
+             data = []
+             with open(base_dir + "json/" + book + ".json", 'r', encoding='utf-8') as file:
+                 data = json.load(file)
+
+             pages = data["page"]
+
+             train_output = []
+             val_output = []
+             test_output = []
+
+             import math
+             import shutil
+
+             page_count = len(pages) - 2
+             div = page_count / 10
+             div = int(math.ceil(div))
+             # train_count = page_count - (2 * div)
+             # val_count = div
+             # test_count = div
+             # print(train_count,val_count,test_count)
+             train_arr = []
+             val_arr = []
+             test_arr = []
+             count = 0
+             for i in range(page_count, 0, -1):
+                 if count < div:
+                     test_arr.append(i)
+                 elif count < 2 * div:
+                     val_arr.append(i)
+                 else:
+                     train_arr.append(i)
+                 count += 1
+
+             # Annotation ID counter
+             ann_id = 1
+
+             # Helper to create COCO bbox annotation
+             def create_annotation(ann, category_name, image_id, ann_id, uniq):
+                 x_min = int(ann["@xmin"])
+                 y_min = int(ann["@ymin"])
+                 x_max = int(ann["@xmax"])
+                 y_max = int(ann["@ymax"])
+                 width = x_max - x_min
+                 height = y_max - y_min
+
+                 return {
+                     "id": ann_id,
+                     "image_id": uniq,
+                     "category_id": category_map[category_name],
+                     "bbox": [x_min, y_min, width, height],
+                     "area": width * height,
+                     "iscrowd": 0
+                 }
+
+             def append_data(coco, ann_id, uniq):
+                 # Add image entry
+                 coco["images"].append({
+                     "id": uniq,
+                     "width": page["@width"],
+                     "height": page["@height"],
+                     "file_name": f"{book}_{str(img_id).zfill(3)}.jpg"
+                 })
+
+
+                 # Loop through each category and add annotations
+                 for category in category_names:
+                     for ann in page.get(category, []):
+                         coco["annotations"].append(
+                             create_annotation(ann, category, img_id, ann_id, uniq)
+                         )
+                         ann_id += 1
+
+
+             for page in pages:
+                 uniq += 1
+                 img_id = int(page["@index"])
+                 if img_id == 0 or img_id == 1:
+                     continue
+
+                 curr_img_path = os.path.join(config["manga109_root_dir"], "images", book, f"{str(img_id).zfill(3)}.jpg")
+                 uniq_filename_mapping[uniq] = book + "_" + (curr_img_path.split("\\")[-1])
+                 if img_id in train_arr:
+                     append_data(coco_train, ann_id, uniq)
+                     train_output.append(coco)
+                     shutil.copyfile(curr_img_path, os.path.join(image_dir, "train/", book + "_" + (curr_img_path.split("\\")[-1])))
+                 if img_id in val_arr:
+                     append_data(coco_val, ann_id, uniq)
+                     val_output.append(coco)
+                     shutil.copyfile(curr_img_path, os.path.join(image_dir, "val/", book + "_" + (curr_img_path.split("\\")[-1])))
+                 if img_id in test_arr:
+                     append_data(coco_test, ann_id, uniq)
+                     test_output.append(coco)
+                     shutil.copyfile(curr_img_path, os.path.join(image_dir, "test/", book + "_" + (curr_img_path.split("\\")[-1])))
+
+         with open(base_dir + "temp_uniq_file_name_mapping.json", "w", encoding="utf-8") as f:
+             json.dump(uniq_filename_mapping, f, ensure_ascii=False, indent=2)
+         with open(annotations_dir + "instances_train.json", "w", encoding="utf-8") as f:
+             json.dump(coco_train, f, ensure_ascii=False, indent=2)
+         with open(annotations_dir + "instances_val.json", "w", encoding="utf-8") as f:
+             json.dump(coco_val, f, ensure_ascii=False, indent=2)
+         with open(annotations_dir + "instances_test.json", "w", encoding="utf-8") as f:
+             json.dump(coco_test, f, ensure_ascii=False, indent=2)
+
+     create_coco(base_dir + "json/")
utils/json_to_gcn.py ADDED
@@ -0,0 +1,484 @@
+ """
+ Enhanced Manga Panel Dataset Converter: Extract frames as panels and prepare GCN-ready dataset.
+ Crops each frame from manga pages and creates a dataset with the elements within each panel.
+ """
+
+ import json
+ import os
+ import argparse
+ from pathlib import Path
+ from typing import Dict, List, Any, Optional, Tuple
+ import difflib
+ import math
+ import cv2
+ import numpy as np
+
+
+ class MangaPanelDatasetConverter:
+     def __init__(self, fuzzy_mapping: bool = True, proximity_threshold: float = 200.0):
+         self.fuzzy_mapping = fuzzy_mapping
+         self.proximity_threshold = proximity_threshold
+         self.character_mapping = {}
+         self.stats = {
+             'total_pages': 0,
+             'total_panels': 0,
+             'total_bubbles': 0,
+             'total_faces': 0,
+             'total_bodies': 0,
+             'successful_links': 0,
+             'failed_links': 0,
+             'unique_characters': set(),
+             'extracted_panels': 0
+         }
+
+     def load_manga_data(self, file_path: str) -> Dict[str, Any]:
+         """Load manga data from JSON file."""
+         with open(file_path, 'r', encoding='utf-8') as f:
+             return json.load(f)
+
+     def build_character_mapping(self, character_list: List[Dict]) -> Dict[str, str]:
+         """Build character ID to name mapping from character list."""
+         mapping = {}
+         for char in character_list:
+             mapping_id = char.get('@id', '')
+             char_name = char.get('@name', '')
+             if mapping_id and char_name:
+                 mapping[mapping_id] = char_name
+         return mapping
+
+     def check_overlap(self, bbox1: List[int], bbox2: List[int]) -> bool:
+         """Check if two bounding boxes overlap."""
+         x1_min, y1_min, x1_max, y1_max = bbox1
+         x2_min, y2_min, x2_max, y2_max = bbox2
+
+         return not (x1_max <= x2_min or x2_max <= x1_min or
+                     y1_max <= y2_min or y2_max <= y1_min)
+
+     def is_element_in_frame(self, element_bbox: List[int], frame_bbox: List[int],
+                             overlap_threshold: float = 0.5) -> bool:
+         """
+         Check if an element (face, body, text) is within or significantly overlaps with a frame.
+
+         Args:
+             element_bbox: [xmin, ymin, xmax, ymax] of the element
+             frame_bbox: [xmin, ymin, xmax, ymax] of the frame
+             overlap_threshold: Minimum overlap ratio to consider element as belonging to frame
+         """
+         if not self.check_overlap(element_bbox, frame_bbox):
+             return False
+
+         # Calculate intersection area
+         x_left = max(element_bbox[0], frame_bbox[0])
+         y_top = max(element_bbox[1], frame_bbox[1])
+         x_right = min(element_bbox[2], frame_bbox[2])
+         y_bottom = min(element_bbox[3], frame_bbox[3])
+
+         if x_right <= x_left or y_bottom <= y_top:
+             return False
+
+         intersection_area = (x_right - x_left) * (y_bottom - y_top)
+         element_area = (element_bbox[2] - element_bbox[0]) * (element_bbox[3] - element_bbox[1])
+
+         # Check if significant portion of element is within frame
+         overlap_ratio = intersection_area / element_area if element_area > 0 else 0
+         return overlap_ratio >= overlap_threshold
+
+     def adjust_coordinates_to_frame(self, element_bbox: List[int], frame_bbox: List[int]) -> List[int]:
+         """Adjust element coordinates to be relative to the cropped frame."""
+         frame_x_min, frame_y_min = frame_bbox[0], frame_bbox[1]
+
+         adjusted_bbox = [
+             element_bbox[0] - frame_x_min,  # xmin
+             element_bbox[1] - frame_y_min,  # ymin
+             element_bbox[2] - frame_x_min,  # xmax
+             element_bbox[3] - frame_y_min   # ymax
+         ]
+
+         # Ensure coordinates are not negative
+         adjusted_bbox = [max(0, coord) for coord in adjusted_bbox]
+
+         return adjusted_bbox
+
+     def extract_panel_from_page(self, image_path: str, frame_data: Dict,
+                                 output_dir: str, page_index: int, frame_index: int) -> Optional[str]:
+         """
+         Extract a single panel (frame) from a manga page image.
+
+         Args:
+             image_path: Path to the manga page image
+             frame_data: Frame annotation with coordinates
+             output_dir: Directory to save extracted panel
+             page_index: Index of the current page
+             frame_index: Index of the frame within the page
+
+         Returns:
+             Path to extracted panel image or None if failed
+         """
+         # print(image_path)
+         # Load the image
+         image = cv2.imread(image_path)
+         if image is None:
+             print(f"Warning: Could not load image {image_path}")
+             return None
+
+         try:
+             # Get frame coordinates
+             xmin = int(frame_data.get('@xmin', 0))
+             ymin = int(frame_data.get('@ymin', 0))
+             xmax = int(frame_data.get('@xmax', image.shape[1]))
+             ymax = int(frame_data.get('@ymax', image.shape[0]))
+
+             # Validate and clip coordinates
+             xmin = max(0, xmin)
+             ymin = max(0, ymin)
+             xmax = min(image.shape[1], xmax)
+             ymax = min(image.shape[0], ymax)
+
+             if xmax <= xmin or ymax <= ymin:
+                 print(f"Warning: Invalid frame coordinates for page {page_index}, frame {frame_index}")
+                 return None
+
+             # Crop the panel
+             cropped_panel = image[ymin:ymax, xmin:xmax]
+
+             # Generate panel filename
+             panel_filename = f"page_{page_index:04d}_panel_{frame_index:03d}.jpg"
+             panel_path = os.path.join(output_dir, panel_filename)
+
+             # Save the cropped panel
+             success = cv2.imwrite(panel_path, cropped_panel)
+             if success:
+                 self.stats['extracted_panels'] += 1
+                 # print(f"Extracted panel: {panel_path}")
+                 return panel_path
+             else:
+                 print(f"Warning: Failed to save panel {panel_path}")
+                 return None
+
+         except Exception as e:
+             print(f"Error extracting panel from page {page_index}, frame {frame_index}: {str(e)}")
+             return None
+
+     def calculate_distance(self, bbox1: List[int], bbox2: List[int]) -> float:
+         """Calculate Euclidean distance between centers of two bounding boxes."""
+         cx1, cy1 = (bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2
+         cx2, cy2 = (bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2
+         return math.sqrt((cx1 - cx2)**2 + (cy1 - cy2)**2)
+
+     def find_character_by_body_id(self, body_id: str, bodies: List[Dict]) -> Optional[str]:
+         """Find character ID associated with a body ID."""
+         for body in bodies:
+             if body.get('@id') == body_id:
+                 return body.get('@character')
+         return None
+
+     def find_face_by_character(self, character_id: str, faces: List[Dict]) -> Optional[int]:
+         """Find face index by character ID."""
+         for i, face in enumerate(faces):
+             if face.get('@character') == character_id:
+                 return i
+         return None
+
+     def find_closest_face(self, bubble_bbox: List[int], faces: List[Dict]) -> Optional[int]:
+         """Find closest face to bubble based on spatial proximity."""
+         if not faces:
+             return None
+
+         min_distance = float('inf')
+         closest_face_idx = None
+
+         for i, face in enumerate(faces):
+             face_bbox = [
+                 face['bbox'][0], face['bbox'][1],
+                 face['bbox'][2], face['bbox'][3]
+             ]
+             distance = self.calculate_distance(bubble_bbox, face_bbox)
+
+             if distance < min_distance and distance <= self.proximity_threshold:
+                 min_distance = distance
+                 closest_face_idx = i
+
+         return closest_face_idx
+
+     def associate_bubble_to_face(self, text_item: Dict, faces: List[Dict],
+                                  bodies: List[Dict], original_bodies: List[Dict]) -> Optional[int]:
+         """Associate a speech bubble to a face using multiple strategies."""
+         bubble_bbox = text_item['bbox']
+
+         # Strategy 1: Direct body ID to character mapping
+         body_id = text_item.get('body_ref')
+         if body_id:
+             character_id = self.find_character_by_body_id(body_id, original_bodies)
+             if character_id:
+                 face_idx = self.find_face_by_character(character_id, faces)
+                 if face_idx is not None:
+                     return face_idx
+
+         # Strategy 2: Spatial proximity - find closest face
1
+ """
2
+ Enhanced Manga Panel Dataset Converter: Extract frames as panels and prepare GCN-ready dataset.
3
+ Crops each frame from manga pages and creates dataset with elements within each panel.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import argparse
9
+ from pathlib import Path
10
+ from typing import Dict, List, Any, Optional, Tuple
11
+ import difflib
12
+ import math
13
+ import cv2
14
+ import numpy as np
15
+
16
+
17
+ class MangaPanelDatasetConverter:
18
+ def __init__(self, fuzzy_mapping: bool = True, proximity_threshold: float = 200.0):
19
+ self.fuzzy_mapping = fuzzy_mapping
20
+ self.proximity_threshold = proximity_threshold
21
+ self.character_mapping = {}
22
+ self.stats = {
23
+ 'total_pages': 0,
24
+ 'total_panels': 0,
25
+ 'total_bubbles': 0,
26
+ 'total_faces': 0,
27
+ 'total_bodies': 0,
28
+ 'successful_links': 0,
29
+ 'failed_links': 0,
30
+ 'unique_characters': set(),
31
+ 'extracted_panels': 0
32
+ }
33
+
34
+ def load_manga_data(self, file_path: str) -> Dict[str, Any]:
35
+ """Load manga data from JSON file."""
36
+ with open(file_path, 'r', encoding='utf-8') as f:
37
+ return json.load(f)
38
+
39
+ def build_character_mapping(self, character_list: List[Dict]) -> Dict[str, str]:
40
+ """Build character ID to name mapping from character list."""
41
+ mapping = {}
42
+ for char in character_list:
43
+ mapping_id = char.get('@id', '')
44
+ char_name = char.get('@name', '')
45
+ if mapping_id and char_name:
46
+ mapping[mapping_id] = char_name
47
+ return mapping
48
+
49
+ def check_overlap(self, bbox1: List[int], bbox2: List[int]) -> bool:
50
+ """Check if two bounding boxes overlap."""
51
+ x1_min, y1_min, x1_max, y1_max = bbox1
52
+ x2_min, y2_min, x2_max, y2_max = bbox2
53
+
54
+ return not (x1_max <= x2_min or x2_max <= x1_min or
55
+ y1_max <= y2_min or y2_max <= y1_min)
56
+
57
+ def is_element_in_frame(self, element_bbox: List[int], frame_bbox: List[int],
58
+ overlap_threshold: float = 0.5) -> bool:
59
+ """
60
+ Check if an element (face, body, text) is within or significantly overlaps with a frame.
61
+
62
+ Args:
63
+ element_bbox: [xmin, ymin, xmax, ymax] of the element
64
+ frame_bbox: [xmin, ymin, xmax, ymax] of the frame
65
+ overlap_threshold: Minimum overlap ratio to consider element as belonging to frame
66
+ """
67
+ if not self.check_overlap(element_bbox, frame_bbox):
68
+ return False
69
+
70
+ # Calculate intersection area
71
+ x_left = max(element_bbox[0], frame_bbox[0])
72
+ y_top = max(element_bbox[1], frame_bbox[1])
73
+ x_right = min(element_bbox[2], frame_bbox[2])
74
+ y_bottom = min(element_bbox[3], frame_bbox[3])
75
+
76
+ if x_right <= x_left or y_bottom <= y_top:
77
+ return False
78
+
79
+ intersection_area = (x_right - x_left) * (y_bottom - y_top)
80
+ element_area = (element_bbox[2] - element_bbox[0]) * (element_bbox[3] - element_bbox[1])
81
+
82
+ # Check if significant portion of element is within frame
83
+ overlap_ratio = intersection_area / element_area if element_area > 0 else 0
84
+ return overlap_ratio >= overlap_threshold
85
+
86
+ def adjust_coordinates_to_frame(self, element_bbox: List[int], frame_bbox: List[int]) -> List[int]:
87
+ """Adjust element coordinates to be relative to the cropped frame."""
88
+ frame_x_min, frame_y_min = frame_bbox[0], frame_bbox[1]
89
+
90
+ adjusted_bbox = [
91
+ element_bbox[0] - frame_x_min, # xmin
92
+ element_bbox[1] - frame_y_min, # ymin
93
+ element_bbox[2] - frame_x_min, # xmax
94
+ element_bbox[3] - frame_y_min # ymax
95
+ ]
96
+
97
+ # Ensure coordinates are not negative
98
+ adjusted_bbox = [max(0, coord) for coord in adjusted_bbox]
99
+
100
+ return adjusted_bbox
101
+
102
+ def extract_panel_from_page(self, image_path: str, frame_data: Dict,
103
+ output_dir: str, page_index: int, frame_index: int) -> Optional[str]:
104
+ """
105
+ Extract a single panel (frame) from manga page image.
106
+
107
+ Args:
108
+ image_path: Path to the manga page image
109
+ frame_data: Frame annotation with coordinates
110
+ output_dir: Directory to save extracted panel
111
+ page_index: Index of the current page
112
+ frame_index: Index of the frame within the page
113
+
114
+ Returns:
115
+ Path to extracted panel image or None if failed
116
+ """
117
+ # print(image_path)
118
+ # Load the image
119
+ image = cv2.imread(image_path)
120
+ if image is None:
121
+ print(f"Warning: Could not load image {image_path}")
122
+ return None
123
+
124
+ try:
125
+ # Get frame coordinates
126
+ xmin = int(frame_data.get('@xmin', 0))
127
+ ymin = int(frame_data.get('@ymin', 0))
128
+ xmax = int(frame_data.get('@xmax', image.shape[1]))
129
+ ymax = int(frame_data.get('@ymax', image.shape[0]))
130
+
131
+ # Validate and clip coordinates
132
+ xmin = max(0, xmin)
133
+ ymin = max(0, ymin)
134
+ xmax = min(image.shape[1], xmax)
135
+ ymax = min(image.shape[0], ymax)
136
+
137
+ if xmax <= xmin or ymax <= ymin:
138
+ print(f"Warning: Invalid frame coordinates for page {page_index}, frame {frame_index}")
139
+ return None
140
+
141
+ # Crop the panel
142
+ cropped_panel = image[ymin:ymax, xmin:xmax]
143
+
144
+ # Generate panel filename
145
+ panel_filename = f"page_{page_index:04d}_panel_{frame_index:03d}.jpg"
146
+ panel_path = os.path.join(output_dir, panel_filename)
147
+
148
+ # Save the cropped panel
149
+ success = cv2.imwrite(panel_path, cropped_panel)
150
+ if success:
151
+ self.stats['extracted_panels'] += 1
152
+ # print(f"Extracted panel: {panel_path}")
153
+ return panel_path
154
+ else:
155
+ print(f"Warning: Failed to save panel {panel_path}")
156
+ return None
157
+
158
+ except Exception as e:
159
+ print(f"Error extracting panel from page {page_index}, frame {frame_index}: {str(e)}")
160
+ return None
161
+
162
+ def calculate_distance(self, bbox1: List[int], bbox2: List[int]) -> float:
163
+ """Calculate Euclidean distance between centers of two bounding boxes."""
164
+ cx1, cy1 = (bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2
165
+ cx2, cy2 = (bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2
166
+ return math.sqrt((cx1 - cx2)**2 + (cy1 - cy2)**2)
167
+
168
+ def find_character_by_body_id(self, body_id: str, bodies: List[Dict]) -> Optional[str]:
169
+ """Find character ID associated with a body ID."""
170
+ for body in bodies:
171
+ if body.get('@id') == body_id:
172
+ return body.get('@character')
173
+ return None
174
+
175
+ def find_face_by_character(self, character_id: str, faces: List[Dict]) -> Optional[int]:
176
+ """Find face index by character ID."""
177
+ for i, face in enumerate(faces):
178
+ if face.get('@character') == character_id:
179
+ return i
180
+ return None
181
+
182
+ def find_closest_face(self, bubble_bbox: List[int], faces: List[Dict]) -> Optional[int]:
183
+ """Find closest face to bubble based on spatial proximity."""
184
+ if not faces:
185
+ return None
186
+
187
+ min_distance = float('inf')
188
+ closest_face_idx = None
189
+
190
+ for i, face in enumerate(faces):
191
+ face_bbox = [
192
+ face['bbox'][0], face['bbox'][1],
193
+ face['bbox'][2], face['bbox'][3]
194
+ ]
195
+ distance = self.calculate_distance(bubble_bbox, face_bbox)
196
+
197
+ if distance < min_distance and distance <= self.proximity_threshold:
198
+ min_distance = distance
199
+ closest_face_idx = i
200
+
201
+ return closest_face_idx
202
+
203
+ def associate_bubble_to_face(self, text_item: Dict, faces: List[Dict],
204
+ bodies: List[Dict], original_bodies: List[Dict]) -> Optional[int]:
205
+ """Associate a speech bubble to a face using multiple strategies."""
206
+ bubble_bbox = text_item['bbox']
207
+
208
+ # Strategy 1: Direct body ID to character mapping
209
+ body_id = text_item.get('body_ref')
210
+ if body_id:
211
+ character_id = self.find_character_by_body_id(body_id, original_bodies)
212
+ if character_id:
213
+ face_idx = self.find_face_by_character(character_id, faces)
214
+ if face_idx is not None:
215
+ return face_idx
216
+
217
+ # Strategy 2: Spatial proximity - find closest face
218
+ closest_face_idx = self.find_closest_face(bubble_bbox, faces)
219
+ if closest_face_idx is not None:
220
+ return closest_face_idx
221
+
222
+ # Strategy 3: Single face fallback
223
+ if len(faces) == 1:
224
+ return 0
225
+
226
+ return None
227
+
228
+ def process_panel(self, page_data: Dict, frame_data: Dict, page_index: int,
229
+ frame_index: int, character_mapping: Dict[str, str],
230
+ image_path: str = None, panels_output_dir: str = None) -> Dict[str, Any]:
231
+ """Process a single panel (frame) and create GCN dataset entry."""
232
+
233
+ # Extract panel image if paths provided
234
+ panel_image_path = None
235
+ if image_path and panels_output_dir:
236
+ panel_image_path = self.extract_panel_from_page(
237
+ image_path, frame_data, panels_output_dir, page_index, frame_index
238
+ )
239
+
240
+ # Get frame bounding box
241
+ frame_bbox = [
242
+ frame_data['@xmin'], frame_data['@ymin'],
243
+ frame_data['@xmax'], frame_data['@ymax']
244
+ ]
245
+
246
+ panel_width = frame_bbox[2] - frame_bbox[0]
247
+ panel_height = frame_bbox[3] - frame_bbox[1]
248
+
249
+ # Find elements within this frame
250
+ panel_faces = []
251
+ panel_bodies = []
252
+ panel_bubbles = []
253
+
254
+ # Process faces
255
+ for face in page_data.get('face', []):
256
+ face_bbox = [face['@xmin'], face['@ymin'], face['@xmax'], face['@ymax']]
257
+ if self.is_element_in_frame(face_bbox, frame_bbox):
258
+ adjusted_bbox = self.adjust_coordinates_to_frame(face_bbox, frame_bbox)
259
+ character_id = face.get('@character', '')
260
+ character_name = character_mapping.get(character_id, character_id)
261
+
262
+ panel_faces.append({
263
+ "face_id": len(panel_faces),
264
+ "bbox": adjusted_bbox,
265
+ "original_id": face.get('@id', ''),
266
+ "attributes": {
267
+ "character_id": character_id,
268
+ "character_name": character_name
269
+ }
270
+ })
271
+
272
+ if character_id:
273
+ self.stats['unique_characters'].add(character_id)
274
+
275
+ # Process bodies
276
+ for body in page_data.get('body', []):
277
+ body_bbox = [body['@xmin'], body['@ymin'], body['@xmax'], body['@ymax']]
278
+ if self.is_element_in_frame(body_bbox, frame_bbox):
279
+ adjusted_bbox = self.adjust_coordinates_to_frame(body_bbox, frame_bbox)
280
+ character_id = body.get('@character', '')
281
+ character_name = character_mapping.get(character_id, character_id)
282
+
283
+ panel_bodies.append({
284
+ "body_id": len(panel_bodies),
285
+ "bbox": adjusted_bbox,
286
+ "original_id": body.get('@id', ''),
287
+ "attributes": {
288
+ "character_id": character_id,
289
+ "character_name": character_name
290
+ }
291
+ })
292
+
293
+ # Process text/speech bubbles
294
+ for text in page_data.get('text', []):
295
+ text_bbox = [text['@xmin'], text['@ymin'], text['@xmax'], text['@ymax']]
296
+ if self.is_element_in_frame(text_bbox, frame_bbox):
297
+ adjusted_bbox = self.adjust_coordinates_to_frame(text_bbox, frame_bbox)
298
+
299
+ panel_bubbles.append({
300
+ "bubble_id": len(panel_bubbles),
301
+ "bbox": adjusted_bbox,
302
+ "text": text.get('#text', ''),
303
+ "original_id": text.get('@id', ''),
304
+ "body_ref": text.get('@bodyid', '')
305
+ })
306
+
307
+ # Create bubble-to-face links
308
+ links = []
309
+ original_bodies = page_data.get('body', [])
310
+
311
+ for bubble in panel_bubbles:
312
+ face_idx = self.associate_bubble_to_face(
313
+ bubble, panel_faces, panel_bodies, original_bodies
314
+ )
315
+ if face_idx is not None:
316
+ links.append({
317
+ "bubble_id": bubble['bubble_id'],
318
+ "face_id": face_idx
319
+ })
320
+ self.stats['successful_links'] += 1
321
+ else:
322
+ self.stats['failed_links'] += 1
323
+
324
+ # Update statistics
325
+ self.stats['total_panels'] += 1
326
+ self.stats['total_bubbles'] += len(panel_bubbles)
327
+ self.stats['total_faces'] += len(panel_faces)
328
+ self.stats['total_bodies'] += len(panel_bodies)
329
+
330
+ # Create panel ID
331
+ panel_id = f"page_{page_index:04d}_panel_{frame_index:03d}"
332
+
333
+ return {
334
+ "panel_id": panel_id,
335
+ "width": panel_width,
336
+ "height": panel_height,
337
+ "panel_image_path": panel_image_path,
338
+ "bubbles": panel_bubbles,
339
+ "faces": panel_faces,
340
+ "bodies": panel_bodies,
341
+ "links": links,
342
+ "metadata": {
343
+ "original_page_index": page_index,
344
+ "original_frame_index": frame_index,
345
+ "original_frame_id": frame_data.get('@id', ''),
346
+ "original_frame_bbox": frame_bbox
347
+ }
348
+ }
349
+
350
+ def convert_dataset(self, input_file: str, output_file: str,
351
+ image_dir: str = None, panels_output_dir: str = None) -> Dict[str, Any]:
352
+ """Convert manga dataset to panel-based GCN format."""
353
+ print(f"Loading manga data from: {input_file}")
354
+ manga_data = self.load_manga_data(input_file)
355
+
356
+ # Extract title and character mapping
357
+ title = manga_data.get('title', 'Unknown')
358
+ character_list = manga_data.get('character', [])
359
+ character_mapping = self.build_character_mapping(character_list)
360
+
361
+ print(f"Title: {title}")
362
+ print(f"Characters found: {len(character_mapping)}")
363
+
364
+ # Create panels output directory if specified
365
+ if panels_output_dir:
366
+ os.makedirs(panels_output_dir, exist_ok=True)
367
+ print(f"Panel extraction enabled. Output directory: {panels_output_dir}")
368
+
369
+ # Process all pages and frames
370
+ pages = manga_data.get('page', [])
371
+ converted_panels = []
372
+
373
+ for page_idx, page_data in enumerate(pages):
374
+ self.stats['total_pages'] += 1
375
+
376
+ # Construct image path if image directory provided
377
+ image_path = os.path.join(image_dir, f"{str(page_data.get('@index', page_idx)).zfill(3)}.jpg") if image_dir else None
378
+
379
+ # Process each frame in the page
380
+ frames = page_data.get('frame', [])
381
+ for frame_idx, frame_data in enumerate(frames):
382
+ panel = self.process_panel(
383
+ page_data, frame_data, page_idx, frame_idx,
384
+ character_mapping, image_path, panels_output_dir
385
+ )
386
+ converted_panels.append(panel)
387
+
388
+ # Create final dataset
389
+ dataset = {
390
+ "title": title,
391
+ "character_mapping": character_mapping,
392
+ "panels": converted_panels,
393
+ "conversion_stats": {
394
+ "total_pages": self.stats['total_pages'],
395
+ "total_panels": self.stats['total_panels'],
396
+ "total_bubbles": self.stats['total_bubbles'],
397
+ "total_faces": self.stats['total_faces'],
398
+ "total_bodies": self.stats['total_bodies'],
399
+ "successful_links": self.stats['successful_links'],
400
+ "failed_links": self.stats['failed_links'],
401
+ "extracted_panels": self.stats['extracted_panels'],
402
+ "unique_characters": len(self.stats['unique_characters']),
403
+ "link_success_rate": (self.stats['successful_links'] /
404
+ max(1, self.stats['total_bubbles'])) * 100
405
+ }
406
+ }
407
+
408
+ # Save converted dataset
409
+ print(f"Saving converted dataset to: {output_file}")
410
+ with open(output_file, 'w', encoding='utf-8') as f:
411
+ json.dump(dataset, f, ensure_ascii=False, indent=2)
412
+
413
+ # Print statistics
414
+ self.print_conversion_stats()
415
+
416
+ return dataset
417
+
418
+ def print_conversion_stats(self):
419
+ """Print conversion statistics."""
420
+ print("\n=== Conversion Statistics ===")
421
+ print(f"Total pages processed: {self.stats['total_pages']}")
422
+ print(f"Total panels extracted: {self.stats['total_panels']}")
423
+ print(f"Total speech bubbles: {self.stats['total_bubbles']}")
424
+ print(f"Total faces: {self.stats['total_faces']}")
425
+ print(f"Total bodies: {self.stats['total_bodies']}")
426
+ print(f"Successful links: {self.stats['successful_links']}")
427
+ print(f"Failed links: {self.stats['failed_links']}")
428
+ print(f"Panel images extracted: {self.stats['extracted_panels']}")
429
+ print(f"Unique characters: {len(self.stats['unique_characters'])}")
430
+ if self.stats['total_bubbles'] > 0:
431
+ success_rate = (self.stats['successful_links'] / self.stats['total_bubbles']) * 100
432
+ print(f"Link success rate: {success_rate:.1f}%")
433
+ print("=" * 30)
434
+
435
+
436
+ def json_to_gcn(config):
437
+ # !python ./frame.py ../json/AkkeraKanjinchou.json ./output2.json --image-dir ../Manga109/images/AkkeraKanjinchou/ --panels-output-dir ./frames1/
438
+ # parser = argparse.ArgumentParser(description='Convert manga dataset to panel-based GCN format')
439
+ # parser.add_argument('input_file', help='Input JSON file with manga annotations')
440
+ # parser.add_argument('output_file', help='Output JSON file for GCN dataset')
441
+ # parser.add_argument('--image-dir', type=str, required=True,
442
+ # help='Directory containing manga page images')
443
+ # parser.add_argument('--panels-output-dir', type=str, required=True,
444
+ # help='Directory to save extracted panel images')
445
+ # parser.add_argument('--fuzzy-mapping', action='store_true', default=True,
446
+ # help='Enable fuzzy ID matching (default: True)')
447
+ # parser.add_argument('--proximity-threshold', type=float, default=200.0,
448
+ # help='Maximum distance for spatial proximity matching (default: 200.0)')
449
+ # parser.add_argument('--overlap-threshold', type=float, default=0.5,
450
+ # help='Minimum overlap ratio for element-frame association (default: 0.5)')
451
+
452
+ # args = parser.parse_args()
453
+ base_dir = config["root"]
454
+ manga109_root_dir = config["manga109_root_dir"]
455
+ root_json = config["root_json"]
456
+
457
+ for book_json in os.listdir(root_json):
458
+ book_name = str(book_json).split(".")[0]
459
+ panel_dir = os.path.join(base_dir,"panels",book_name)
460
+ if not os.path.exists(panel_dir):
461
+ os.makedirs(panel_dir)
462
+
463
+ # Convert dataset
464
+ converter = MangaPanelDatasetConverter(
465
+ # fuzzy_mapping=args.fuzzy_mapping,
466
+ # proximity_threshold=args.proximity_threshold
467
+ )
468
+ input_file = os.path.join(root_json,book_json)
469
+ output_file = os.path.join(base_dir,"panel_data",book_json)
470
+ image_dir = os.path.join(manga109_root_dir,"images",book_name)
471
+ try:
472
+ result = converter.convert_dataset(
473
+ input_file,
474
+ output_file,
475
+ image_dir,
476
+ panel_dir
477
+ )
478
+ print(f"\nConversion completed successfully!")
479
+ print(f"GCN dataset saved to: {output_file}")
480
+ print(f"Panel images saved to: {panel_dir}")
481
+
482
+ except Exception as e:
483
+ print(f"Error during conversion: {str(e)}")
484
+
utils/panel_ordering.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+
3
+ class BoundingBox(object):
4
+ def __init__(self, xmin=None, ymin=None, xmax=None, ymax=None,
5
+ panels=None,
6
+ bbtype=None,
7
+ id_=""):
8
+ self.dict = {
9
+ "@xmin": xmin,
10
+ "@ymin": ymin,
11
+ "@xmax": xmax,
12
+ "@ymax": ymax,
13
+ "@id": id_,
14
+ }
15
+ if panels is None:
16
+ self.panels = [self]
17
+ else:
18
+ self.panels = panels
19
+ self.bbtype = bbtype
20
+
21
+ def init_dict(self, d):
22
+ self.dict = d
23
+ self.dict["@xmin"] = float(self.xmin)
24
+ self.dict["@ymin"] = float(self.ymin)
25
+ self.dict["@xmax"] = float(self.xmax)
26
+ self.dict["@ymax"] = float(self.ymax)
27
+ return self
28
+
29
+ def __getitem__(self, index):
30
+ return self.dict[index]
31
+
32
+ @property
33
+ def xmin(self):
34
+ return self.dict["@xmin"]
35
+
36
+ @property
37
+ def xmax(self):
38
+ return self.dict["@xmax"]
39
+
40
+ @property
41
+ def ymin(self):
42
+ return self.dict["@ymin"]
43
+
44
+ @property
45
+ def ymax(self):
46
+ return self.dict["@ymax"]
47
+
48
+ @property
49
+ def width(self):
50
+ return self.xmax - self.xmin
51
+
52
+ @property
53
+ def height(self):
54
+ return self.ymax - self.ymin
55
+
56
+ @property
57
+ def text(self):
58
+ return self.dict["#text"]
59
+
60
+ @property
61
+ def id(self):
62
+ return self.dict["@id"]
63
+
64
+ @property
65
+ def list(self):
66
+ return [self.xmin, self.ymin, self.xmax, self.ymax]
67
+
68
+ @property
69
+ def is_null(self):
70
+ return self.xmin is None or self.ymin is None or self.xmax is None or self.ymax is None
71
+
72
+ @property
73
+ def area(self):
74
+ if self.xmax is None or self.xmin is None or self.ymax is None or self.ymin is None:
75
+ return 0
76
+ return (self.xmax - self.xmin) * (self.ymax - self.ymin)
77
+
78
+ @property
79
+ def base_panels(self):
80
+ return len(self.panels)
81
+
82
+
83
+ def __getitem__(self, item):
84
+ return self.dict[item]
85
+
86
+ def __add__(self, a):
87
+ assert issubclass(type(a), BoundingBox)
88
+ if a.is_null:
89
+ return self
90
+ elif self.is_null:
91
+ return a
92
+ return BoundingBox(xmin=min(self.xmin, a.xmin),
93
+ ymin=min(self.ymin, a.ymin),
94
+ xmax=max(self.xmax, a.xmax),
95
+ ymax=max(self.ymax, a.ymax),
96
+ panels=self.panels + a.panels)
97
+
98
+ def __mul__(self, a):
99
+ assert issubclass(type(a), BoundingBox)
100
+ bb = BoundingBox(xmin=max(self.xmin, a.xmin),
101
+ ymin=max(self.ymin, a.ymin),
102
+ xmax=min(self.xmax, a.xmax),
103
+ ymax=min(self.ymax, a.ymax),
104
+ panels=self.panels + a.panels)
105
+ if bb.xmin > bb.xmax or bb.ymin > bb.ymax:
106
+ return BoundingBox()
107
+ else:
108
+ return bb
109
+
110
+ def __repr__(self):
111
+ return "<BoundingBox({},{}) {},{},{},{},{}>".format(self.bbtype, self.id, *self.list, self.base_panels)
112
+
113
+
114
+
115
+ def get_pivot_side(zmin, zmax, pivot):
116
+ interception_ratio_threshold = 0.25
117
+ if pivot <= zmin:
118
+ return 1
119
+ elif zmax <= pivot:
120
+ return 0
121
+ else:
122
+ pivot_z_ratio = (pivot - zmin) / (zmax - zmin)
123
+ interception_ratio = min(pivot_z_ratio, 1 - pivot_z_ratio)
124
+
125
+ if interception_ratio > interception_ratio_threshold:
126
+ return -1
127
+ else:
128
+ return 0 if pivot_z_ratio > 0.5 else 1
129
+
130
+
131
+ class BoxSet(set):
132
+ def get_highest_priority_division(self):
133
+ # Horizontal division
134
+ ydivs = sorted([bb.ymin for bb in self] + [bb.ymax for bb in self])
135
+ for pivot in ydivs:
136
+ division = self.get_pivot_division(pivot,
137
+ is_horizontal_division=True)
138
+ if len(division) > 1:
139
+ return division
140
+
141
+ # Vertical division
142
+ xdivs = sorted([bb.xmin for bb in self] + [bb.xmax for bb in self], reverse=True)
143
+ for pivot in xdivs:
144
+ division = self.get_pivot_division(pivot,
145
+ is_horizontal_division=False)
146
+ if len(division) > 1:
147
+ return division
148
+
149
+ # Undividable box set
150
+ return [self]
151
+
152
+ def get_pivot_division(self, pivot, is_horizontal_division):
153
+ divs = [BoxSet(), BoxSet()]
154
+ for bb in self:
155
+ if is_horizontal_division:
156
+ side = get_pivot_side(bb.ymin, bb.ymax, pivot)
157
+ else:
158
+ side = get_pivot_side(-bb.xmax, -bb.xmin, -pivot)
159
+
160
+ if side == -1:
161
+ return [self]
162
+ else:
163
+ divs[side].add(bb)
164
+ if len(divs[0]) == 0 or len(divs[1]) == 0:
165
+ return [self]
166
+ return divs
167
+
168
+ def get_multicut_division(self, cuts):
169
+ curset = self
170
+ cur_division = []
171
+ for cut in cuts:
172
+ pivot, is_horizontal_division = cut
173
+ division = curset.get_pivot_division(pivot, is_horizontal_division)
174
+ if len(division) > 1:
175
+ cur_division.append(division[0])
176
+ curset = division[1]
177
+ if len(cur_division) > 0:
178
+ return cur_division + [curset]
179
+ else:
180
+ return [self]
181
+
182
+ def yield_ordered_bbs(self):
183
+ if len(self) == 0:
184
+ pass
185
+ elif len(self) > 1:
186
+ yield self.sum(), False
187
+ else:
188
+ yield next(iter(self)), True
189
+
190
+ def sum(self):
191
+ if len(self) == 0:
192
+ return BoundingBox()
193
+ else:
194
+ l = list(self)
195
+ return sum(l[1:], l[0])
196
+
197
+ class BoxNode(object):
198
+ def __init__(self, bbset, initial_cuts=None):
199
+ if initial_cuts:
200
+ division = bbset.get_multicut_division(initial_cuts)
201
+ else:
202
+ division = [bbset]
203
+
204
+ if len(division) == 1:
205
+ division = bbset.get_highest_priority_division()
206
+
207
+ isLeaf = len(division) <= 1
208
+ self.division = division if isLeaf else [BoxNode(section) for section in division]
209
+
210
+ def yield_ordered_bbs(self):
211
+ for section in self.division:
212
+ for bb in section.yield_ordered_bbs():
213
+ yield bb
214
+
215
+ class BoxOrderEstimator(object):
216
+ def __init__(self, bbs, pagewidth=None, initial_cut_option=None):
217
+ if initial_cut_option == "two-page-four-panel":
218
+ initial_cuts = [(pagewidth * n / 4, False)
219
+ for n in reversed(range(1, 4))]
220
+ elif initial_cut_option == "two-page":
221
+ initial_cuts = [(pagewidth / 2, False)]
222
+ else:
223
+ initial_cuts = None
224
+
225
+ self.boxnode = BoxNode(BoxSet(bbs), initial_cuts)
226
+ t = tuple(zip(*self.boxnode.yield_ordered_bbs()))
227
+ if len(t) > 0:
228
+ self.ordered_bbs, self.bb_estimation_statuses = t
229
+ else:
230
+ self.ordered_bbs, self.bb_estimation_statuses = (), ()
231
+
232
+
233
+ def panel_ordering(test_image,dets):
234
+ image = cv2.imread(test_image)
235
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for display
236
+ # interception_ratio_threshold = 0.25
237
+
238
+ # print(predictions_frame)
239
+
240
+ panels = set()
241
+ # for panel in dets['frame']:
242
+ for panel in dets.panels:
243
+ panel = panel['bbox']
244
+ panels.add(BoundingBox(panel[0],panel[1],panel[2],panel[3]))
245
+ # print(panels)
246
+
247
+ # image = page.get_image()
248
+ # page width/height are taken from image.shape below (cv2.imread returns a NumPy array)
249
+ pageheight, pagewidth, pagechannels = image.shape
250
+
251
+ # panels = page.get_bbs()["frame"]
252
+ # print(panels)
253
+ boxOrderEstimator = BoxOrderEstimator(
254
+ panels,
255
+ pagewidth=pagewidth,
256
+ initial_cut_option="two-page")
257
+
258
+ return boxOrderEstimator
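A minimal usage sketch of the ordering classes above, assuming this module is importable as utils.panel_ordering; the three panel boxes are invented. Horizontal pivots are tried first and vertical pivots are scanned right to left, which reproduces the usual right-to-left manga reading order.

from utils.panel_ordering import BoundingBox, BoxOrderEstimator

# Three hypothetical panels on an 800-px-wide page: a full-width top row,
# then a bottom row that is read right-to-left.
panels = {
    BoundingBox(0, 0, 800, 300),      # top
    BoundingBox(400, 300, 800, 600),  # bottom-right (read first in the bottom row)
    BoundingBox(0, 300, 400, 600),    # bottom-left
}

est = BoxOrderEstimator(panels, pagewidth=800, initial_cut_option=None)
for order, bb in enumerate(est.ordered_bbs, start=1):
    print(order, bb.list)   # expected order: top, bottom-right, bottom-left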
utils/train_speaker.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Complete GCN Training Pipeline for Speech Bubble to Speaker Association
4
+ Fixed version that handles the dataset format correctly and resolves training issues.
5
+ """
6
+
7
+ import json
8
+ import torch
9
+ import numpy as np
10
+ import random
11
+ from torch_geometric.data import HeteroData, Batch
12
+ import torch.nn as nn
13
+ from scipy.optimize import linear_sum_assignment
14
+ from typing import Dict, List, Any, Optional, Tuple
15
+ # from utils.utilities import save_checkpoint
16
+ import os
17
+
18
+ from pathlib import Path
19
+ CHECKPOINT_DIR = Path("checkpoints")
20
+ CHECKPOINT_DIR.mkdir(exist_ok=True)
21
+
22
+ def save_checkpoint(model: torch.nn.Module,
23
+ epoch: int,
24
+ loss: float,
25
+ path: Path = CHECKPOINT_DIR / "assoc_gcn.pt") -> None:
26
+ """
27
+ Persist full training state so you can resume fine-tuning later.
28
+ """
29
+ path = Path(path)
30
+ torch.save({
31
+ "epoch": epoch,
32
+ "loss": loss,
33
+ "model_state": model.state_dict()
34
+ }, path)
35
+ print(f"✅ Model checkpoint saved to {path.resolve()}")
36
+
37
+ class DatasetLoader:
38
+ """Handles loading and preprocessing of the converted GCN dataset"""
39
+
40
+ @staticmethod
41
+ def load_converted_dataset(json_path: str) -> List[HeteroData]:
42
+ """Load the converted GCN dataset and create PyTorch Geometric HeteroData objects"""
43
+ with open(json_path, 'r', encoding='utf-8') as f:
44
+ data = json.load(f)
45
+
46
+ dataset = []
47
+ panels = data.get('panels', [])
48
+
49
+ for panel in panels:
50
+ het_data = DatasetLoader.create_hetero_data_from_panel(panel)
51
+ if het_data is not None:
52
+ dataset.append(het_data)
53
+
54
+ print(f"Loaded {len(dataset)} panels from {json_path}")
55
+ return dataset
56
+
57
+ @staticmethod
58
+ def create_hetero_data_from_panel(panel: Dict) -> Optional[HeteroData]:
59
+ """Convert a single panel from the converted dataset into HeteroData format"""
60
+ bubbles = panel.get('bubbles', [])
61
+ faces = panel.get('faces', [])
62
+ links = panel.get('links', [])
63
+
64
+ if len(bubbles) == 0 or len(faces) == 0:
65
+ return None
66
+
67
+ W, H = panel['width'], panel['height']
68
+
69
+ # Create node features
70
+ bubble_features = []
71
+ for bubble in bubbles:
72
+ x1, y1, x2, y2 = bubble['bbox']
73
+ cx, cy = (x1 + x2) / (2 * W), (y1 + y2) / (2 * H)
74
+ w, h = (x2 - x1) / W, (y2 - y1) / H
75
+ area = w * h
76
+ aspect = w / h if h > 0 else 1.0
77
+ bubble_features.append([cx, cy, w, h, area, aspect])
78
+
79
+ face_features = []
80
+ for face in faces:
81
+ x1, y1, x2, y2 = face['bbox']
82
+ cx, cy = (x1 + x2) / (2 * W), (y1 + y2) / (2 * H)
83
+ w, h = (x2 - x1) / W, (y2 - y1) / H
84
+ area = w * h
85
+ aspect = w / h if h > 0 else 1.0
86
+ face_features.append([cx, cy, w, h, area, aspect])
87
+
88
+ # Create edge indices and features
89
+ edge_indices, edge_features, edge_labels = [], [], []
90
+
91
+ # Create mappings
92
+ bubble_id_to_idx = {bubble['bubble_id']: i for i, bubble in enumerate(bubbles)}
93
+ face_id_to_idx = {face['face_id']: i for i, face in enumerate(faces)}
94
+
95
+ # Create ground truth mapping
96
+ gt_links = {}
97
+ for link in links:
98
+ if link['bubble_id'] in bubble_id_to_idx and link['face_id'] in face_id_to_idx:
99
+ bubble_idx = bubble_id_to_idx[link['bubble_id']]
100
+ face_idx = face_id_to_idx[link['face_id']]
101
+ gt_links[(bubble_idx, face_idx)] = 1
102
+
103
+ # Create all possible bubble-face edges
104
+ for i, bubble in enumerate(bubbles):
105
+ for j, face in enumerate(faces):
106
+ # Calculate edge features
107
+ b_x1, b_y1, b_x2, b_y2 = bubble['bbox']
108
+ f_x1, f_y1, f_x2, f_y2 = face['bbox']
109
+
110
+ b_cx, b_cy = (b_x1 + b_x2) / (2 * W), (b_y1 + b_y2) / (2 * H)
111
+ f_cx, f_cy = (f_x1 + f_x2) / (2 * W), (f_y1 + f_y2) / (2 * H)
112
+
113
+ dx, dy = b_cx - f_cx, b_cy - f_cy
114
+ dist = (dx**2 + dy**2)**0.5
115
+
116
+ # Calculate IoU
117
+ xx1, yy1 = max(b_x1, f_x1), max(b_y1, f_y1)
118
+ xx2, yy2 = min(b_x2, f_x2), min(b_y2, f_y2)
119
+ inter = max(0, xx2 - xx1) * max(0, yy2 - yy1)
120
+ union = (b_x2 - b_x1) * (b_y2 - b_y1) + (f_x2 - f_x1) * (f_y2 - f_y1) - inter
121
+ iou = inter / union if union > 0 else 0
122
+
123
+ edge_indices.append([i, j])
124
+ edge_features.append([dx, dy, dist, iou])
125
+ edge_labels.append(1.0 if (i, j) in gt_links else 0.0)
126
+
127
+ if len(edge_indices) == 0:
128
+ return None
129
+
130
+ # Create HeteroData object
131
+ data = HeteroData()
132
+ data['bubble'].x = torch.tensor(bubble_features, dtype=torch.float)
133
+ data['face'].x = torch.tensor(face_features, dtype=torch.float)
134
+ data['bubble', 'to', 'face'].edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
135
+ data['bubble', 'to', 'face'].edge_attr = torch.tensor(edge_features, dtype=torch.float)
136
+ data['bubble', 'to', 'face'].edge_label = torch.tensor(edge_labels, dtype=torch.float)
137
+
138
+ # Add metadata
139
+ data.panel_id = panel['panel_id']
140
+ data.width = W
141
+ data.height = H
142
+
143
+ return data
144
+
145
+
146
+ class AssocGCN(nn.Module):
147
+ """Graph Convolutional Network for Speech Bubble to Speaker Association"""
148
+
149
+ def __init__(self, in_feats: int = 6, hid: int = 128):
150
+ super().__init__()
151
+ self.node_encoder = nn.Sequential(
152
+ nn.Linear(in_feats, hid),
153
+ nn.ReLU(),
154
+ nn.Linear(hid, hid)
155
+ )
156
+
157
+ # Message passing layers
158
+ self.conv1 = nn.Sequential(
159
+ nn.Linear(hid * 2 + 4, hid), # node features + edge features
160
+ nn.ReLU(),
161
+ nn.Linear(hid, hid)
162
+ )
163
+
164
+ self.conv2 = nn.Sequential(
165
+ nn.Linear(hid * 2 + 4, hid),
166
+ nn.ReLU(),
167
+ nn.Linear(hid, hid)
168
+ )
169
+
170
+ self.conv3 = nn.Sequential(
171
+ nn.Linear(hid * 2 + 4, hid),
172
+ nn.ReLU(),
173
+ nn.Linear(hid, hid)
174
+ )
175
+
176
+ # Edge classifier
177
+ self.edge_mlp = nn.Sequential(
178
+ nn.Linear(2 * hid + 4, hid),
179
+ nn.ReLU(),
180
+ nn.Dropout(0.1),
181
+ nn.Linear(hid, 1)
182
+ )
183
+
184
+ def forward(self, data):
185
+ bubble_x = self.node_encoder(data['bubble'].x)
186
+ face_x = self.node_encoder(data['face'].x)
187
+
188
+ edge_index = data['bubble', 'to', 'face'].edge_index
189
+ edge_attr = data['bubble', 'to', 'face'].edge_attr
190
+ src_idx, dst_idx = edge_index[0], edge_index[1]
191
+
192
+ # Apply message passing
193
+ for conv in [self.conv1, self.conv2, self.conv3]:
194
+ src_features = bubble_x[src_idx]
195
+ dst_features = face_x[dst_idx]
196
+
197
+ edge_input = torch.cat([src_features, dst_features, edge_attr], dim=1)
198
+ edge_updates = conv(edge_input)
199
+
200
+ # Update node features (simplified aggregation)
201
+ bubble_updates = torch.zeros_like(bubble_x)
202
+ face_updates = torch.zeros_like(face_x)
203
+
204
+ for i in range(len(src_idx)):
205
+ s, d = src_idx[i].item(), dst_idx[i].item()
206
+ bubble_updates[s] += edge_updates[i]
207
+ face_updates[d] += edge_updates[i]
208
+
209
+ # Normalize by degree
210
+ bubble_degrees = torch.bincount(src_idx, minlength=bubble_x.size(0)).float().clamp(min=1)
211
+ face_degrees = torch.bincount(dst_idx, minlength=face_x.size(0)).float().clamp(min=1)
212
+
213
+ bubble_updates = bubble_updates / bubble_degrees.unsqueeze(1)
214
+ face_updates = face_updates / face_degrees.unsqueeze(1)
215
+
216
+ # Residual connection
217
+ bubble_x = bubble_x + bubble_updates
218
+ face_x = face_x + face_updates
219
+
220
+ # Final edge prediction
221
+ src_final = bubble_x[src_idx]
222
+ dst_final = face_x[dst_idx]
223
+ edge_input = torch.cat([src_final, dst_final, edge_attr], dim=1)
224
+ logits = self.edge_mlp(edge_input).squeeze(-1)
225
+
226
+ return logits
227
+
228
+
229
+ def hungarian_matching(scores: torch.Tensor, src_indices, dst_indices):
230
+ """Apply Hungarian algorithm for optimal bipartite matching"""
231
+ if len(scores) == 0:
232
+ return {}
233
+
234
+ num_bubbles = src_indices.max().item() + 1 if len(src_indices) > 0 else 0
235
+ num_faces = dst_indices.max().item() + 1 if len(dst_indices) > 0 else 0
236
+
237
+ cost_matrix = np.full((num_bubbles, num_faces), 1e6, dtype=np.float32)
238
+
239
+ scores_np = scores.detach().cpu().sigmoid().numpy()
240
+ for i, (s, d, score) in enumerate(zip(src_indices.cpu(), dst_indices.cpu(), scores_np)):
241
+ cost_matrix[s, d] = -score # Negative for minimization
242
+
243
+ row_indices, col_indices = linear_sum_assignment(cost_matrix)
244
+
245
+ mapping = {}
246
+ for r, c in zip(row_indices, col_indices):
247
+ if cost_matrix[r, c] < 0: # Valid assignment
248
+ mapping[int(r)] = int(c)
249
+
250
+ return mapping
251
+
252
+
253
+ def train_gcn(dataset: List[HeteroData], epochs: int = 200, batch_size: int = 16, lr: float = 1e-4):
254
+ """Train the GCN model on the dataset"""
255
+ if len(dataset) == 0:
256
+ raise ValueError("Dataset is empty!")
257
+
258
+ print(f"Training on {len(dataset)} panels...")
259
+
260
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
261
+ model = AssocGCN().to(device)
262
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
263
+
264
+ # Calculate class weights
265
+ total_positive = sum(data['bubble', 'to', 'face'].edge_label.sum().item() for data in dataset)
266
+ total_edges = sum(len(data['bubble', 'to', 'face'].edge_label) for data in dataset)
267
+ pos_weight = (total_edges - total_positive) / total_positive if total_positive > 0 else 9.0
268
+
269
+ print(f"Positive edges: {total_positive}/{total_edges} ({100*total_positive/total_edges:.1f}%)")
270
+ print(f"Using pos_weight: {pos_weight:.2f}")
271
+
272
+ loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))
273
+
274
+ model.train()
275
+ best_loss = float("inf")
276
+ for epoch in range(epochs):
277
+ total_correct = 0
278
+ total_samples = 0
279
+ total_tp = 0
280
+ total_fp = 0
281
+ total_fn = 0
282
+ # FIXED: Properly shuffle the dataset (it's a list, not a dict)
283
+ shuffled_dataset = dataset.copy()
284
+ random.shuffle(shuffled_dataset)
285
+
286
+ total_loss = 0.0
287
+ num_batches = (len(shuffled_dataset) + batch_size - 1) // batch_size
288
+
289
+ for batch_idx in range(num_batches):
290
+ start_idx = batch_idx * batch_size
291
+ end_idx = min(start_idx + batch_size, len(shuffled_dataset))
292
+ batch_data = shuffled_dataset[start_idx:end_idx]
293
+
294
+ # Create batch
295
+ batch = Batch.from_data_list(batch_data).to(device)
296
+
297
+ # Forward pass
298
+ logits = model(batch)
299
+ labels = batch['bubble', 'to', 'face'].edge_label
300
+
301
+ # Compute loss
302
+ loss = loss_fn(logits, labels)
303
+
304
+ # Backward pass
305
+ optimizer.zero_grad()
306
+ loss.backward()
307
+ optimizer.step()
308
+
309
+ total_loss += loss.item() * len(batch_data)
310
+
311
+ # Accuracy and confusion counts (TP/FP/FN) for the epoch metrics
312
+ probs = torch.sigmoid(logits)
313
+ preds = (probs > 0.5).float()
314
+ correct = (preds == labels).sum().item()
315
+ total_correct += correct
316
+ total_samples += labels.numel()
317
+ total_tp += ((preds == 1) & (labels == 1)).sum().item()
318
+ total_fp += ((preds == 1) & (labels == 0)).sum().item()
319
+ total_fn += ((preds == 0) & (labels == 1)).sum().item()
320
+
321
+ avg_loss = total_loss / len(shuffled_dataset)
322
+ accuracy = total_correct / total_samples
323
+ if avg_loss < best_loss:
324
+ best_loss = avg_loss
325
+ save_checkpoint(model, epoch+1, best_loss) # epoch is 0-indexed
326
+
327
+ # Compute recall and F1
328
+ recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
329
+ precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
330
+ f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
331
+
332
+ print(f"Epoch {epoch+1:02d}/{epochs}: Loss = {avg_loss:.4f}, Accuracy = {accuracy:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")
333
+
334
+
335
+ print("Training completed!")
336
+ return model
337
+
338
+
339
+ def infer_associations(model, data):
340
+ """Infer speech bubble to speaker associations"""
341
+ device = next(model.parameters()).device
342
+ data = data.to(device)
343
+
344
+ model.eval()
345
+ with torch.no_grad():
346
+ logits = model(data)
347
+ src, dst = data['bubble', 'to', 'face'].edge_index
348
+ mapping = hungarian_matching(logits, src, dst)
349
+
350
+ return mapping
351
+
352
+
353
+ # Example usage and testing
354
+ def train_speaker(config):
355
+ # Test with sample data
356
+ dataset = []
357
+ for panel_data_file in os.listdir(os.path.join(config["root"]+"panel_data/")):
358
+ try:
359
+ print(panel_data_file)
360
+ # Load your converted dataset
361
+ dataset += DatasetLoader.load_converted_dataset(os.path.join(config["root"]+"panel_data/",panel_data_file))
362
+
363
+ # if len(dataset) == 0:
364
+ # print("No valid panels found in dataset!")
365
+ # else:
366
+ # # Train the model
367
+ # model = train_gcn(dataset, epochs=10, batch_size=16) # Reduced epochs for testing
368
+
369
+ # # Test inference on first panel
370
+ # test_data = dataset[0]
371
+ # print(test_data)
372
+ # mapping = infer_associations(model, test_data)
373
+
374
+ # print("\nInference Results:")
375
+ # for bubble_id, face_id in mapping.items():
376
+ # print(f"Bubble {bubble_id} → Face {face_id}")
377
+
378
+ except FileNotFoundError:
379
+ print("Error: ./output.json not found!")
380
+ print("Please ensure your converted dataset file exists.")
381
+ except Exception as e:
382
+ print(f"Error: {e}")
383
+ print("Please check your dataset format and file paths.")
384
+
385
+ model = train_gcn(dataset, epochs=30, batch_size=16) # Reduced epochs for testing
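A minimal sketch of how the pieces above are meant to be driven outside of train_speaker(), assuming a converted panel-level JSON (as written by json_to_gcn) exists at the hypothetical path ./panel_data/Book.json:

from utils.train_speaker import DatasetLoader, train_gcn, infer_associations

panels = DatasetLoader.load_converted_dataset("./panel_data/Book.json")  # hypothetical file
if panels:
    model = train_gcn(panels, epochs=5, batch_size=8)   # short run for illustration
    mapping = infer_associations(model, panels[0])      # {bubble_index: face_index} via Hungarian matching
    for bubble_idx, face_idx in mapping.items():
        print(f"bubble {bubble_idx} -> face {face_idx}")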
utils/transcript_page.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.japanese_ocr import MangaOcr
2
+ import cv2
3
+ from PIL import Image
4
+
5
+ def transcript_page(test_image,translator,predictions):
6
+ image = cv2.imread(test_image)
7
+ mocr = MangaOcr()
8
+ # for bubble in predictions.speech_bubbles:
9
+ # xmin,ymin,xmax,ymax = bubble['bbox']
10
+ # text = mocr(Image.fromarray(cv2.cvtColor(image[ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB)))
11
+ # print(text)
12
+ # result = translator(text)
13
+ # print(result[0]['translation_text'])
14
+
15
+ # for panel in predictions.panels:
16
+ # print(panel)
17
+ japanese_transcript = []
18
+ english_transcript = []
19
+ for i in range(len(predictions.panels)):
20
+ # print("ver",i)
21
+ cur_panel = [b for b in predictions.panels if b['order'] == i]
22
+ # print("ver",cur_panel)
23
+ if(len(cur_panel) > 0):
24
+ # If subpanels exist, the order index range can exceed the actual number of detected panels, so some i have no matching panel
25
+ panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == cur_panel[0]['id']]
26
+ panel_bubbles.sort(key=lambda b: b['seq'])
27
+ for bubble in panel_bubbles:
28
+ xmin,ymin,xmax,ymax = bubble['bbox']
29
+ char_id = bubble['char_id'] if 'char_id' in bubble.keys() else None
30
+ text = mocr(Image.fromarray(cv2.cvtColor(image[ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB)))
31
+ print("Character ",char_id," : ",text)
32
+ japanese_transcript.append({"char_id":char_id,"text":text})
33
+ result = translator(text)
34
+ print("Character ",char_id," : ",result[0]['translation_text'])
35
+ english_transcript.append({"char_id":char_id,"text":result[0]['translation_text']})
36
+
37
+ return japanese_transcript,english_transcript
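transcript_page expects a callable translator that returns a list with a translation_text field, which is what a Hugging Face translation pipeline produces. A minimal sketch, with the model name, the page path, and the detection stand-in all assumed rather than taken from the repo:

from types import SimpleNamespace
from transformers import pipeline
from utils.transcript_page import transcript_page

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en")  # assumed ja->en model

# Stand-in for the detection output: one ordered panel holding a single bubble (coordinates invented).
predictions = SimpleNamespace(
    panels=[{"id": 0, "order": 0, "bbox": [0, 0, 800, 600]}],
    speech_bubbles=[{"panel_id": 0, "seq": 0, "bbox": [50, 40, 200, 160]}],
)

ja, en = transcript_page("page.jpg", translator, predictions)  # page.jpg is a placeholder path
for line in en:
    print(line["char_id"], ":", line["text"])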
utils/utilities.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ✨ utils.py ── reusable helpers
2
+ import torch
3
+ from pathlib import Path
4
+ from typing import List, Dict
5
+ import cv2
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import matplotlib.patches as patches
10
+ import cv2
11
+ from utils.identify_speaker import DetectionPredictions
12
+
13
+
14
+
15
+ def load_model(path: Path,
16
+ device: str = "cpu") -> "AssocGCN":
17
+ """
18
+ Re-create the network and load the saved weights.
19
+ """
20
+ from train_speaker import AssocGCN # import lazily to avoid circular ref
21
+ ckpt = torch.load(path, map_location=device)
22
+ model = AssocGCN().to(device)
23
+ model.load_state_dict(ckpt["model_state"])
24
+ model.eval() # switch to inference mode
25
+ print(f"🔄 Loaded checkpoint from {path} (epoch {ckpt['epoch']})")
26
+ return model
27
+
28
+ def show_prediction_per_page(img_path,dets):
29
+ # %matplotlib inline
30
+ # Load your manga panel (image)
31
+ image = cv2.imread(img_path)
32
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for display
33
+
34
+ # Draw bounding boxes for body
35
+ for box in dets.bodies:
36
+ x_min, y_min, x_max, y_max = box['bbox']
37
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(255, 0, 0), thickness=2)
38
+
39
+ # Draw bounding boxes for face
40
+ for box in dets.faces:
41
+ x_min, y_min, x_max, y_max = box['bbox']
42
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)
43
+
44
+ # Draw bounding boxes for body
45
+ for box in dets.panels:
46
+ x_min, y_min, x_max, y_max = box['bbox']
47
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 0, 255), thickness=2)
48
+
49
+ # Draw bounding boxes for body
50
+ for box in dets.speech_bubbles:
51
+ x_min, y_min, x_max, y_max = box['bbox']
52
+ cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(255, 255, 0), thickness=2)
53
+
54
+ # Display the image
55
+ plt.imshow(image)
56
+ plt.axis('off')
57
+ plt.title('Manga Panel with Bounding Boxes')
58
+ plt.show()
59
+
60
+ def show_panel_order(img_path,boxOrderEstimator):
61
+ image = cv2.imread(img_path)
62
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for display
63
+
64
+ def show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
65
+ fontsize = 12 # reduce fontsize for memory efficiency
66
+ if ax is None:
67
+ ax = plt.gca()
68
+
69
+ rect = patches.Rectangle(
70
+ (panel.xmin, panel.ymin),
71
+ panel.width,
72
+ panel.height,
73
+ linewidth=2,
74
+ linestyle=linestyle,
75
+ edgecolor=edgecolor,
76
+ facecolor="none"
77
+ )
78
+ ax.add_patch(rect)
79
+
80
+ g_sub_bb_x = panel.xmin + panel.width / 2
81
+ g_sub_bb_y = panel.ymin + panel.height / 2
82
+ ax.text(g_sub_bb_x, g_sub_bb_y, labeltext,
83
+ fontsize=fontsize, color=edgecolor,
84
+ ha='center', va='center')
85
+
86
+
87
+ fig, ax = plt.subplots(figsize=(10, 7)) # This avoids multiple axes
88
+ # Show the image
89
+ if len(np.array(image).shape) == 2:
90
+ ax.imshow(image, cmap="gray")
91
+ else:
92
+ ax.imshow(image)
93
+
94
+ # Draw bounding boxes
95
+ for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
96
+ # print(i_panel,panel)
97
+ order = i_panel + 1
98
+ if len(panel.panels) == 1:
99
+ show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
100
+ else:
101
+ for subpanel in panel.panels:
102
+ show_panel(subpanel, edgecolor="orange", linestyle="dotted", labeltext=f"({order})", ax=ax)
103
+
104
+ plt.axis('off') # cleaner output
105
+ plt.tight_layout()
106
+ plt.show()
107
+
108
+
109
+ # ============================================================================
110
+ # VISUALIZATION
111
+ # ============================================================================
112
+
113
+ def visualize_speaker_results(image_path: str, predictions: DetectionPredictions,
114
+ associations: Dict[str, Dict[int, int]], output_path: str = None):
115
+ """
116
+ Visualize detection results and speech-to-speaker associations
117
+ """
118
+ img = cv2.imread(image_path)
119
+ if img is None:
120
+ print(f"❌ Could not load image for visualization: {image_path}")
121
+ return
122
+
123
+ # Colors for different elements (BGR format for OpenCV)
124
+ colors = {
125
+ 'panel': (255, 0, 0), # Blue
126
+ 'bubble': (0, 255, 0), # Green
127
+ 'face': (0, 0, 255), # Red
128
+ 'body': (255, 255, 0), # Cyan
129
+ 'association': (255, 0, 255) # Magenta
130
+ }
131
+
132
+ # Draw panels/frames
133
+ for panel in predictions.panels:
134
+ bbox = [int(x) for x in panel['bbox']]
135
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
136
+ colors['panel'], 2)
137
+ cv2.putText(img, f"Panel {panel['id']}", (bbox[0], bbox[1]-10),
138
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, colors['panel'], 2)
139
+
140
+ # Draw speech bubbles/text
141
+ for bubble in predictions.speech_bubbles:
142
+ bbox = [int(x) for x in bubble['bbox']]
143
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
144
+ colors['bubble'], 2)
145
+ cv2.putText(img, f"T{bubble['id']}", (bbox[0], bbox[1]-5),
146
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['bubble'], 2)
147
+
148
+ # Draw faces
149
+ for face in predictions.faces:
150
+ bbox = [int(x) for x in face['bbox']]
151
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
152
+ colors['face'], 2)
153
+ cv2.putText(img, f"F{face['id']}", (bbox[0], bbox[1]-5),
154
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['face'], 2)
155
+
156
+ # Draw bodies (optional)
157
+ for body in predictions.bodies:
158
+ bbox = [int(x) for x in body['bbox']]
159
+ cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
160
+ colors['body'], 1)
161
+ cv2.putText(img, f"B{body['id']}", (bbox[0], bbox[1]-5),
162
+ cv2.FONT_HERSHEY_SIMPLEX, 0.4, colors['body'], 1)
163
+
164
+ # Draw associations
165
+ for panel_id, mapping in associations.items():
166
+ # Extract panel number from panel_id string
167
+ try:
168
+ panel_num = int(panel_id.split('_')[-1])
169
+ except:
170
+ panel_num = 0
171
+
172
+ panel_bubbles = [b for b in predictions.speech_bubbles
173
+ if b['panel_id'] == panel_num]
174
+ panel_faces = [f for f in predictions.faces
175
+ if f['panel_id'] == panel_num]
176
+
177
+ for bubble_idx, face_idx in mapping.items():
178
+ if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
179
+ b_bbox = panel_bubbles[bubble_idx]['bbox']
180
+ f_bbox = panel_faces[face_idx]['bbox']
181
+
182
+ b_center = (int((b_bbox[0] + b_bbox[2]) / 2),
183
+ int((b_bbox[1] + b_bbox[3]) / 2))
184
+ f_center = (int((f_bbox[0] + f_bbox[2]) / 2),
185
+ int((f_bbox[1] + f_bbox[3]) / 2))
186
+
187
+ # Draw association line
188
+ cv2.line(img, b_center, f_center, colors['association'], 3)
189
+ cv2.circle(img, b_center, 5, colors['association'], -1)
190
+ cv2.circle(img, f_center, 5, colors['association'], -1)
191
+
192
+ # Add association number
193
+ mid_point = (int((b_center[0] + f_center[0]) / 2),
194
+ int((b_center[1] + f_center[1]) / 2))
195
+ cv2.putText(img, f"{bubble_idx}→{face_idx}", mid_point,
196
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['association'], 2)
197
+
198
+ # Save or display
199
+ if output_path:
200
+ cv2.imwrite(output_path, img)
201
+ print(f"📷 Visualization saved to {output_path}")
202
+ else:
203
+ # cv2.imshow("Speech-to-Speaker Associations", img)
204
+ # cv2.waitKey(0)
205
+ # cv2.destroyAllWindows()
206
+ # Display the image
207
+ plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # convert BGR to RGB so matplotlib shows correct colors
208
+ plt.axis('off')
209
+ plt.title('Speech-to-Speaker Associations')
210
+ plt.show()
211
+
212
+ def process_results(predictions,boxOrderEstimator,pairs,labels,results):
213
+ # Assign order to each panel
214
+ for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
215
+ x1 = panel.xmin
216
+ y1 = panel.ymin
217
+ x2 = panel.xmax
218
+ y2 = panel.ymax
219
+ for pred_panel in predictions.panels:
220
+ x3,y3,x4,y4 = pred_panel['bbox']
221
+ # print(x1 == x3 , y1 == y3 , x2 == x4 , y2 == y4,x1,y1,x2,y2,x3,y3,x4,y4)
222
+ if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
223
+ # print("ok")
224
+ pred_panel['order'] = i_panel
225
+
226
+ if len(panel.panels) > 1:
227
+ for subpanel in panel.panels:
228
+ x1 = subpanel.xmin
229
+ y1 = subpanel.ymin
230
+ x2 = subpanel.xmax
231
+ y2 = subpanel.ymax
232
+ for pred_panel in predictions.panels:
233
+ x3,y3,x4,y4 = pred_panel['bbox']
234
+ # print(x1 == x3 , y1 == y3 , x2 == x4 , y2 == y4,x1,y1,x2,y2,x3,y3,x4,y4)
235
+ if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
236
+ # print("ok")
237
+ pred_panel['order'] = i_panel
238
+
239
+ # Assign character id to each face
240
+ for i,(face,body) in enumerate(pairs):
241
+ # print(i,face,body)
242
+ x1,y1,x2,y2 = face
243
+ for pred_panel in predictions.faces:
244
+ x3,y3,x4,y4 = pred_panel['bbox']
245
+ # print(x1 == x3 , y1 == y3 , x2 == x4 , y2 == y4,x1,y1,x2,y2,x3,y3,x4,y4)
246
+ if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
247
+ # print("ok")
248
+ pred_panel['char_id'] = int(labels[i])
249
+
250
+ # Assign character id to each text
251
+ for panel_id, mapping in results.items():
252
+ # print(panel_id,mapping)
253
+ try:
254
+ panel_num = int(panel_id.split('_')[-1])
255
+ except:
256
+ panel_num = 0
257
+
258
+ panel_bubbles = [b for b in predictions.speech_bubbles
259
+ if b['panel_id'] == panel_num]
260
+ panel_faces = [f for f in predictions.faces
261
+ if f['panel_id'] == panel_num]
262
+
263
+ # print("aaa",panel_num,panel_bubbles,panel_faces)
264
+ for bubble_idx, face_idx in mapping.items():
265
+ if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
266
+ bubble_id = panel_bubbles[bubble_idx]['id']
267
+ face_id = panel_faces[face_idx]['id']
268
+ # print("bbb",bubble_id,face_id)
269
+
270
+ # Safely add char_id to the bubble by matching IDs
271
+ # (in case panel_bubbles and predictions.speech_bubbles are different lists)
272
+ for b in predictions.speech_bubbles:
273
+ if b['id'] == bubble_id:
274
+ for f in predictions.faces:
275
+ if f['id'] == face_id:
276
+ b['char_id'] = f.get('char_id', None)
277
+ break # Stop after finding the bubble
278
+
279
+ return predictions
280
+
281
+ def get_speakers(gender_list):
282
+ speakers = [['Alexandra_Hisakawa','f','o'],['Annmarie_Nele','f','o'],['Luis_Moray','m','y'],['Baldur_Sanjin','m','o'],['Ige_Behringer','m','o'],['Eugenio_Mataracı','m','y'],['Camilla_Holmström','f','y'],['Uta_Obando','f','o'],['Alma_María','f','o'],['Gitta_Nikolina','f','o'],['Kazuhiko_Atallah','m','y'],['Tammie_Ema','f','o'],['Barbora_MacLean','f','y'],['Rosemary_Okafor','f','o'],['Craig_Gutsy','m','o'],['Gilberto_Mathias','m','o'],['Suad_Qasim','f','o'],['Chandra_MacFarland','f','o'],['Marcos_Rudaski','m','o'],['Brenda_Stern','f','y'],['Ana_Florence','f','y'],['Royston_Min','m','y'],['Tammy_Grit','f','y'],['Tanja_Adelina','f','y'],['Kumar_Dahl','m','o'],['Abrahan_Mack','m','o'],['Daisy_Studious','f','o'],['Alison_Dietlinde','f','o'],['Ilkin_Urbano','m','o'],['Dionisio_Schuyler','m','o'],['Narelle_Moon','f','o'],['Asya_Anara','f','o'],['Viktor_Eka','m','o'],['Vjollca_Johnnie','f','o'],['Adde_Michal','m','y'],['Viktor_Menelaos','m','y']]
283
+ new_speakers = {"Male":[],"Female":[]}
284
+ for speaker in speakers:
285
+ # print(speaker)
286
+ if speaker[1] == 'm' and speaker[2] == 'y':
287
+ new_speakers['Male'].append(speaker[0])
288
+ elif speaker[1] == 'f' and speaker[2] == 'y':
289
+ new_speakers['Female'].append(speaker[0])
290
+ for speaker in speakers:
291
+ # print(speaker)
292
+ if speaker[1] == 'm' and speaker[2] == 'o':
293
+ new_speakers['Male'].append(speaker[0])
294
+ elif speaker[1] == 'f' and speaker[2] == 'o':
295
+ new_speakers['Female'].append(speaker[0])
296
+
297
+ # print(new_speakers['Male'])
298
+ # print(new_speakers['Female'])
299
+ # return new_speakers
300
+ speaker_list = []
301
+
302
+ for gender in gender_list:
303
+ speaker_list.append(new_speakers[gender[0]].pop(0))
304
+ speaker_list.append(new_speakers['Male'].pop(0))
305
+
306
+ return speaker_list
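A small worked example of get_speakers above: each gender_list entry is expected to look like check_gender's output (a one-element list such as ['Female']), and for every character an extra male voice is also reserved, so two names come back per character. The genders here are invented.

from utils.utilities import get_speakers

gender_list = [["Female"], ["Male"]]       # hypothetical genders for char 0 and char 1
voices = get_speakers(gender_list)
print(voices)                              # four names drawn from the hard-coded speaker table above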
utils/visual_llm_gender.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+
4
+ def load_llm():
5
+
6
+ # default: Load the model on the available device(s)
7
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
8
+ "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
9
+ )
10
+ min_pixels = 256*28*28
11
+ max_pixels = 1280*28*28
12
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
13
+ return model,processor
14
+
15
+ import cv2
16
+ import base64
17
+ from PIL import Image
18
+ from io import BytesIO
19
+
20
+ def check_gender(model,processor,img_list,test_image):
21
+ content = [
22
+ {"type": "text", "text": "Both of the images belong to same person. Identify the gender and respond only Male or Female"},
23
+ ]
24
+ # Load image and crop face
25
+ img = cv2.imread(test_image)
26
+ for image in img_list[:3]:
27
+ x1, y1, x2, y2 = image['bbox']
28
+ face_crop = img[y1:y2, x1:x2]
29
+ # Encode to JPEG format in memory
30
+ _, buffer = cv2.imencode('.jpg', face_crop)
31
+ # Convert to Base64
32
+ face_base64 = base64.b64encode(buffer).decode('utf-8')
33
+ image_bytes = base64.b64decode(face_base64)
34
+ image_64 = Image.open(BytesIO(image_bytes)).convert("RGB")
35
+ content.append({"type": "image", "image": image_64})
36
+ messages = [
37
+ {
38
+ "role": "user",
39
+ "content": content
40
+ }
41
+ ]
42
+
43
+ # Preparation for inference
44
+ text = processor.apply_chat_template(
45
+ messages, tokenize=False, add_generation_prompt=True
46
+ )
47
+ image_inputs, video_inputs = process_vision_info(messages)
48
+ inputs = processor(
49
+ text=[text],
50
+ images=image_inputs,
51
+ videos=video_inputs,
52
+ padding=True,
53
+ return_tensors="pt",
54
+ )
55
+ inputs = inputs.to("cuda")
56
+
57
+ # Inference
58
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
59
+ generated_ids_trimmed = [
60
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
61
+ ]
62
+ output_text = processor.batch_decode(
63
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
64
+ )
65
+ print(output_text)
66
+ return output_text
67
+
68
+ def visual_llm_gender(unique_chars,predictions,test_image):
69
+ model,processor = load_llm()
70
+ # for panel in predictions.faces:
71
+ # print(panel)
72
+ gender_list = []
73
+ for i in range(unique_chars):
74
+ print(i)
75
+ new_panels = [p for p in predictions.faces if p['char_id'] == i]
76
+ print(new_panels)
77
+ gender = check_gender(model,processor,new_panels,test_image)
78
+ gender_list.append(gender)
79
+ return gender_list
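A minimal sketch of calling visual_llm_gender once the detector has tagged faces with char_id; the boxes and page path are invented, and the Qwen2.5-VL model above expects a CUDA device.

from types import SimpleNamespace
from utils.visual_llm_gender import visual_llm_gender

predictions = SimpleNamespace(faces=[
    {"id": 0, "char_id": 0, "bbox": [100, 80, 160, 150]},   # two crops of character 0
    {"id": 1, "char_id": 0, "bbox": [400, 90, 460, 160]},
    {"id": 2, "char_id": 1, "bbox": [250, 300, 310, 370]},  # one crop of character 1
])

genders = visual_llm_gender(2, predictions, "page.jpg")      # e.g. [['Female'], ['Male']], one entry per character id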
utils/xml_to_json.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Extract speaker info and add to parsed-XML json object
2
+ import xml.etree.ElementTree as ET
3
+ import manga109api
4
+ import os
5
+ import json
6
+ def xml_to_json(config):
7
+ manga109_root_dir = config["manga109_root_dir"]
8
+
9
+ def create_json(book):
10
+ def parse_xml(xml_path):
11
+ tree = ET.parse(xml_path)
12
+ return tree.getroot()
13
+
14
+ root_speaker = parse_xml(manga109_root_dir+"annotations_Manga109Dialog/"+book+".xml")
15
+
16
+ print(len(root_speaker.find('pages')))
17
+ fullPage = []
18
+ for page in root_speaker.find('pages'):
19
+ index = page.attrib.get('index')
20
+ width = page.attrib.get('width')
21
+ height = page.attrib.get('height')
22
+ curpage = {}
23
+ for speaker in page.findall('speaker_to_text'):
24
+ id = speaker.attrib.get('id')
25
+ text_id = speaker.attrib.get('text_id')
26
+ speaker_id = speaker.attrib.get('speaker_id')
27
+ curpage[text_id] = speaker_id
28
+ fullPage.append(curpage)
29
+
30
+ p = manga109api.Parser(root_dir=manga109_root_dir)
31
+
32
+ annotation = p.get_annotation(book=book)
33
+
34
+ for page in annotation['page']:
35
+ index = page['@index']
36
+ for obj in page['text']:
37
+ if obj['@id'] in fullPage[index]:
38
+ obj['@bodyid'] = fullPage[index][obj['@id']]
39
+
40
+ with open(config["root"] + "json/"+book+".json","w",encoding='utf-8') as f:
41
+ json.dump(annotation,f,ensure_ascii=False, indent=2)
42
+
43
+ for book in os.listdir(os.path.join(manga109_root_dir,"images")):
44
+ print(book)
45
+ create_json(book)
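The config dict threaded through these helpers carries at least the three keys read by xml_to_json and json_to_gcn; a minimal sketch with placeholder paths (trailing slashes matter because the code concatenates strings directly):

from utils.xml_to_json import xml_to_json

config = {
    "root": "./data/",                    # working directory; json/, panel_data/ and panels/ live under here
    "manga109_root_dir": "./Manga109/",   # must contain images/ and annotations_Manga109Dialog/
    "root_json": "./data/json/",          # per-book JSON written by xml_to_json, read later by json_to_gcn
}

xml_to_json(config)   # writes <root>json/<Book>.json for every book under <manga109_root_dir>images/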
yolo11m/runs/detect/train/args.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detect
2
+ mode: train
3
+ model: yolo11m.pt
4
+ data: ./data.yaml
5
+ epochs: 100
6
+ time: null
7
+ patience: 100
8
+ batch: 16
9
+ imgsz: 640
10
+ save: true
11
+ save_period: -1
12
+ cache: false
13
+ device: '0'
14
+ workers: 8
15
+ project: null
16
+ name: train
17
+ exist_ok: false
18
+ pretrained: true
19
+ optimizer: auto
20
+ verbose: true
21
+ seed: 0
22
+ deterministic: true
23
+ single_cls: false
24
+ rect: false
25
+ cos_lr: false
26
+ close_mosaic: 10
27
+ resume: false
28
+ amp: true
29
+ fraction: 1.0
30
+ profile: false
31
+ freeze: null
32
+ multi_scale: false
33
+ overlap_mask: true
34
+ mask_ratio: 4
35
+ dropout: 0.0
36
+ val: true
37
+ split: val
38
+ save_json: false
39
+ conf: null
40
+ iou: 0.7
41
+ max_det: 300
42
+ half: false
43
+ dnn: false
44
+ plots: true
45
+ source: null
46
+ vid_stride: 1
47
+ stream_buffer: false
48
+ visualize: false
49
+ augment: false
50
+ agnostic_nms: false
51
+ classes: null
52
+ retina_masks: false
53
+ embed: null
54
+ show: false
55
+ save_frames: false
56
+ save_txt: false
57
+ save_conf: false
58
+ save_crop: false
59
+ show_labels: true
60
+ show_conf: true
61
+ show_boxes: true
62
+ line_width: null
63
+ format: torchscript
64
+ keras: false
65
+ optimize: false
66
+ int8: false
67
+ dynamic: false
68
+ simplify: true
69
+ opset: null
70
+ workspace: null
71
+ nms: false
72
+ lr0: 0.01
73
+ lrf: 0.01
74
+ momentum: 0.937
75
+ weight_decay: 0.0005
76
+ warmup_epochs: 3.0
77
+ warmup_momentum: 0.8
78
+ warmup_bias_lr: 0.1
79
+ box: 7.5
80
+ cls: 0.5
81
+ dfl: 1.5
82
+ pose: 12.0
83
+ kobj: 1.0
84
+ nbs: 64
85
+ hsv_h: 0.015
86
+ hsv_s: 0.7
87
+ hsv_v: 0.4
88
+ degrees: 0.0
89
+ translate: 0.1
90
+ scale: 0.5
91
+ shear: 0.0
92
+ perspective: 0.0
93
+ flipud: 0.0
94
+ fliplr: 0.5
95
+ bgr: 0.0
96
+ mosaic: 1.0
97
+ mixup: 0.0
98
+ cutmix: 0.0
99
+ copy_paste: 0.0
100
+ copy_paste_mode: flip
101
+ auto_augment: randaugment
102
+ erasing: 0.4
103
+ cfg: null
104
+ tracker: botsort.yaml
105
+ save_dir: runs/detect/train
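
This args.yaml is the argument dump Ultralytics writes for a training run. A sketch of the kind of launch that would produce a runs/detect/train directory like this, keeping only the non-default values recorded above (everything else is left to the library's defaults):

# Sketch of the training launch behind the args.yaml above; assumes the
# ultralytics package and a data.yaml describing the manga detection classes.
from ultralytics import YOLO

model = YOLO("yolo11m.pt")      # pretrained YOLO11-medium checkpoint
model.train(
    data="./data.yaml",         # dataset definition referenced in args.yaml
    epochs=100,
    imgsz=640,
    batch=16,
    device=0,                   # matches device: '0'
)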
yolo11m/runs/detect/train2/args.yaml ADDED
@@ -0,0 +1,105 @@
+ task: detect
+ mode: train
+ model: yolo11m.pt
+ data: ./data.yaml
+ epochs: 30
+ time: null
+ patience: 100
+ batch: 16
+ imgsz: 640
+ save: true
+ save_period: -1
+ cache: false
+ device: '0'
+ workers: 8
+ project: null
+ name: train2
+ exist_ok: false
+ pretrained: true
+ optimizer: auto
+ verbose: true
+ seed: 0
+ deterministic: true
+ single_cls: false
+ rect: false
+ cos_lr: false
+ close_mosaic: 10
+ resume: false
+ amp: true
+ fraction: 1.0
+ profile: false
+ freeze: null
+ multi_scale: false
+ overlap_mask: true
+ mask_ratio: 4
+ dropout: 0.0
+ val: true
+ split: val
+ save_json: false
+ conf: null
+ iou: 0.7
+ max_det: 300
+ half: false
+ dnn: false
+ plots: true
+ source: null
+ vid_stride: 1
+ stream_buffer: false
+ visualize: false
+ augment: false
+ agnostic_nms: false
+ classes: null
+ retina_masks: false
+ embed: null
+ show: false
+ save_frames: false
+ save_txt: false
+ save_conf: false
+ save_crop: false
+ show_labels: true
+ show_conf: true
+ show_boxes: true
+ line_width: null
+ format: torchscript
+ keras: false
+ optimize: false
+ int8: false
+ dynamic: false
+ simplify: true
+ opset: null
+ workspace: null
+ nms: false
+ lr0: 0.01
+ lrf: 0.01
+ momentum: 0.937
+ weight_decay: 0.0005
+ warmup_epochs: 3.0
+ warmup_momentum: 0.8
+ warmup_bias_lr: 0.1
+ box: 7.5
+ cls: 0.5
+ dfl: 1.5
+ pose: 12.0
+ kobj: 1.0
+ nbs: 64
+ hsv_h: 0.015
+ hsv_s: 0.7
+ hsv_v: 0.4
+ degrees: 0.0
+ translate: 0.1
+ scale: 0.5
+ shear: 0.0
+ perspective: 0.0
+ flipud: 0.0
+ fliplr: 0.5
+ bgr: 0.0
+ mosaic: 1.0
+ mixup: 0.0
+ cutmix: 0.0
+ copy_paste: 0.0
+ copy_paste_mode: flip
+ auto_augment: randaugment
+ erasing: 0.4
+ cfg: null
+ tracker: botsort.yaml
+ save_dir: runs/detect/train2
yolo11m/runs/detect/train2/results.csv ADDED
@@ -0,0 +1,31 @@
+ epoch,time,train/box_loss,train/cls_loss,train/dfl_loss,metrics/precision(B),metrics/recall(B),metrics/mAP50(B),metrics/mAP50-95(B),val/box_loss,val/cls_loss,val/dfl_loss,lr/pg0,lr/pg1,lr/pg2
+ 1,364.701,0.99528,0.88434,1.10876,0.84374,0.81339,0.86346,0.61416,0.9399,0.67931,1.06036,0.000415851,0.000415851,0.000415851
+ 2,717.67,0.93907,0.68799,1.07341,0.8555,0.81756,0.8734,0.6256,0.91886,0.66057,1.05007,0.000805045,0.000805045,0.000805045
+ 3,1070.28,0.9246,0.66399,1.06682,0.82583,0.78783,0.84902,0.58408,1.00997,0.71844,1.10452,0.00116674,0.00116674,0.00116674
+ 4,1415.94,0.89772,0.63638,1.05457,0.87076,0.83342,0.88502,0.6424,0.88072,0.59905,1.0277,0.00112625,0.00112625,0.00112625
+ 5,1761.7,0.87716,0.61356,1.04405,0.88033,0.83517,0.89177,0.6547,0.87207,0.57414,1.0236,0.001085,0.001085,0.001085
+ 6,2110.53,0.86309,0.59526,1.03721,0.88965,0.8458,0.90099,0.66986,0.83239,0.53844,1.01,0.00104375,0.00104375,0.00104375
+ 7,2458.47,0.84727,0.5812,1.02879,0.88126,0.84073,0.89735,0.65739,0.86915,0.56387,1.01918,0.0010025,0.0010025,0.0010025
+ 8,2806.22,0.84261,0.57434,1.02717,0.89677,0.84979,0.90884,0.67838,0.81902,0.52912,0.99507,0.00096125,0.00096125,0.00096125
+ 9,3154.13,0.83402,0.56153,1.02006,0.89845,0.85878,0.91456,0.68555,0.81409,0.51218,0.98894,0.00092,0.00092,0.00092
+ 10,3505.97,0.8254,0.55079,1.01678,0.89708,0.86145,0.91757,0.68555,0.83014,0.51276,0.9874,0.00087875,0.00087875,0.00087875
+ 11,3850.26,0.81667,0.54212,1.01165,0.89666,0.87261,0.92286,0.69549,0.79841,0.49455,0.97811,0.0008375,0.0008375,0.0008375
+ 12,4198.48,0.8088,0.53515,1.0076,0.90241,0.86783,0.92325,0.69524,0.80291,0.49307,0.98062,0.00079625,0.00079625,0.00079625
+ 13,4544.49,0.80596,0.53055,1.0057,0.89774,0.86735,0.92057,0.69194,0.80916,0.50261,0.97499,0.000755,0.000755,0.000755
+ 14,4890.59,0.79576,0.52245,1.00188,0.90424,0.86295,0.92337,0.69801,0.79489,0.49061,0.97536,0.00071375,0.00071375,0.00071375
+ 15,5238.24,0.79264,0.51495,0.99893,0.90668,0.87894,0.92957,0.70712,0.78733,0.47507,0.96654,0.0006725,0.0006725,0.0006725
+ 16,5583.58,0.78354,0.50664,0.99404,0.90396,0.87658,0.9299,0.70605,0.78008,0.47593,0.96723,0.00063125,0.00063125,0.00063125
+ 17,5929.19,0.78022,0.50345,0.99207,0.90272,0.87945,0.93169,0.71304,0.76638,0.46915,0.96346,0.00059,0.00059,0.00059
+ 18,6280.28,0.77585,0.4996,0.98929,0.9089,0.87976,0.93364,0.71495,0.76637,0.46142,0.95967,0.00054875,0.00054875,0.00054875
+ 19,6631.13,0.77146,0.49338,0.98748,0.90486,0.8868,0.9333,0.71573,0.76155,0.45881,0.96285,0.0005075,0.0005075,0.0005075
+ 20,6977.4,0.76437,0.48726,0.98545,0.91121,0.87741,0.93274,0.71797,0.75895,0.45138,0.95755,0.00046625,0.00046625,0.00046625
+ 21,7309.74,0.76783,0.47192,0.98514,0.90617,0.88781,0.93546,0.72133,0.75577,0.45381,0.95819,0.000425,0.000425,0.000425
+ 22,7644.28,0.75947,0.46115,0.97943,0.90908,0.88789,0.9366,0.72428,0.74821,0.44937,0.95353,0.00038375,0.00038375,0.00038375
+ 23,7976.83,0.75231,0.45512,0.97455,0.90643,0.89093,0.93685,0.72362,0.74938,0.4485,0.95206,0.0003425,0.0003425,0.0003425
+ 24,8308.26,0.74468,0.44702,0.97117,0.90852,0.88729,0.93576,0.72268,0.75086,0.44312,0.95287,0.00030125,0.00030125,0.00030125
+ 25,8638.72,0.73652,0.44072,0.96737,0.91305,0.89204,0.94021,0.73021,0.7399,0.43814,0.94657,0.00026,0.00026,0.00026
+ 26,8970.02,0.73089,0.43389,0.96468,0.91071,0.89248,0.93932,0.73064,0.7382,0.43585,0.94573,0.00021875,0.00021875,0.00021875
+ 27,9302.82,0.72381,0.42706,0.96115,0.91403,0.89302,0.94056,0.73294,0.73565,0.42938,0.94517,0.0001775,0.0001775,0.0001775
+ 28,9635.86,0.71664,0.4207,0.95751,0.91457,0.8922,0.94121,0.73333,0.73483,0.42626,0.94493,0.00013625,0.00013625,0.00013625
+ 29,9971.96,0.71077,0.41469,0.95334,0.91221,0.89692,0.94196,0.7358,0.72995,0.42248,0.94099,9.5e-05,9.5e-05,9.5e-05
+ 30,10303.7,0.70721,0.4106,0.95196,0.91199,0.8969,0.94201,0.7353,0.73065,0.42129,0.94193,5.375e-05,5.375e-05,5.375e-05
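
The log shows steady convergence, with validation mAP50-95 climbing from about 0.61 at epoch 1 to about 0.74 by epoch 30. A small sketch for inspecting the log offline; it assumes pandas is installed, and the column names come straight from the CSV header above:

# Quick look at the training log for run train2.
import pandas as pd

df = pd.read_csv("yolo11m/runs/detect/train2/results.csv")
df.columns = df.columns.str.strip()   # defensive: some Ultralytics versions pad headers with spaces

best = df.loc[df["metrics/mAP50-95(B)"].idxmax()]
print(f"best epoch: {int(best['epoch'])}, "
      f"mAP50: {best['metrics/mAP50(B)']:.3f}, "
      f"mAP50-95: {best['metrics/mAP50-95(B)']:.3f}")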
yolo11m/runs/detect/train2/weights/best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:472dcc9fcf237d5d9ede9afa30eb7da4ffbad514d5a1bca537918bcd2aa7120a
+ size 40502700
yolo11m/runs/detect/train2/weights/last.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:755404e647007f3dfb3dac583bea81b176114b2c6c706ae43a177e968a29e98e
+ size 40502700
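
Both weight files are committed as Git LFS pointers; the actual ~40 MB checkpoints are fetched by LFS on clone. A minimal sketch of loading the best checkpoint for inference on a manga page (the image path is a placeholder, and the 0.25 confidence threshold is an assumption, not a project setting):

# Hypothetical inference with the trained detector.
from ultralytics import YOLO

model = YOLO("yolo11m/runs/detect/train2/weights/best.pt")
results = model.predict("page.jpg", imgsz=640, conf=0.25)

# Print each detected class name with its bounding box coordinates
for box in results[0].boxes:
    cls_name = model.names[int(box.cls)]
    print(cls_name, box.xyxy.tolist())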