Add application file
- .gitignore +4 -0
- app.py +47 -0
- checkpoints/assoc_gcn.pt +3 -0
- config.json +11 -0
- models/model.pth +3 -0
- page_transcription.py +100 -0
- preprocess.py +21 -0
- requirements.txt +140 -0
- train.py +15 -0
- utils/__pycache__/character_to_character.cpython-311.pyc +0 -0
- utils/__pycache__/coco_to_yolo.cpython-311.pyc +0 -0
- utils/__pycache__/create_audiobook.cpython-311.pyc +0 -0
- utils/__pycache__/create_directories.cpython-311.pyc +0 -0
- utils/__pycache__/get_yolo_predictions.cpython-311.pyc +0 -0
- utils/__pycache__/identify_speaker.cpython-311.pyc +0 -0
- utils/__pycache__/japanese_ocr.cpython-311.pyc +0 -0
- utils/__pycache__/json_to_coco.cpython-311.pyc +0 -0
- utils/__pycache__/json_to_gcn.cpython-311.pyc +0 -0
- utils/__pycache__/panel_ordering.cpython-311.pyc +0 -0
- utils/__pycache__/train_speaker.cpython-311.pyc +0 -0
- utils/__pycache__/transcript_page.cpython-311.pyc +0 -0
- utils/__pycache__/utilities.cpython-311.pyc +0 -0
- utils/__pycache__/visual_llm_gender.cpython-311.pyc +0 -0
- utils/__pycache__/xml_to_json.cpython-311.pyc +0 -0
- utils/character_to_character.py +192 -0
- utils/coco_to_yolo.py +52 -0
- utils/create_audiobook.py +55 -0
- utils/create_directories.py +13 -0
- utils/directories.txt +13 -0
- utils/get_yolo_predictions.py +11 -0
- utils/identify_speaker.py +443 -0
- utils/japanese_ocr.py +52 -0
- utils/json_to_coco.py +131 -0
- utils/json_to_gcn.py +484 -0
- utils/panel_ordering.py +258 -0
- utils/train_speaker.py +385 -0
- utils/transcript_page.py +37 -0
- utils/utilities.py +306 -0
- utils/visual_llm_gender.py +79 -0
- utils/xml_to_json.py +45 -0
- yolo11m/runs/detect/train/args.yaml +105 -0
- yolo11m/runs/detect/train2/args.yaml +105 -0
- yolo11m/runs/detect/train2/results.csv +31 -0
- yolo11m/runs/detect/train2/weights/best.pt +3 -0
- yolo11m/runs/detect/train2/weights/last.pt +3 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+**/*.jpg
+**/*.png
+**/*.jpg
+**/*.png

app.py
ADDED
@@ -0,0 +1,47 @@
+import gradio as gr
+import os
+import shutil
+import uuid
+import subprocess
+
+# Function to handle image upload and transcription
+def process_image(image):
+    # Save uploaded image to temp folder
+    temp_folder = "./temp_uploads"
+    os.makedirs(temp_folder, exist_ok=True)
+
+    # Generate unique filename
+    image_filename = f"{uuid.uuid4().hex}.jpg"
+    image_path = os.path.join(temp_folder, image_filename)
+
+    # Save image
+    image.save(image_path)
+
+    # Run the transcription script via subprocess
+    try:
+        subprocess.run(["python", "page_transcription.py", f"-img={image_path}"], check=True)
+    except subprocess.CalledProcessError as e:
+        return f"Error during transcription: {e}", None
+
+    # Check if output.wav was generated
+    output_audio_path = "output.wav"
+    if os.path.exists(output_audio_path):
+        return "Transcription complete. Playing audio...", output_audio_path
+    else:
+        return "Failed to generate audio file.", None
+
+# Gradio interface
+iface = gr.Interface(
+    fn=process_image,
+    inputs=gr.Image(type="pil", label="Upload a Manga Page"),
+    outputs=[
+        gr.Textbox(label="Status"),
+        gr.Audio(label="Generated Audio", type="filepath")
+    ],
+    title="Manga Page Audio Transcription",
+    description="Upload a manga page image, and this tool will transcribe it and play the audio using a backend Python script."
+)
+
+# Launch app
+if __name__ == "__main__":
+    iface.launch()

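One caveat worth noting about process_image above: it always reads back the fixed output.wav path written by the backend script, so concurrent uploads would overwrite each other's audio. A minimal sketch of a per-request variant (hypothetical, not part of this commit):

import shutil
import uuid

def process_image_isolated(image):
    # Same flow as process_image, but copy the generated audio to a
    # unique path so parallel requests do not clobber one another.
    status, audio_path = process_image(image)
    if audio_path is None:
        return status, None
    unique_path = f"./temp_uploads/{uuid.uuid4().hex}.wav"
    shutil.copy(audio_path, unique_path)
    return status, unique_path
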
checkpoints/assoc_gcn.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c09dc23e06af2a1f373cb03748a92f134dcbeab2829543f928a5247e51839fe
+size 810267

config.json
ADDED
@@ -0,0 +1,11 @@
+{
+    "root" : "./dataset/",
+    "dir_path" : "./utils/directories.txt",
+    "manga109_root_dir" : "./dataset/Manga109/",
+    "root_images" : "./dataset/images/",
+    "root_annotations" : "./dataset/annotations/",
+    "root_labels" : "./dataset/labels/",
+    "root_json" : "./dataset/json/",
+    "yolo_path" : "./yolo11m/runs/detect/train2/weights/best.pt",
+    "speaker_association_path" : "./checkpoints/assoc_gcn.pt"
+}

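A small helper (not part of the commit) that loads this config and fails fast when a referenced model artifact is missing can save confusing downstream errors; the keys checked below are exactly those defined above:

import json
import os

def load_config(path="config.json"):
    with open(path) as f:
        cfg = json.load(f)
    # Fail fast if the model artifacts referenced by the config are absent
    for key in ("yolo_path", "speaker_association_path"):
        if not os.path.exists(cfg[key]):
            raise FileNotFoundError(f"{key} points to a missing file: {cfg[key]}")
    return cfg
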
models/model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7b8228bcec85fb40eaf2781a63816ccc1e8d6cc43e90a0cd257406e3abc85fc
+size 102802172

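Both checkpoints/assoc_gcn.pt and models/model.pth above are Git LFS pointer files rather than the weights themselves; per their size fields, the real binaries are roughly 810 KB and 103 MB, and are fetched with git lfs pull after cloning.
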
page_transcription.py
ADDED
@@ -0,0 +1,100 @@
+import argparse
+import json
+import torch
+from ultralytics import YOLO
+from utils.identify_speaker import get_predictions_from_yolo
+from utils.panel_ordering import panel_ordering
+from utils.utilities import show_prediction_per_page
+from utils.utilities import show_panel_order
+from utils.character_to_character import count_characters_single_page
+from utils.identify_speaker import identify_speaker, load_trained_speaker_model
+from utils.utilities import visualize_speaker_results
+from utils.utilities import process_results
+from transformers import pipeline
+from utils.transcript_page import transcript_page
+from utils.visual_llm_gender import visual_llm_gender
+from utils.utilities import get_speakers
+from scipy.io.wavfile import write
+from utils.create_audiobook import create_audiobook
+import sounddevice as sd
+import soundfile as sf
+
+def main():
+    parser = argparse.ArgumentParser(description="Transcribe a single manga page image")
+    parser.add_argument(
+        "-img", "--test_image",
+        type=str,
+        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
+        help="Test image on which transcription needs to be done"
+    )
+    parser.add_argument(
+        "-vis", "--visualize",
+        action="store_true",
+        help="Pass this flag (without a value) to visualize predictions"
+    )
+
+    args = parser.parse_args()
+    print(f"Test image at : {args.test_image}")
+    print(f"Visualize?    : {args.visualize}")
+
+    with open('config.json') as f:
+        config = json.load(f)
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    yolo_model = YOLO(config["yolo_path"])
+    test_image = args.test_image
+
+    # Detect panels, speech bubbles, faces and bodies (available afterwards as
+    # predictions.panels, predictions.speech_bubbles, predictions.faces and
+    # predictions.bodies)
+    predictions = get_predictions_from_yolo(test_image, yolo_model)
+
+    # Estimate the reading order of the panels
+    boxOrderEstimator = panel_ordering(test_image, predictions)
+
+    # Cluster face+body embeddings to count the unique characters on the page
+    unique_chars, labels, pairs = count_characters_single_page(test_image, predictions, eps=0.3, min_samples=1, device='cuda')
+
+    # Associate each speech bubble with its speaker via the trained GCN
+    assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
+    results, predictions = identify_speaker(
+        assoc_model, test_image, yolo_model, 'cuda'
+    )
+    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)
+
+    # OCR + machine translation, then voice assignment and speech synthesis
+    translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
+    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)
+    gender_list = visual_llm_gender(unique_chars, predictions, test_image)
+    speaker_list = get_speakers(gender_list)
+    combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
+    write("output.wav", sample_rate, combined)
+
+    # Play the generated audio
+    data, fs = sf.read('output.wav')
+    sd.play(data, fs)
+    sd.wait()
+
+    if args.visualize:
+        show_prediction_per_page(test_image, processed_predictions)
+        show_panel_order(test_image, boxOrderEstimator)
+        visualize_speaker_results(test_image, processed_predictions, results)
+
+
+if __name__ == "__main__":
+    main()

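For reference, the script above is driven entirely by its two argparse flags; a typical invocation looks like:

python page_transcription.py -img=./dataset/images/test/AkkeraKanjinchou_083.jpg -vis

where -vis is optional and takes no value (it is a store_true flag).
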
preprocess.py
ADDED
@@ -0,0 +1,21 @@
+import json
+from utils.create_directories import create_directories
+from utils.xml_to_json import xml_to_json
+from utils.json_to_coco import json_to_coco
+from utils.coco_to_yolo import coco_to_yolo
+from utils.json_to_gcn import json_to_gcn
+
+def preprocess():
+
+    config = {}
+    with open('config.json') as f:
+        config = json.load(f)
+    create_directories(config)
+
+    xml_to_json(config)
+
+    json_to_coco(config)
+
+    coco_to_yolo(config)
+
+    json_to_gcn(config)

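preprocess.py only defines its entry function and never calls it; a minimal sketch (not part of this commit) of running the full conversion chain (directories, then XML to JSON, COCO, YOLO labels, and GCN data):

from preprocess import preprocess

if __name__ == "__main__":
    preprocess()
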
requirements.txt
ADDED
@@ -0,0 +1,140 @@
+absl-py==2.3.1
+accelerate==1.9.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
+anyascii==0.3.3
+asttokens==3.0.0
+attrs==25.3.0
+audioread==3.0.1
+av==15.0.0
+babel==2.17.0
+certifi==2025.7.14
+cffi==1.17.1
+charset-normalizer==3.4.2
+colorama==0.4.6
+comm==0.2.3
+contourpy==1.3.3
+coqpit-config==0.2.1
+coqui-tts==0.27.0
+coqui-tts-trainer==0.3.1
+cycler==0.12.1
+Cython==3.1.2
+dateparser==1.1.8
+debugpy==1.8.15
+decorator==5.2.1
+decord==0.6.0
+docopt==0.6.2
+einops==0.8.1
+encodec==0.1.1
+executing==2.2.0
+filelock==3.13.1
+fire==0.7.0
+fonttools==4.59.0
+frozenlist==1.7.0
+fsspec==2024.6.1
+fugashi==1.5.1
+grpcio==1.74.0
+gruut==2.4.0
+gruut-ipa==0.13.0
+gruut_lang_de==2.0.1
+gruut_lang_en==2.0.1
+gruut_lang_es==2.0.1
+gruut_lang_fr==2.0.2
+hf-xet==1.1.5
+huggingface-hub==0.34.3
+idna==3.10
+inflect==7.5.0
+ipykernel==6.30.0
+ipython==9.4.0
+ipython_pygments_lexers==1.1.1
+jaconv==0.4.0
+jedi==0.19.2
+Jinja2==3.1.4
+joblib==1.5.1
+jsonlines==1.2.0
+jupyter_client==8.6.3
+jupyter_core==5.8.1
+kiwisolver==1.4.8
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+loguru==0.7.3
+manga-ocr==0.1.14
+manga109api==0.3.1
+Markdown==3.8.2
+MarkupSafe==2.1.5
+matplotlib==3.10.3
+matplotlib-inline==0.1.7
+monotonic-alignment-search==0.2.0
+more-itertools==10.7.0
+mpmath==1.3.0
+msgpack==1.1.1
+multidict==6.6.3
+nest-asyncio==1.6.0
+networkx==3.3
+num2words==0.5.14
+numba==0.61.2
+numpy==2.1.2
+opencv-python==4.12.0.88
+packaging==25.0
+pandas==2.3.1
+parso==0.8.4
+pillow==11.0.0
+platformdirs==4.3.8
+pooch==1.8.2
+prompt_toolkit==3.0.51
+propcache==0.3.2
+protobuf==6.31.1
+psutil==7.0.0
+pure_eval==0.2.3
+py-cpuinfo==9.0.0
+pycparser==2.22
+Pygments==2.19.2
+pyparsing==3.2.3
+pyperclip==1.9.0
+pysbd==0.3.4
+python-crfsuite==0.9.11
+python-dateutil==2.9.0.post0
+pytz==2025.2
+pywin32==311
+PyYAML==6.0.2
+pyzmq==27.0.0
+qwen-vl-utils==0.0.8
+regex==2025.7.34
+requests==2.32.4
+safetensors==0.5.3
+scikit-learn==1.7.1
+scipy==1.16.0
+sentencepiece==0.2.0
+six==1.17.0
+sounddevice==0.5.2
+soundfile==0.13.1
+soxr==0.5.0.post1
+stack-data==0.6.3
+sympy==1.13.3
+tensorboard==2.20.0
+tensorboard-data-server==0.7.2
+termcolor==3.1.0
+threadpoolctl==3.6.0
+tokenizers==0.21.4
+torch==2.7.1+cu118
+torch-geometric==2.6.1
+torchaudio==2.7.1+cu118
+torchvision==0.22.1+cu118
+tornado==6.5.1
+tqdm==4.67.1
+traitlets==5.14.3
+transformers @ git+https://github.com/huggingface/transformers@7dca2ff8cfd6102430acd3af3fdc3831b9779884
+typeguard==4.4.4
+typing_extensions==4.14.1
+tzdata==2025.2
+tzlocal==5.3.1
+ultralytics==8.3.170
+ultralytics-thop==2.0.14
+unidic-lite==1.0.8
+urllib3==2.5.0
+wcwidth==0.2.13
+Werkzeug==3.1.3
+win32_setctime==1.2.0
+yarl==1.20.1

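Note that the +cu118 builds of torch, torchaudio and torchvision pinned above are not published on PyPI; installing this file normally requires pointing pip at the CUDA 11.8 wheel index (e.g. --extra-index-url https://download.pytorch.org/whl/cu118). The pywin32 and win32_setctime pins also indicate this environment was frozen on Windows.
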
train.py
ADDED
@@ -0,0 +1,15 @@
+import torch
+torch.cuda.is_available()  # probe only; the result is unused
+from utils.train_speaker import train_speaker
+from ultralytics import YOLO
+
+def train(config):
+
+    model = YOLO("yolo11m.pt")
+
+    model.info()
+
+    model.train(data='data.yaml', epochs=10, device='cuda', imgsz=640, batch=16)
+
+    train_speaker(config)

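train.py defines train(config) but has no entry point; a minimal driver (not part of this commit) would mirror the config loading used elsewhere in the repo:

import json
from train import train

if __name__ == "__main__":
    with open("config.json") as f:
        config = json.load(f)
    train(config)
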
utils/__pycache__/character_to_character.cpython-311.pyc
ADDED
Binary file (9.75 kB)

utils/__pycache__/coco_to_yolo.cpython-311.pyc
ADDED
Binary file (3.96 kB)

utils/__pycache__/create_audiobook.cpython-311.pyc
ADDED
Binary file (2.58 kB)

utils/__pycache__/create_directories.cpython-311.pyc
ADDED
Binary file (1.12 kB)

utils/__pycache__/get_yolo_predictions.cpython-311.pyc
ADDED
Binary file (1.03 kB)

utils/__pycache__/identify_speaker.cpython-311.pyc
ADDED
Binary file (16.4 kB)

utils/__pycache__/japanese_ocr.cpython-311.pyc
ADDED
Binary file (4.18 kB)

utils/__pycache__/json_to_coco.cpython-311.pyc
ADDED
Binary file (8.37 kB)

utils/__pycache__/json_to_gcn.cpython-311.pyc
ADDED
Binary file (23.5 kB)

utils/__pycache__/panel_ordering.cpython-311.pyc
ADDED
Binary file (14.2 kB)

utils/__pycache__/train_speaker.cpython-311.pyc
ADDED
Binary file (21.7 kB)

utils/__pycache__/transcript_page.cpython-311.pyc
ADDED
Binary file (2.67 kB)

utils/__pycache__/utilities.cpython-311.pyc
ADDED
Binary file (18.3 kB)

utils/__pycache__/visual_llm_gender.cpython-311.pyc
ADDED
Binary file (4.28 kB)

utils/__pycache__/xml_to_json.cpython-311.pyc
ADDED
Binary file (3.35 kB)

utils/character_to_character.py
ADDED
@@ -0,0 +1,192 @@
+def compute_iou(boxA, boxB):
+    """
+    Compute the overlap ratio between two boxes (x1, y1, x2, y2).
+    Note: with the union term commented out below, this returns
+    intersection area / area of boxA (the fraction of boxA covered
+    by boxB), not true Intersection over Union.
+    """
+    xA = max(boxA[0], boxB[0])
+    yA = max(boxA[1], boxB[1])
+    xB = min(boxA[2], boxB[2])
+    yB = min(boxA[3], boxB[3])
+
+    interW = max(0, xB - xA)
+    interH = max(0, yB - yA)
+    interArea = interW * interH
+
+    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
+    # boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
+    # unionArea = boxAArea + boxBArea - interArea
+
+    return interArea / boxAArea if boxAArea > 0 else 0.0
+
+
+def match_faces_to_bodies(face_boxes, body_boxes, iou_threshold=0.9):
+    """
+    Match each body box to the face box with the highest overlap above
+    the threshold. Returns a list of (face_box or None, body_box);
+    if no face meets the threshold for a body, face_box is None.
+    """
+    matches = []
+    used_faces = set()
+
+    for b in body_boxes:
+        best_iou = 0.0
+        best_f = None
+        for i, f in enumerate(face_boxes):
+            if i in used_faces:
+                continue
+            iou = compute_iou(f, b)
+            if iou > best_iou:
+                best_iou = iou
+                best_f = i
+        if best_iou >= iou_threshold:
+            matches.append((face_boxes[best_f], b))
+            used_faces.add(best_f)
+        else:
+            # No face sufficiently inside this body → treat body alone
+            matches.append((None, b))
+
+    return matches
+
+
+import cv2
+import numpy as np
+
+def crop_and_prepare_embeddings(image_path, face_boxes, body_boxes, model, fuse_features):
+    """
+    For each matched (face, body) pair, crop both regions and fuse their
+    embeddings. Only pairs where both a face and a body are present are
+    embedded and kept (the single-crop fallbacks are commented out below).
+    Returns (embeddings, kept_pairs).
+    """
+    img = cv2.imread(image_path)
+    pairs = match_faces_to_bodies(face_boxes, body_boxes)
+    embeddings = []
+    new_pairs = []
+    for face_box, body_box in pairs:
+        if face_box is not None:
+            x1, y1, x2, y2 = face_box
+            face_crop = img[y1:y2, x1:x2]
+            face_emb = model(face_crop)
+        else:
+            face_emb = None
+
+        if body_box is not None:
+            x1, y1, x2, y2 = body_box
+            body_crop = img[y1:y2, x1:x2]
+            body_emb = model(body_crop)
+        else:
+            body_emb = None
+
+        if face_emb is not None and body_emb is not None:
+            emb = fuse_features(face_emb, body_emb)
+            # elif face_emb is not None: emb = face_emb
+            # else: emb = body_emb
+            # Append inside the guard so pairs without a face are skipped
+            embeddings.append(emb)
+            new_pairs.append((face_box, body_box))
+
+    return embeddings, new_pairs
+
+
+"""
+Prerequisites
+-------------
+pip install torch torchvision scikit-learn opencv-python pillow tqdm
+Download a backbone checkpoint for the embeddings; here we assume
+./models/model.pth is a ResNet-50 fine-tuned on manga faces/bodies.
+"""
+import torch
+import torch.nn as nn
+import torchvision.transforms as T
+from sklearn.cluster import DBSCAN
+from tqdm import tqdm
+from pathlib import Path
+
+# -------------------------------------------------------------------
+# 1. Detection post-processing: crop_faces() and crop_bodies()
+# -------------------------------------------------------------------
+def crop_bodies(image_path, boxes):
+    """Crop body regions given YOLO boxes (x1,y1,x2,y2)."""
+    img = cv2.imread(image_path)
+    crops = []
+    for (x1, y1, x2, y2) in boxes:
+        body = img[y1:y2, x1:x2]
+        crops.append(body)
+    return crops
+
+def crop_faces(image_path, boxes):
+    image = cv2.imread(image_path)
+    faces = []
+    for (x1, y1, x2, y2) in boxes:
+        face = image[y1:y2, x1:x2]  # crop the face
+        faces.append(face)
+    return faces
+
+# -------------------------------------------------------------------
+# 2. Embedding network wrapper
+# -------------------------------------------------------------------
+class ReIDNet(nn.Module):
+    """Backbone that outputs an L2-normalised embedding of size `dim`."""
+    def __init__(self, ckpt="./models/model.pth", dim=1031):
+        super().__init__()
+        base = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', weights=None)
+        base.fc = nn.Linear(base.fc.in_features, dim)
+        self.backbone = base
+
+        def add_prefix(state_dict, prefix="backbone."):
+            return {f"{prefix}{k}" if not k.startswith(prefix) else k: v
+                    for k, v in state_dict.items()}
+
+        raw = torch.load(ckpt, map_location="cpu")
+        patched = add_prefix(raw)
+        self.load_state_dict(patched)
+        self.backbone.eval()
+
+        # Normalisation identical to training
+        self.preprocess = T.Compose([
+            T.ToPILImage(),
+            T.Resize((112, 112)),
+            T.ToTensor(),
+            T.Normalize(mean=[0.5]*3, std=[0.5]*3)
+        ])
+
+    @torch.inference_mode()
+    def forward(self, np_image):
+        tensor = self.preprocess(np_image).unsqueeze(0).to(next(self.backbone.parameters()).device)
+        feat = self.backbone(tensor)
+        return nn.functional.normalize(feat, p=2, dim=1).cpu().numpy().ravel()
+
+# -------------------------------------------------------------------
+# 3. Utility to fuse face & body features
+# -------------------------------------------------------------------
+def fuse_features(face_f, body_f, w_face=0.6):
+    """Simple weighted late fusion (concatenation of weighted vectors)."""
+    return np.concatenate([w_face * face_f, (1 - w_face) * body_f])
+
+# -------------------------------------------------------------------
+# 4. Main pipeline for a single page
+# -------------------------------------------------------------------
+def count_characters_single_page(img_path,
+                                 predictions,
+                                 eps=0.3,
+                                 min_samples=3,
+                                 device='cpu'):
+    model = ReIDNet().to(device)
+    face_boxes = [face['bbox'] for face in predictions.faces]
+    body_boxes = [body['bbox'] for body in predictions.bodies]
+    embeddings, pairs = crop_and_prepare_embeddings(img_path, face_boxes, body_boxes, model, fuse_features)
+    fused_embeds = np.vstack(embeddings)  # stack per-character embeddings into an N×D matrix
+
+    # ----------------------------------------------------------------
+    # 5. DBSCAN clustering
+    # ----------------------------------------------------------------
+    db = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
+    labels = db.fit_predict(fused_embeds)  # -1 => noise
+
+    # Count unique (non-noise) cluster IDs
+    unique_chars = len(set(l for l in labels if l != -1))
+    return unique_chars, labels, pairs

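A quick numeric check of compute_iou above (recall that, with the union term commented out, it returns the fraction of boxA covered by boxB rather than true IoU):

from utils.character_to_character import compute_iou

print(compute_iou((0, 0, 10, 10), (0, 0, 20, 20)))    # 1.0  (boxA fully inside boxB)
print(compute_iou((0, 0, 10, 10), (5, 5, 20, 20)))    # 0.25 (5x5 overlap / area 100)
print(compute_iou((0, 0, 10, 10), (50, 50, 60, 60)))  # 0.0  (disjoint boxes)
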
utils/coco_to_yolo.py
ADDED
@@ -0,0 +1,52 @@
+def coco_to_yolo(config):
+    import os
+    import json
+    from pathlib import Path
+    from tqdm import tqdm
+
+    labels_dir = config["root_labels"]
+    annotations_dir = config["root_annotations"]
+
+    def convert_coco_json_to_yolo(coco_json_path, output_dir):
+        with open(coco_json_path) as f:
+            data = json.load(f)
+
+        images = {img['id']: img for img in data['images']}
+        categories = {cat['id']: cat['name'] for cat in data['categories']}
+        category_id_map = {cat_id: i for i, cat_id in enumerate(sorted(categories.keys()))}
+
+        os.makedirs(output_dir, exist_ok=True)
+        print(len(images))
+        for ann in tqdm(data['annotations']):
+            img = images[ann['image_id']]
+            image_name = Path(img['file_name']).stem
+            img_width = img['width']
+            img_height = img['height']
+            bbox = ann['bbox']  # [x_min, y_min, width, height]
+            x_center = (bbox[0] + bbox[2] / 2) / img_width
+            y_center = (bbox[1] + bbox[3] / 2) / img_height
+            w = bbox[2] / img_width
+            h = bbox[3] / img_height
+
+            class_id = category_id_map[ann['category_id']]
+
+            label_path = os.path.join(output_dir, f"{image_name}.txt")
+            with open(label_path, "a") as f:
+                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n")
+
+        print(f"Converted labels saved to: {output_dir}")
+
+    convert_coco_json_to_yolo(
+        coco_json_path=annotations_dir + "instances_train.json",
+        output_dir=labels_dir + "train/"
+    )
+    convert_coco_json_to_yolo(
+        coco_json_path=annotations_dir + "instances_val.json",
+        output_dir=labels_dir + "val/"
+    )
+    convert_coco_json_to_yolo(
+        coco_json_path=annotations_dir + "instances_test.json",
+        output_dir=labels_dir + "test/"
+    )

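A worked example of the center/size normalization in convert_coco_json_to_yolo above: a COCO box [x_min, y_min, w, h] = [100, 50, 200, 100] inside a 1000x500 image maps to normalized YOLO coordinates as follows:

bbox, img_w, img_h = [100, 50, 200, 100], 1000, 500
x_center = (bbox[0] + bbox[2] / 2) / img_w   # (100 + 100) / 1000 = 0.2
y_center = (bbox[1] + bbox[3] / 2) / img_h   # (50 + 50) / 500    = 0.2
w, h = bbox[2] / img_w, bbox[3] / img_h      # 0.2, 0.2
print(x_center, y_center, w, h)              # 0.2 0.2 0.2 0.2
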
utils/create_audiobook.py
ADDED
@@ -0,0 +1,55 @@
+import torch
+from TTS.api import TTS
+import os
+import numpy as np
+from scipy.io import wavfile
+
+# Save the original torch.load function
+_original_torch_load = torch.load
+
+# Define a custom function that forces weights_only=False
+def custom_torch_load(*args, **kwargs):
+    if "weights_only" not in kwargs:
+        kwargs["weights_only"] = False
+    return _original_torch_load(*args, **kwargs)
+
+# Override torch.load globally
+torch.load = custom_torch_load
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+
+
+def create_audiobook(gender_list, english_transcript, speaker_list):
+    parent_wav = []
+    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+    for text in english_transcript:
+        print(text['char_id'], text['text'])
+        # Gender is currently only logged; the speaker choice carries the voice
+        voice_gender = 'Male'
+        voice_speaker = ''
+        if text['char_id'] is not None:
+            voice_gender = gender_list[int(text['char_id'])][0]
+            print("voice person", speaker_list[text['char_id']])
+            voice_speaker = speaker_list[text['char_id']]
+        else:
+            # Fall back to the last (default) speaker for unattributed lines
+            print("voice person", speaker_list[-1])
+            voice_speaker = speaker_list[-1]
+        print("Voice Gender", voice_gender)
+
+        try:
+            wav = tts.tts(
+                text=text['text'],
+                speaker=voice_speaker.replace('_', ' '),
+                language="en"
+            )
+            print("✓ Audio Generated")
+            parent_wav.append(wav)
+        except Exception as e:
+            print(f"✗ Error with : {e}")
+    combined = np.concatenate(parent_wav)
+    return combined, tts.synthesizer.output_sample_rate

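The torch.load override at the top of this file is a workaround for PyTorch 2.6+, where torch.load defaults to weights_only=True and refuses to unpickle the config objects embedded in XTTS checkpoints; forcing weights_only=False restores the old loading behavior, which is reasonable only because the checkpoint comes from a trusted model hub.
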
utils/create_directories.py
ADDED
@@ -0,0 +1,13 @@
+import os
+def create_directories(config):
+    dir_path = config["dir_path"]
+    dir_names = ""
+    if os.path.exists(dir_path):
+        with open(dir_path, "r") as f:
+            dir_names = f.readlines()
+        for dir_name in dir_names:
+            dir_name = dir_name.strip()
+            if not os.path.exists(dir_name):
+                os.mkdir(dir_name)
+    else:
+        print(f"Ensure {dir_path} exists in utils")

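Because os.mkdir cannot create intermediate directories, the helper above silently depends on utils/directories.txt listing parents before children (which it does, see below). An order-independent variant would replace the mkdir call with:

os.makedirs(dir_name, exist_ok=True)  # creates missing parents as needed
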
utils/directories.txt
ADDED
@@ -0,0 +1,13 @@
+./dataset
+./dataset/images
+./dataset/labels
+./dataset/json
+./dataset/panels
+./dataset/panel_data
+./dataset/annotations
+./dataset/images/train
+./dataset/images/val
+./dataset/images/test
+./dataset/labels/train
+./dataset/labels/val
+./dataset/labels/test

utils/get_yolo_predictions.py
ADDED
@@ -0,0 +1,11 @@
+
+CLASSES = {0: "body", 1: "face", 2: "frame", 3: "text"}  # class-id → name mapping of the fine-tuned detector
+
+def get_yolo_predictions(yolo_model, img_path: str):
+    results = yolo_model.predict(source=img_path, device='cuda')
+    dets = {"body": [], "face": [], "frame": [], "text": []}
+    for box in results[0].boxes:
+        c = int(box.cls[0])
+        x1, y1, x2, y2 = map(int, box.xyxy[0])
+        dets[CLASSES[c]].append([x1, y1, x2, y2])
+    return dets

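A short usage sketch for the helper above (the weights path is the one from config.json; a CUDA device is required since predict hard-codes device='cuda'):

from ultralytics import YOLO
from utils.get_yolo_predictions import get_yolo_predictions

model = YOLO("./yolo11m/runs/detect/train2/weights/best.pt")
dets = get_yolo_predictions(model, "./dataset/images/test/AkkeraKanjinchou_083.jpg")
print({k: len(v) for k, v in dets.items()})  # detection counts per class
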
utils/identify_speaker.py
ADDED
@@ -0,0 +1,443 @@
+#!/usr/bin/env python3
+"""
+End-to-end speech-to-speaker association inference script.
+The GCN model and graph utilities are imported from utils.train_speaker.
+"""
+
+import json
+import torch
+import cv2
+from torch_geometric.data import HeteroData, Batch
+from typing import Dict, List, Any, Optional, Tuple, Union
+import os
+from ultralytics import YOLO
+from utils.train_speaker import hungarian_matching, AssocGCN, infer_associations, DatasetLoader
+
+
+# ============================================================================
+# CLASSES FOR UNSEEN IMAGE PROCESSING
+# ============================================================================
+
+class DetectionPredictions:
+    """Container for object detection predictions from an unseen image"""
+
+    def __init__(self, image_path: str):
+        self.image_path = image_path
+        self.panels: List[Dict] = []
+        self.speech_bubbles: List[Dict] = []
+        self.faces: List[Dict] = []
+        self.bodies: List[Dict] = []
+        self.image_size: Tuple[int, int] = (0, 0)  # (width, height)
+
+    def add_panel(self, bbox: List[float], confidence: float, panel_id: int):
+        """Add panel detection (frame in YOLO terms)"""
+        self.panels.append({
+            'bbox': bbox, 'confidence': confidence, 'id': panel_id
+        })
+
+    def add_speech_bubble(self, bbox: List[float], confidence: float,
+                          panel_id: int, bubble_id: int):
+        """Add speech bubble detection (text in YOLO terms)"""
+        self.speech_bubbles.append({
+            'bbox': bbox, 'confidence': confidence,
+            'panel_id': panel_id, 'id': bubble_id
+        })
+
+    def add_face(self, bbox: List[float], confidence: float,
+                 panel_id: int, face_id: int):
+        """Add face detection"""
+        self.faces.append({
+            'bbox': bbox, 'confidence': confidence,
+            'panel_id': panel_id, 'id': face_id
+        })
+
+    def add_body(self, bbox: List[float], confidence: float,
+                 panel_id: int, body_id: int):
+        """Add body detection"""
+        self.bodies.append({
+            'bbox': bbox, 'confidence': confidence,
+            'panel_id': panel_id, 'id': body_id
+        })
+
+
+# ============================================================================
+# DATA PROCESSING FUNCTIONS
+# ============================================================================
+
+def create_panel_dict_from_predictions(predictions: DetectionPredictions,
+                                       panel_id: int) -> Dict:
+    """
+    Creates a panel dictionary from detection predictions, in the same
+    format that create_hetero_data_from_panel expects.
+    """
+    # Filter predictions for this panel
+    panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == panel_id]
+    panel_faces = [f for f in predictions.faces if f['panel_id'] == panel_id]
+
+    # Create bubbles list in expected format
+    bubbles = []
+    for i, bubble in enumerate(panel_bubbles):
+        bubbles.append({
+            'bubble_id': bubble['id'],
+            'bbox': bubble['bbox']
+        })
+
+    # Create faces list in expected format
+    faces = []
+    for i, face in enumerate(panel_faces):
+        faces.append({
+            'face_id': face['id'],
+            'bbox': face['bbox']
+        })
+
+    # Create panel dict in expected format
+    panel_dict = {
+        'panel_id': f"unseen_panel_{panel_id}",
+        'width': predictions.image_size[0],
+        'height': predictions.image_size[1],
+        'bubbles': bubbles,
+        'faces': faces,
+        'links': []  # No ground-truth links for unseen images
+    }
+
+    return panel_dict
+
+
+def create_hetero_data_from_predictions(predictions: DetectionPredictions,
+                                        panel_id: int) -> Optional[HeteroData]:
+    """
+    Creates HeteroData from detection predictions, reusing
+    DatasetLoader.create_hetero_data_from_panel.
+    """
+    # Convert predictions to panel dict format
+    panel_dict = create_panel_dict_from_predictions(predictions, panel_id)
+
+    # Use the existing loader to build the heterogeneous graph
+    hetero_data = DatasetLoader.create_hetero_data_from_panel(panel_dict)
+
+    if hetero_data is not None:
+        # Remove ground-truth labels since we don't have them for unseen images
+        if hasattr(hetero_data['bubble', 'to', 'face'], 'edge_label'):
+            delattr(hetero_data['bubble', 'to', 'face'], 'edge_label')
+
+    return hetero_data
+
+
+# ============================================================================
+# YOLO DETECTION INTEGRATION
+# ============================================================================
+
+def get_predictions_from_yolo(img_path: str, yolo_model) -> DetectionPredictions:
+    """
+    YOLO detection pipeline mapped onto the detection categories above.
+    YOLO classes: {0:"body", 1:"face", 2:"frame", 3:"text"}
+    """
+    CLASSES = {0: "body", 1: "face", 2: "frame", 3: "text"}
+
+    # Create predictions container
+    predictions = DetectionPredictions(img_path)
+
+    # Load image to get dimensions
+    img = cv2.imread(img_path)
+    if img is None:
+        raise ValueError(f"Could not load image: {img_path}")
+    height, width = img.shape[:2]
+    predictions.image_size = (width, height)
+
+    # Get YOLO predictions
+    results = yolo_model.predict(source=img_path, device='cuda', verbose=False)
+
+    # Process detections and organize by type
+    detections_by_type = {"body": [], "face": [], "frame": [], "text": []}
+
+    for box in results[0].boxes:
+        c = int(box.cls[0])
+        x1, y1, x2, y2 = map(int, box.xyxy[0])
+        confidence = float(box.conf[0])
+
+        detection_type = CLASSES[c]
+        detections_by_type[detection_type].append({
+            'bbox': [x1, y1, x2, y2],
+            'confidence': confidence
+        })
+
+    # Process frame detections as panels
+    for i, frame in enumerate(detections_by_type["frame"]):
+        predictions.add_panel(frame['bbox'], frame['confidence'], i)
+
+    # If no frames detected, create a default full-image panel
+    if len(predictions.panels) == 0:
+        predictions.add_panel([0, 0, width, height], 1.0, 0)
+
+    # Process text detections as speech bubbles
+    for i, text in enumerate(detections_by_type["text"]):
+        panel_id = find_containing_panel(text['bbox'], predictions.panels)
+        predictions.add_speech_bubble(text['bbox'], text['confidence'], panel_id, i)
+
+    from collections import defaultdict
+
+    # Group bubbles by panel_id
+    grouped_texts = defaultdict(list)
+    for bubble in predictions.speech_bubbles:
+        grouped_texts[bubble['panel_id']].append(bubble)
+
+    # Assign seq based on right-to-left reading order (sort by bbox[0] descending)
+    for panel_id, bubbles in grouped_texts.items():
+        sorted_bubbles = sorted(bubbles, key=lambda b: b['bbox'][0], reverse=True)
+        for seq, bubble in enumerate(sorted_bubbles):
+            bubble['seq'] = seq
+
+    # Process face detections
+    for i, face in enumerate(detections_by_type["face"]):
+        panel_id = find_containing_panel(face['bbox'], predictions.panels)
+        predictions.add_face(face['bbox'], face['confidence'], panel_id, i)
+
+    # Process body detections (optional - used for additional context)
+    for i, body in enumerate(detections_by_type["body"]):
+        panel_id = find_containing_panel(body['bbox'], predictions.panels)
+        predictions.add_body(body['bbox'], body['confidence'], panel_id, i)
+
+    return predictions
+
+
+def find_containing_panel(bbox: List[float], panels: List[Dict]) -> int:
+    """
+    Determine which panel contains a detection, by its bbox center.
+    """
+    bbox_center_x = (bbox[0] + bbox[2]) / 2
+    bbox_center_y = (bbox[1] + bbox[3]) / 2
+
+    for panel in panels:
+        p_bbox = panel['bbox']
+        if (p_bbox[0] <= bbox_center_x <= p_bbox[2] and
+                p_bbox[1] <= bbox_center_y <= p_bbox[3]):
+            return panel['id']
+
+    # Return first panel if not contained in any
+    return panels[0]['id'] if panels else 0
+
+
+# ============================================================================
+# MODEL LOADING WITH ERROR HANDLING
+# ============================================================================
+
+def load_trained_speaker_model(model_path: str, device: str = "cuda") -> AssocGCN:
+    """
+    Load a trained AssocGCN checkpoint, handling several checkpoint
+    formats (including the 'model_state' key used by this repo's trainer).
+    """
+    # Create model instance
+    model = AssocGCN().to(device)
+
+    # Load the checkpoint
+    checkpoint = torch.load(model_path, map_location=device)
+
+    # Handle different checkpoint formats
+    if isinstance(checkpoint, dict):
+        # Check the possible keys under which the model state may be stored
+        if 'model_state_dict' in checkpoint:
+            state_dict = checkpoint['model_state_dict']
+        elif 'model_state' in checkpoint:  # format written by this repo's trainer
+            state_dict = checkpoint['model_state']
+        elif 'state_dict' in checkpoint:
+            state_dict = checkpoint['state_dict']
+        else:
+            # If no specific key, assume the entire checkpoint is the state dict,
+            # but filter out non-model keys
+            state_dict = {k: v for k, v in checkpoint.items()
+                          if k not in {'epoch', 'loss', 'optimizer_state_dict'}}
+    else:
+        # Assume the checkpoint is directly the state dict
+        state_dict = checkpoint
+
+    # Handle DataParallel 'module.' prefix if present
+    if any(key.startswith('module.') for key in state_dict.keys()):
+        new_state_dict = {}
+        for key, value in state_dict.items():
+            new_key = key.replace('module.', '') if key.startswith('module.') else key
+            new_state_dict[new_key] = value
+        state_dict = new_state_dict
+
+    try:
+        # Try to load with strict=True first
+        model.load_state_dict(state_dict, strict=True)
+        print("✅ Model loaded successfully with strict=True")
+    except RuntimeError as e:
+        print(f"⚠️ Warning: {str(e)}")
+        print("Attempting to load with strict=False...")
+        try:
+            # Fall back to strict=False, ignoring missing/unexpected keys
+            missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+            if missing_keys:
+                print(f"⚠️ Missing keys: {missing_keys}")
+            if unexpected_keys:
+                print(f"⚠️ Unexpected keys: {unexpected_keys}")
+            print("✅ Model loaded successfully with strict=False")
+        except Exception as e2:
+            print(f"❌ Failed to load model: {str(e2)}")
+            raise e2
+
+    model.eval()
+    return model
+
+
+# ============================================================================
+# MAIN INFERENCE PIPELINE
+# ============================================================================
+
+@torch.no_grad()
+def identify_speaker(model: AssocGCN, image_path: str,
+                     yolo_model, device: str = "cuda") -> Tuple[Dict[str, Dict[int, int]], DetectionPredictions]:
+    """
+    End-to-end inference pipeline for unseen images.
+    Returns (results, predictions), where results maps each panel_id to a
+    {bubble_index: face_index} association dict.
+    """
+    try:
+        # Step 1: Run the YOLO detection pipeline
+        print(f"🔍 Running YOLO detection on {os.path.basename(image_path)}...")
+        predictions = get_predictions_from_yolo(image_path, yolo_model)
+
+        # Check that we have the minimum required detections
+        if len(predictions.speech_bubbles) == 0:
+            print("⚠️ No speech bubbles/text detected. Cannot perform association.")
+            return {}, predictions
+
+        if len(predictions.faces) == 0:
+            print("⚠️ No faces detected. Cannot perform association.")
+            return {}, predictions
+
+        # Step 2: Create HeteroData for each panel
+        panels_data = []
+        panel_ids = list(set([p['id'] for p in predictions.panels]))
+
+        for panel_id in panel_ids:
+            panel_data = create_hetero_data_from_predictions(predictions, panel_id)
+            if panel_data is not None:
+                panel_data = panel_data.to(device)
+                panels_data.append(panel_data)
+
+        # Step 3: Run model inference per panel
+        results = {}
+
+        for panel_data in panels_data:
+            if ('bubble' in panel_data.node_types and 'face' in panel_data.node_types and
+                    panel_data['bubble'].x.size(0) > 0 and panel_data['face'].x.size(0) > 0):
+                # Run inference using the training module's helper
+                mapping = infer_associations(model, panel_data)
+                results[panel_data.panel_id] = mapping
+            else:
+                print(f"⚠️ {panel_data.panel_id}: No valid bubbles or faces, skipping...")
+
+        # Summarize
+        total_associations = sum(len(mapping) for mapping in results.values())
+        if not results:
+            print("No associations found in the image.")
+        print(f"\n✅ Total associations found: {total_associations}")
+
+        return results, predictions
+
+    except Exception as e:
+        print(f"❌ Error during inference: {str(e)}")
+        raise e
+
+
+# ----------------------------------------------------------------------------
+# Legacy standalone inference script, kept (commented out) for reference:
+#
+# #!/usr/bin/env python3
+# """
+# inference.py ── Run saved AssocGCN on new images / panels.
+# Generate per-panel mappings and (optionally) aggregate metrics.
+# """
+# import argparse
+# import torch
+# from pathlib import Path
+# from torch_geometric.data import Batch
+# from typing import Dict, List
+#
+# from utils import load_model
+# from train_speaker import DatasetLoader, infer_associations  # your original file
+#
+# @torch.no_grad()
+# def evaluate_panels(model,
+#                     panels: List["HeteroData"],
+#                     compute_metrics: bool = True) -> None:
+#     device = next(model.parameters()).device
+#     tp = fp = fn = 0
+#
+#     for idx, data in enumerate(panels):
+#         mapping = infer_associations(model, data)
+#         print(f"\n🖼️ Panel {data.panel_id}:")
+#         for bub, face in mapping.items():
+#             print(f"  Bubble {bub} → Face {face}")
+#
+#         if compute_metrics and "edge_label" in data["bubble", "to", "face"]:
+#             gt = {(i.item(), j.item())
+#                   for i, j, lbl in zip(*data["bubble", "to", "face"].edge_index,
+#                                        data["bubble", "to", "face"].edge_label)
+#                   if lbl == 1}
+#             pred = {(b, f) for b, f in mapping.items()}
+#
+#             tp += len(gt & pred)
+#             fp += len(pred - gt)
+#             fn += len(gt - pred)
+#
+#     if compute_metrics:
+#         prec = tp / (tp + fp) if (tp + fp) else 0
+#         rec = tp / (tp + fn) if (tp + fn) else 0
+#         f1 = 2*prec*rec/(prec+rec) if (prec+rec) else 0
+#         print("\n📊 Aggregated metrics")
+#         print(f"  Precision: {prec:.3f}")
+#         print(f"  Recall   : {rec:.3f}")
+#         print(f"  F1 Score : {f1:.3f}")

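A minimal sketch of consuming identify_speaker's return value (assoc_model and yolo_model as loaded in page_transcription.py; the image path here is hypothetical). results maps each panel_id to a {bubble_index: face_index} dict, as built in Step 3 above:

results, predictions = identify_speaker(assoc_model, "page.jpg", yolo_model, "cuda")
for panel_id, mapping in results.items():
    for bubble_idx, face_idx in mapping.items():
        print(f"{panel_id}: bubble {bubble_idx} -> face {face_idx}")
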
utils/japanese_ocr.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from pathlib import Path

import jaconv
import torch
from PIL import Image
from loguru import logger
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel, GenerationMixin


class MangaOcrModel(VisionEncoderDecoderModel, GenerationMixin):
    pass


class MangaOcr:
    def __init__(self, pretrained_model_name_or_path="kha-white/manga-ocr-base", force_cpu=False):
        logger.info(f"Loading OCR model from {pretrained_model_name_or_path}")
        self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = MangaOcrModel.from_pretrained(pretrained_model_name_or_path)

        if not force_cpu and torch.cuda.is_available():
            logger.info("Using CUDA")
            self.model.cuda()
        elif not force_cpu and torch.backends.mps.is_available():
            logger.info("Using MPS")
            self.model.to("mps")
        else:
            logger.info("Using CPU")
        logger.info("OCR ready")

    def __call__(self, img_or_path):
        # Accept either a file path or an already-loaded PIL image.
        if isinstance(img_or_path, (str, Path)):
            img_or_path = Image.open(img_or_path)
        # Grayscale then back to RGB, matching the model's expected input.
        img = img_or_path.convert("L").convert("RGB")

        x = self._preprocess(img)
        x = self.model.generate(x[None].to(self.model.device), max_length=300)[0].cpu()
        x = self.tokenizer.decode(x, skip_special_tokens=True)
        x = post_process(x)
        return x

    def _preprocess(self, img):
        pixel_values = self.processor(img, return_tensors="pt").pixel_values
        return pixel_values.squeeze()


def post_process(text):
    text = "".join(text.split())
    text = text.replace("…", "...")
    # Collapse runs of dots/middle dots into the same number of ASCII dots.
    text = re.sub("[・.]{2,}", lambda x: (x.end() - x.start()) * ".", text)
    # Convert half-width ASCII and digits to full-width.
    text = jaconv.h2z(text, ascii=True, digit=True)

    return text
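A minimal usage sketch for the class above (assumes the repository root is on PYTHONPATH and `bubble.jpg` is an illustrative speech-bubble crop; the `kha-white/manga-ocr-base` weights are downloaded from the Hugging Face Hub on first use):

from utils.japanese_ocr import MangaOcr

ocr = MangaOcr()            # loads processor, tokenizer, and model once
text = ocr("bubble.jpg")    # returns post-processed Japanese text
print(text)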
utils/json_to_coco.py
ADDED
	@@ -0,0 +1,131 @@
import copy
import json
import math
import os
import shutil

def json_to_coco(config):
    base_dir = config["root"]
    image_dir = config["root_images"]
    annotations_dir = config["root_annotations"]

    def create_coco(json_dir):
        # COCO base structure
        coco = {
            "images": [],
            "annotations": [],
            "categories": []
        }
        uniq_filename_mapping = {}
        # Category mapping
        category_names = ["body", "face", "frame", "text"]
        category_map = {name: idx + 1 for idx, name in enumerate(category_names)}
        coco["categories"] = [{"id": cid, "name": name, "supercategory": "object"} for name, cid in category_map.items()]
        coco_train = copy.deepcopy(coco)
        coco_test = copy.deepcopy(coco)
        coco_val = copy.deepcopy(coco)
        uniq = 0
        for book_json in os.listdir(json_dir):
            book = str(book_json).split(".")[0]
            with open(base_dir + "json/" + book + ".json", "r", encoding="utf-8") as file:
                data = json.load(file)

            pages = data["page"]

            # (These per-split lists are populated below but never written out.)
            train_output = []
            val_output = []
            test_output = []

            # Pages 0 and 1 (covers) are skipped below, hence the -2.
            page_count = len(pages) - 2
            div = int(math.ceil(page_count / 10))
            # train_count = page_count - (2 * div)
            # val_count = div
            # test_count = div
            # print(train_count, val_count, test_count)
            # Roughly 80/10/10 split: counting back from the last page, the
            # first `div` pages go to test, the next `div` to val, the rest to train.
            train_arr = []
            val_arr = []
            test_arr = []
            count = 0
            for i in range(page_count, 0, -1):
                if count < div:
                    test_arr.append(i)
                elif count < 2 * div:
                    val_arr.append(i)
                else:
                    train_arr.append(i)
                count += 1

            # Annotation ID counter
            ann_id = 1

            # Helper to create a COCO bbox annotation
            def create_annotation(ann, category_name, image_id, ann_id, uniq):
                x_min = int(ann["@xmin"])
                y_min = int(ann["@ymin"])
                x_max = int(ann["@xmax"])
                y_max = int(ann["@ymax"])
                width = x_max - x_min
                height = y_max - y_min

                return {
                    "id": ann_id,
                    "image_id": uniq,
                    "category_id": category_map[category_name],
                    "bbox": [x_min, y_min, width, height],
                    "area": width * height,
                    "iscrowd": 0
                }

            def append_data(coco, ann_id, uniq):
                # Add image entry
                coco["images"].append({
                    "id": uniq,
                    "width": page["@width"],
                    "height": page["@height"],
                    "file_name": f"{book}_{str(img_id).zfill(3)}.jpg"
                })

                # Loop through each category and add annotations
                for category in category_names:
                    for ann in page.get(category, []):
                        coco["annotations"].append(
                            create_annotation(ann, category, img_id, ann_id, uniq)
                        )
                        ann_id += 1
                # Return the counter so annotation IDs stay unique across pages.
                return ann_id

            for page in pages:
                uniq += 1
                img_id = int(page["@index"])
                if img_id == 0 or img_id == 1:
                    continue

                curr_img_path = os.path.join(config["manga109_root_dir"], "images", book, f"{str(img_id).zfill(3)}.jpg")
                # os.path.basename is portable; splitting on "\\" only works on Windows.
                img_name = book + "_" + os.path.basename(curr_img_path)
                uniq_filename_mapping[uniq] = img_name
                if img_id in train_arr:
                    ann_id = append_data(coco_train, ann_id, uniq)
                    train_output.append(coco)
                    shutil.copyfile(curr_img_path, os.path.join(image_dir, "train/", img_name))
                if img_id in val_arr:
                    ann_id = append_data(coco_val, ann_id, uniq)
                    val_output.append(coco)
                    shutil.copyfile(curr_img_path, os.path.join(image_dir, "val/", img_name))
                if img_id in test_arr:
                    ann_id = append_data(coco_test, ann_id, uniq)
                    test_output.append(coco)
                    shutil.copyfile(curr_img_path, os.path.join(image_dir, "test/", img_name))

        with open(base_dir + "temp_uniq_file_name_mapping.json", "w", encoding="utf-8") as f:
            json.dump(uniq_filename_mapping, f, ensure_ascii=False, indent=2)
        with open(annotations_dir + "instances_train.json", "w", encoding="utf-8") as f:
            json.dump(coco_train, f, ensure_ascii=False, indent=2)
        with open(annotations_dir + "instances_val.json", "w", encoding="utf-8") as f:
            json.dump(coco_val, f, ensure_ascii=False, indent=2)
        with open(annotations_dir + "instances_test.json", "w", encoding="utf-8") as f:
            json.dump(coco_test, f, ensure_ascii=False, indent=2)

    create_coco(base_dir + "json/")
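A minimal sketch of invoking the converter (the config keys mirror those read at the top of `json_to_coco`; the paths are illustrative and the path strings are concatenated, so they should keep their trailing slashes):

from utils.json_to_coco import json_to_coco

config = {
    "root": "./data/",                        # expects ./data/json/<Book>.json
    "root_images": "./data/images/",          # train/ val/ test/ subfolders must exist
    "root_annotations": "./data/annotations/",
    "manga109_root_dir": "./Manga109/",
}
json_to_coco(config)  # writes instances_train/val/test.json and copies page images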
utils/json_to_gcn.py
ADDED
	@@ -0,0 +1,484 @@
"""
Enhanced Manga Panel Dataset Converter: Extract frames as panels and prepare GCN-ready dataset.
Crops each frame from manga pages and creates a dataset with the elements within each panel.
"""

import json
import os
import argparse
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
import difflib
import math
import cv2
import numpy as np


class MangaPanelDatasetConverter:
    def __init__(self, fuzzy_mapping: bool = True, proximity_threshold: float = 200.0):
        self.fuzzy_mapping = fuzzy_mapping
        self.proximity_threshold = proximity_threshold
        self.character_mapping = {}
        self.stats = {
            'total_pages': 0,
            'total_panels': 0,
            'total_bubbles': 0,
            'total_faces': 0,
            'total_bodies': 0,
            'successful_links': 0,
            'failed_links': 0,
            'unique_characters': set(),
            'extracted_panels': 0
        }

    def load_manga_data(self, file_path: str) -> Dict[str, Any]:
        """Load manga data from JSON file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def build_character_mapping(self, character_list: List[Dict]) -> Dict[str, str]:
        """Build character ID to name mapping from character list."""
        mapping = {}
        for char in character_list:
            mapping_id = char.get('@id', '')
            char_name = char.get('@name', '')
            if mapping_id and char_name:
                mapping[mapping_id] = char_name
        return mapping

    def check_overlap(self, bbox1: List[int], bbox2: List[int]) -> bool:
        """Check if two bounding boxes overlap."""
        x1_min, y1_min, x1_max, y1_max = bbox1
        x2_min, y2_min, x2_max, y2_max = bbox2

        return not (x1_max <= x2_min or x2_max <= x1_min or
                    y1_max <= y2_min or y2_max <= y1_min)

    def is_element_in_frame(self, element_bbox: List[int], frame_bbox: List[int],
                            overlap_threshold: float = 0.5) -> bool:
        """
        Check if an element (face, body, text) is within or significantly overlaps with a frame.

        Args:
            element_bbox: [xmin, ymin, xmax, ymax] of the element
            frame_bbox: [xmin, ymin, xmax, ymax] of the frame
            overlap_threshold: Minimum overlap ratio to consider element as belonging to frame
        """
        if not self.check_overlap(element_bbox, frame_bbox):
            return False

        # Calculate intersection area
        x_left = max(element_bbox[0], frame_bbox[0])
        y_top = max(element_bbox[1], frame_bbox[1])
        x_right = min(element_bbox[2], frame_bbox[2])
        y_bottom = min(element_bbox[3], frame_bbox[3])

        if x_right <= x_left or y_bottom <= y_top:
            return False

        intersection_area = (x_right - x_left) * (y_bottom - y_top)
        element_area = (element_bbox[2] - element_bbox[0]) * (element_bbox[3] - element_bbox[1])

        # Check if significant portion of element is within frame
        overlap_ratio = intersection_area / element_area if element_area > 0 else 0
        return overlap_ratio >= overlap_threshold
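    # Worked example (illustrative numbers): a face box [90, 100, 190, 200]
    # against a frame [100, 100, 400, 300] intersects in [100, 100, 190, 200],
    # i.e. 90 * 100 = 9000 px²; the face's own area is 100 * 100 = 10000 px²,
    # so overlap_ratio = 0.9 >= 0.5 and the face is assigned to that frame.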

    def adjust_coordinates_to_frame(self, element_bbox: List[int], frame_bbox: List[int]) -> List[int]:
        """Adjust element coordinates to be relative to the cropped frame."""
        frame_x_min, frame_y_min = frame_bbox[0], frame_bbox[1]

        adjusted_bbox = [
            element_bbox[0] - frame_x_min,  # xmin
            element_bbox[1] - frame_y_min,  # ymin
            element_bbox[2] - frame_x_min,  # xmax
            element_bbox[3] - frame_y_min   # ymax
        ]

        # Ensure coordinates are not negative
        adjusted_bbox = [max(0, coord) for coord in adjusted_bbox]

        return adjusted_bbox

    def extract_panel_from_page(self, image_path: str, frame_data: Dict,
                                output_dir: str, page_index: int, frame_index: int) -> Optional[str]:
        """
        Extract a single panel (frame) from a manga page image.

        Args:
            image_path: Path to the manga page image
            frame_data: Frame annotation with coordinates
            output_dir: Directory to save extracted panel
            page_index: Index of the current page
            frame_index: Index of the frame within the page

        Returns:
            Path to extracted panel image, or None on failure
        """
        # print(image_path)
        # Load the image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not load image {image_path}")
            return None

        try:
            # Get frame coordinates
            xmin = int(frame_data.get('@xmin', 0))
            ymin = int(frame_data.get('@ymin', 0))
            xmax = int(frame_data.get('@xmax', image.shape[1]))
            ymax = int(frame_data.get('@ymax', image.shape[0]))

            # Validate and clip coordinates
            xmin = max(0, xmin)
            ymin = max(0, ymin)
            xmax = min(image.shape[1], xmax)
            ymax = min(image.shape[0], ymax)

            if xmax <= xmin or ymax <= ymin:
                print(f"Warning: Invalid frame coordinates for page {page_index}, frame {frame_index}")
                return None

            # Crop the panel
            cropped_panel = image[ymin:ymax, xmin:xmax]

            # Generate panel filename
            panel_filename = f"page_{page_index:04d}_panel_{frame_index:03d}.jpg"
            panel_path = os.path.join(output_dir, panel_filename)

            # Save the cropped panel
            success = cv2.imwrite(panel_path, cropped_panel)
            if success:
                self.stats['extracted_panels'] += 1
                # print(f"Extracted panel: {panel_path}")
                return panel_path
            else:
                print(f"Warning: Failed to save panel {panel_path}")
                return None

        except Exception as e:
            print(f"Error extracting panel from page {page_index}, frame {frame_index}: {str(e)}")
            return None

    def calculate_distance(self, bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between centers of two bounding boxes."""
        cx1, cy1 = (bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2
        cx2, cy2 = (bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2
        return math.sqrt((cx1 - cx2)**2 + (cy1 - cy2)**2)

    def find_character_by_body_id(self, body_id: str, bodies: List[Dict]) -> Optional[str]:
        """Find character ID associated with a body ID."""
        for body in bodies:
            if body.get('@id') == body_id:
                return body.get('@character')
        return None

    def find_face_by_character(self, character_id: str, faces: List[Dict]) -> Optional[int]:
        """Find face index by character ID."""
        for i, face in enumerate(faces):
            if face.get('@character') == character_id:
                return i
        return None

    def find_closest_face(self, bubble_bbox: List[int], faces: List[Dict]) -> Optional[int]:
        """Find the closest face to a bubble based on spatial proximity."""
        if not faces:
            return None

        min_distance = float('inf')
        closest_face_idx = None

        for i, face in enumerate(faces):
            face_bbox = [
                face['bbox'][0], face['bbox'][1],
                face['bbox'][2], face['bbox'][3]
            ]
            distance = self.calculate_distance(bubble_bbox, face_bbox)

            if distance < min_distance and distance <= self.proximity_threshold:
                min_distance = distance
                closest_face_idx = i

        return closest_face_idx

    def associate_bubble_to_face(self, text_item: Dict, faces: List[Dict],
                                 bodies: List[Dict], original_bodies: List[Dict]) -> Optional[int]:
        """Associate a speech bubble to a face using multiple strategies."""
        bubble_bbox = text_item['bbox']

        # Strategy 1: Direct body ID to character mapping
        body_id = text_item.get('body_ref')
        if body_id:
            character_id = self.find_character_by_body_id(body_id, original_bodies)
            if character_id:
                face_idx = self.find_face_by_character(character_id, faces)
                if face_idx is not None:
                    return face_idx

        # Strategy 2: Spatial proximity - find the closest face
        closest_face_idx = self.find_closest_face(bubble_bbox, faces)
        if closest_face_idx is not None:
            return closest_face_idx

        # Strategy 3: Single face fallback
        if len(faces) == 1:
            return 0

        return None

    def process_panel(self, page_data: Dict, frame_data: Dict, page_index: int,
                      frame_index: int, character_mapping: Dict[str, str],
                      image_path: str = None, panels_output_dir: str = None) -> Dict[str, Any]:
        """Process a single panel (frame) and create a GCN dataset entry."""

        # Extract panel image if paths provided
        panel_image_path = None
        if image_path and panels_output_dir:
            panel_image_path = self.extract_panel_from_page(
                image_path, frame_data, panels_output_dir, page_index, frame_index
            )

        # Get frame bounding box (cast to int: the Manga109 attributes may arrive as strings)
        frame_bbox = [
            int(frame_data['@xmin']), int(frame_data['@ymin']),
            int(frame_data['@xmax']), int(frame_data['@ymax'])
        ]

        panel_width = frame_bbox[2] - frame_bbox[0]
        panel_height = frame_bbox[3] - frame_bbox[1]

        # Find elements within this frame
        panel_faces = []
        panel_bodies = []
        panel_bubbles = []

        # Process faces
        for face in page_data.get('face', []):
            face_bbox = [int(face['@xmin']), int(face['@ymin']), int(face['@xmax']), int(face['@ymax'])]
            if self.is_element_in_frame(face_bbox, frame_bbox):
                adjusted_bbox = self.adjust_coordinates_to_frame(face_bbox, frame_bbox)
                character_id = face.get('@character', '')
                character_name = character_mapping.get(character_id, character_id)

                panel_faces.append({
                    "face_id": len(panel_faces),
                    "bbox": adjusted_bbox,
                    "original_id": face.get('@id', ''),
                    "attributes": {
                        "character_id": character_id,
                        "character_name": character_name
                    }
                })

                if character_id:
                    self.stats['unique_characters'].add(character_id)

        # Process bodies
        for body in page_data.get('body', []):
            body_bbox = [int(body['@xmin']), int(body['@ymin']), int(body['@xmax']), int(body['@ymax'])]
            if self.is_element_in_frame(body_bbox, frame_bbox):
                adjusted_bbox = self.adjust_coordinates_to_frame(body_bbox, frame_bbox)
                character_id = body.get('@character', '')
                character_name = character_mapping.get(character_id, character_id)

                panel_bodies.append({
                    "body_id": len(panel_bodies),
                    "bbox": adjusted_bbox,
                    "original_id": body.get('@id', ''),
                    "attributes": {
                        "character_id": character_id,
                        "character_name": character_name
                    }
                })

        # Process text/speech bubbles
        for text in page_data.get('text', []):
            text_bbox = [int(text['@xmin']), int(text['@ymin']), int(text['@xmax']), int(text['@ymax'])]
            if self.is_element_in_frame(text_bbox, frame_bbox):
                adjusted_bbox = self.adjust_coordinates_to_frame(text_bbox, frame_bbox)

                panel_bubbles.append({
                    "bubble_id": len(panel_bubbles),
                    "bbox": adjusted_bbox,
                    "text": text.get('#text', ''),
                    "original_id": text.get('@id', ''),
                    "body_ref": text.get('@bodyid', '')
                })

        # Create bubble-to-face links
        links = []
        original_bodies = page_data.get('body', [])

        for bubble in panel_bubbles:
            face_idx = self.associate_bubble_to_face(
                bubble, panel_faces, panel_bodies, original_bodies
            )
            if face_idx is not None:
                links.append({
                    "bubble_id": bubble['bubble_id'],
                    "face_id": face_idx
                })
                self.stats['successful_links'] += 1
            else:
                self.stats['failed_links'] += 1

        # Update statistics
        self.stats['total_panels'] += 1
        self.stats['total_bubbles'] += len(panel_bubbles)
        self.stats['total_faces'] += len(panel_faces)
        self.stats['total_bodies'] += len(panel_bodies)

        # Create panel ID
        panel_id = f"page_{page_index:04d}_panel_{frame_index:03d}"

        return {
            "panel_id": panel_id,
            "width": panel_width,
            "height": panel_height,
            "panel_image_path": panel_image_path,
            "bubbles": panel_bubbles,
            "faces": panel_faces,
            "bodies": panel_bodies,
            "links": links,
            "metadata": {
                "original_page_index": page_index,
                "original_frame_index": frame_index,
                "original_frame_id": frame_data.get('@id', ''),
                "original_frame_bbox": frame_bbox
            }
        }

    def convert_dataset(self, input_file: str, output_file: str,
                        image_dir: str = None, panels_output_dir: str = None) -> Dict[str, Any]:
        """Convert manga dataset to panel-based GCN format."""
        print(f"Loading manga data from: {input_file}")
        manga_data = self.load_manga_data(input_file)

        # Extract title and character mapping
        title = manga_data.get('title', 'Unknown')
        character_list = manga_data.get('character', [])
        character_mapping = self.build_character_mapping(character_list)

        print(f"Title: {title}")
        print(f"Characters found: {len(character_mapping)}")

        # Create panels output directory if specified
        if panels_output_dir:
            os.makedirs(panels_output_dir, exist_ok=True)
            print(f"Panel extraction enabled. Output directory: {panels_output_dir}")

        # Process all pages and frames
        pages = manga_data.get('page', [])
        converted_panels = []

        for page_idx, page_data in enumerate(pages):
            self.stats['total_pages'] += 1

            # Construct image path if an image directory was provided
            image_path = os.path.join(image_dir, f"{str(page_data.get('@index', page_idx)).zfill(3)}.jpg")

            # Process each frame in the page
            frames = page_data.get('frame', [])
            for frame_idx, frame_data in enumerate(frames):
                panel = self.process_panel(
                    page_data, frame_data, page_idx, frame_idx,
                    character_mapping, image_path, panels_output_dir
                )
                converted_panels.append(panel)

        # Create final dataset
        dataset = {
            "title": title,
            "character_mapping": character_mapping,
            "panels": converted_panels,
            "conversion_stats": {
                "total_pages": self.stats['total_pages'],
                "total_panels": self.stats['total_panels'],
                "total_bubbles": self.stats['total_bubbles'],
                "total_faces": self.stats['total_faces'],
                "total_bodies": self.stats['total_bodies'],
                "successful_links": self.stats['successful_links'],
                "failed_links": self.stats['failed_links'],
                "extracted_panels": self.stats['extracted_panels'],
                "unique_characters": len(self.stats['unique_characters']),
                "link_success_rate": (self.stats['successful_links'] /
                                      max(1, self.stats['total_bubbles'])) * 100
            }
        }

        # Save converted dataset
        print(f"Saving converted dataset to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, ensure_ascii=False, indent=2)

        # Print statistics
        self.print_conversion_stats()

        return dataset

    def print_conversion_stats(self):
        """Print conversion statistics."""
        print("\n=== Conversion Statistics ===")
        print(f"Total pages processed: {self.stats['total_pages']}")
        print(f"Total panels extracted: {self.stats['total_panels']}")
        print(f"Total speech bubbles: {self.stats['total_bubbles']}")
        print(f"Total faces: {self.stats['total_faces']}")
        print(f"Total bodies: {self.stats['total_bodies']}")
        print(f"Successful links: {self.stats['successful_links']}")
        print(f"Failed links: {self.stats['failed_links']}")
        print(f"Panel images extracted: {self.stats['extracted_panels']}")
        print(f"Unique characters: {len(self.stats['unique_characters'])}")
        if self.stats['total_bubbles'] > 0:
            success_rate = (self.stats['successful_links'] / self.stats['total_bubbles']) * 100
            print(f"Link success rate: {success_rate:.1f}%")
        print("=" * 30)


def json_to_gcn(config):
    # !python ./frame.py ../json/AkkeraKanjinchou.json ./output2.json --image-dir ../Manga109/images/AkkeraKanjinchou/ --panels-output-dir ./frames1/
    # parser = argparse.ArgumentParser(description='Convert manga dataset to panel-based GCN format')
    # parser.add_argument('input_file', help='Input JSON file with manga annotations')
    # parser.add_argument('output_file', help='Output JSON file for GCN dataset')
    # parser.add_argument('--image-dir', type=str, required=True,
    #                     help='Directory containing manga page images')
    # parser.add_argument('--panels-output-dir', type=str, required=True,
    #                     help='Directory to save extracted panel images')
    # parser.add_argument('--fuzzy-mapping', action='store_true', default=True,
    #                     help='Enable fuzzy ID matching (default: True)')
    # parser.add_argument('--proximity-threshold', type=float, default=200.0,
    #                     help='Maximum distance for spatial proximity matching (default: 200.0)')
    # parser.add_argument('--overlap-threshold', type=float, default=0.5,
    #                     help='Minimum overlap ratio for element-frame association (default: 0.5)')

    # args = parser.parse_args()
    base_dir = config["root"]
    manga109_root_dir = config["manga109_root_dir"]
    root_json = config["root_json"]

    for book_json in os.listdir(root_json):
        book_name = str(book_json).split(".")[0]
        panel_dir = os.path.join(base_dir, "panels", book_name)
        if not os.path.exists(panel_dir):
            os.makedirs(panel_dir)

        # Convert dataset
        converter = MangaPanelDatasetConverter(
            # fuzzy_mapping=args.fuzzy_mapping,
            # proximity_threshold=args.proximity_threshold
        )
        input_file = os.path.join(root_json, book_json)
        output_file = os.path.join(base_dir, "panel_data", book_json)
        image_dir = os.path.join(manga109_root_dir, "images", book_name)
        try:
            result = converter.convert_dataset(
                input_file,
                output_file,
                image_dir,
                panel_dir
            )
            print("\nConversion completed successfully!")
            print(f"GCN dataset saved to: {output_file}")
            print(f"Panel images saved to: {panel_dir}")

        except Exception as e:
            print(f"Error during conversion: {str(e)}")
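A minimal sketch of driving the converter directly for a single book, following the commented CLI example inside `json_to_gcn` (paths are illustrative, Manga109-style):

from utils.json_to_gcn import MangaPanelDatasetConverter

converter = MangaPanelDatasetConverter(proximity_threshold=200.0)
converter.convert_dataset(
    "json/AkkeraKanjinchou.json",         # Manga109 annotations converted to JSON
    "panel_data/AkkeraKanjinchou.json",   # GCN-ready output
    image_dir="Manga109/images/AkkeraKanjinchou",
    panels_output_dir="panels/AkkeraKanjinchou",
)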
utils/panel_ordering.py
ADDED
	@@ -0,0 +1,258 @@
import cv2

class BoundingBox(object):
    def __init__(self, xmin=None, ymin=None, xmax=None, ymax=None,
                 panels=None,
                 bbtype=None,
                 id_=""):
        self.dict = {
            "@xmin": xmin,
            "@ymin": ymin,
            "@xmax": xmax,
            "@ymax": ymax,
            "@id": id_,
        }
        if panels is None:
            self.panels = [self]
        else:
            self.panels = panels
        self.bbtype = bbtype

    def init_dict(self, d):
        self.dict = d
        self.dict["@xmin"] = float(self.xmin)
        self.dict["@ymin"] = float(self.ymin)
        self.dict["@xmax"] = float(self.xmax)
        self.dict["@ymax"] = float(self.ymax)
        return self

    def __getitem__(self, item):
        return self.dict[item]

    @property
    def xmin(self):
        return self.dict["@xmin"]

    @property
    def xmax(self):
        return self.dict["@xmax"]

    @property
    def ymin(self):
        return self.dict["@ymin"]

    @property
    def ymax(self):
        return self.dict["@ymax"]

    @property
    def width(self):
        return self.xmax - self.xmin

    @property
    def height(self):
        return self.ymax - self.ymin

    @property
    def text(self):
        return self.dict["#text"]

    @property
    def id(self):
        return self.dict["@id"]

    @property
    def list(self):
        return [self.xmin, self.ymin, self.xmax, self.ymax]

    @property
    def is_null(self):
        return self.xmin is None or self.ymin is None or self.xmax is None or self.ymax is None

    @property
    def area(self):
        if self.xmax is None or self.xmin is None or self.ymax is None or self.ymin is None:
            return 0
        return (self.xmax - self.xmin) * (self.ymax - self.ymin)

    @property
    def base_panels(self):
        return len(self.panels)

    def __add__(self, a):
        # Union: the smallest box covering both operands.
        assert issubclass(type(a), BoundingBox)
        if a.is_null:
            return self
        elif self.is_null:
            return a
        return BoundingBox(xmin=min(self.xmin, a.xmin),
                           ymin=min(self.ymin, a.ymin),
                           xmax=max(self.xmax, a.xmax),
                           ymax=max(self.ymax, a.ymax),
                           panels=self.panels + a.panels)

    def __mul__(self, a):
        # Intersection: the overlap of both operands (null box if disjoint).
        assert issubclass(type(a), BoundingBox)
        bb = BoundingBox(xmin=max(self.xmin, a.xmin),
                         ymin=max(self.ymin, a.ymin),
                         xmax=min(self.xmax, a.xmax),
                         ymax=min(self.ymax, a.ymax),
                         panels=self.panels + a.panels)
        if bb.xmin > bb.xmax or bb.ymin > bb.ymax:
            return BoundingBox()
        else:
            return bb

    def __repr__(self):
        return "<BoundingBox({},{}) {},{},{},{},{}>".format(self.bbtype, self.id, *self.list, self.base_panels)


def get_pivot_side(zmin, zmax, pivot):
    # Returns 0 if the interval lies mostly before the pivot, 1 if after,
    # and -1 if the pivot cuts too deeply into the interval to call either side.
    interception_ratio_threshold = 0.25
    if pivot <= zmin:
        return 1
    elif zmax <= pivot:
        return 0
    else:
        pivot_z_ratio = (pivot - zmin) / (zmax - zmin)
        interception_ratio = min(pivot_z_ratio, 1 - pivot_z_ratio)

        if interception_ratio > interception_ratio_threshold:
            return -1
        else:
            return 0 if pivot_z_ratio > 0.5 else 1

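# Example: a box spanning y in [0, 100] split at pivot y=95 gives
# pivot_z_ratio = 0.95 and interception_ratio = 0.05 <= 0.25, so the box is
# counted on side 0 (before the pivot); a pivot at y=50 cuts the box in half,
# interception_ratio = 0.5 > 0.25, and get_pivot_side returns -1 (no clean cut).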

class BoxSet(set):
    def get_highest_priority_division(self):
        # Horizontal division
        ydivs = sorted([bb.ymin for bb in self] + [bb.ymax for bb in self])
        for pivot in ydivs:
            division = self.get_pivot_division(pivot,
                                               is_horizontal_division=True)
            if len(division) > 1:
                return division

        # Vertical division (right-to-left, matching manga reading order)
        xdivs = sorted([bb.xmin for bb in self] + [bb.xmax for bb in self], reverse=True)
        for pivot in xdivs:
            division = self.get_pivot_division(pivot,
                                               is_horizontal_division=False)
            if len(division) > 1:
                return division

        # Undividable box set
        return [self]

    def get_pivot_division(self, pivot, is_horizontal_division):
        divs = [BoxSet(), BoxSet()]
        for bb in self:
            if is_horizontal_division:
                side = get_pivot_side(bb.ymin, bb.ymax, pivot)
            else:
                # Negate x so that "before the pivot" means "to the right".
                side = get_pivot_side(-bb.xmax, -bb.xmin, -pivot)

            if side == -1:
                return [self]
            else:
                divs[side].add(bb)
        if len(divs[0]) == 0 or len(divs[1]) == 0:
            return [self]
        return divs

    def get_multicut_division(self, cuts):
        curset = self
        cur_division = []
        for cut in cuts:
            pivot, is_horizontal_division = cut
            division = curset.get_pivot_division(pivot, is_horizontal_division)
            if len(division) > 1:
                cur_division.append(division[0])
                curset = division[1]
        if len(cur_division) > 0:
            return cur_division + [curset]
        else:
            return [self]

    def yield_ordered_bbs(self):
        if len(self) == 0:
            pass
        elif len(self) > 1:
            # Multiple undividable boxes: yield their union, flagged as unordered.
            yield self.sum(), False
        else:
            yield next(iter(self)), True

    def sum(self):
        if len(self) == 0:
            return BoundingBox()
        else:
            l = list(self)
            return sum(l[1:], l[0])


class BoxNode(object):
    def __init__(self, bbset, initial_cuts=None):
        if initial_cuts:
            division = bbset.get_multicut_division(initial_cuts)
        else:
            division = [bbset]

        if len(division) == 1:
            division = bbset.get_highest_priority_division()

        isLeaf = len(division) <= 1
        self.division = division if isLeaf else [BoxNode(section) for section in division]

    def yield_ordered_bbs(self):
        for section in self.division:
            for bb in section.yield_ordered_bbs():
                yield bb


class BoxOrderEstimator(object):
    def __init__(self, bbs, pagewidth=None, initial_cut_option=None):
        if initial_cut_option == "two-page-four-panel":
            initial_cuts = [(pagewidth * n / 4, False)
                            for n in reversed(range(1, 4))]
        elif initial_cut_option == "two-page":
            initial_cuts = [(pagewidth / 2, False)]
        else:
            initial_cuts = None

        self.boxnode = BoxNode(BoxSet(bbs), initial_cuts)
        t = tuple(zip(*self.boxnode.yield_ordered_bbs()))
        if len(t) > 0:
            self.ordered_bbs, self.bb_estimation_statuses = t
        else:
            self.ordered_bbs, self.bb_estimation_statuses = (), ()


def panel_ordering(test_image, dets):
    image = cv2.imread(test_image)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for display
    # interception_ratio_threshold = 0.25

    # print(predictions_frame)

    panels = set()
    # for panel in dets['frame']:
    for panel in dets.panels:
        panel = panel['bbox']
        panels.add(BoundingBox(panel[0], panel[1], panel[2], panel[3]))
    # print(panels)

    # image = page.get_image()
    pageheight, pagewidth, pagechannels = image.shape

    # panels = page.get_bbs()["frame"]
    # print(panels)
    boxOrderEstimator = BoxOrderEstimator(
        panels,
        pagewidth=pagewidth,
        initial_cut_option="two-page")

    return boxOrderEstimator
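A small sketch of the recursive-cut ordering on its own, with hand-made boxes (coordinates are illustrative):

from utils.panel_ordering import BoundingBox, BoxOrderEstimator

# Two panels side by side on a 200px-wide page: manga order reads right first.
right = BoundingBox(110, 0, 200, 100)
left = BoundingBox(0, 0, 100, 100)
est = BoxOrderEstimator([right, left], pagewidth=200, initial_cut_option="two-page")
for bb, resolved in zip(est.ordered_bbs, est.bb_estimation_statuses):
    print(bb.list, resolved)   # the right panel prints before the left one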
utils/train_speaker.py
ADDED
	@@ -0,0 +1,385 @@
#!/usr/bin/env python3
"""
Complete GCN Training Pipeline for Speech Bubble to Speaker Association
Fixed version that handles the dataset format correctly and resolves training issues.
"""

import json
import torch
import numpy as np
import random
from torch_geometric.data import HeteroData, Batch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment
from typing import Dict, List, Any, Optional, Tuple
# from utils.utilities import save_checkpoint
import os

from pathlib import Path
CHECKPOINT_DIR = Path("checkpoints")
CHECKPOINT_DIR.mkdir(exist_ok=True)

def save_checkpoint(model: torch.nn.Module,
                    epoch: int,
                    loss: float,
                    path: Path = CHECKPOINT_DIR / "assoc_gcn.pt") -> None:
    """
    Persist full training state so you can resume fine-tuning later.
    """
    path = Path(path)
    torch.save({
        "epoch": epoch,
        "loss": loss,
        "model_state": model.state_dict()
    }, path)
    print(f"✅ Model checkpoint saved to {path.resolve()}")

class DatasetLoader:
    """Handles loading and preprocessing of the converted GCN dataset"""

    @staticmethod
    def load_converted_dataset(json_path: str) -> List[HeteroData]:
        """Load the converted GCN dataset and create PyTorch Geometric HeteroData objects"""
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        dataset = []
        panels = data.get('panels', [])

        for panel in panels:
            het_data = DatasetLoader.create_hetero_data_from_panel(panel)
            if het_data is not None:
                dataset.append(het_data)

        print(f"Loaded {len(dataset)} panels from {json_path}")
        return dataset

    @staticmethod
    def create_hetero_data_from_panel(panel: Dict) -> Optional[HeteroData]:
        """Convert a single panel from the converted dataset into HeteroData format"""
        bubbles = panel.get('bubbles', [])
        faces = panel.get('faces', [])
        links = panel.get('links', [])

        if len(bubbles) == 0 or len(faces) == 0:
            return None

        W, H = panel['width'], panel['height']

        # Create node features
        bubble_features = []
        for bubble in bubbles:
            x1, y1, x2, y2 = bubble['bbox']
            cx, cy = (x1 + x2) / (2 * W), (y1 + y2) / (2 * H)
            w, h = (x2 - x1) / W, (y2 - y1) / H
            area = w * h
            aspect = w / h if h > 0 else 1.0
            bubble_features.append([cx, cy, w, h, area, aspect])

        face_features = []
        for face in faces:
            x1, y1, x2, y2 = face['bbox']
            cx, cy = (x1 + x2) / (2 * W), (y1 + y2) / (2 * H)
            w, h = (x2 - x1) / W, (y2 - y1) / H
            area = w * h
            aspect = w / h if h > 0 else 1.0
            face_features.append([cx, cy, w, h, area, aspect])

        # Create edge indices and features
        edge_indices, edge_features, edge_labels = [], [], []

        # Create mappings
        bubble_id_to_idx = {bubble['bubble_id']: i for i, bubble in enumerate(bubbles)}
        face_id_to_idx = {face['face_id']: i for i, face in enumerate(faces)}

        # Create ground truth mapping
        gt_links = {}
        for link in links:
            if link['bubble_id'] in bubble_id_to_idx and link['face_id'] in face_id_to_idx:
                bubble_idx = bubble_id_to_idx[link['bubble_id']]
                face_idx = face_id_to_idx[link['face_id']]
                gt_links[(bubble_idx, face_idx)] = 1

        # Create all possible bubble-face edges
        for i, bubble in enumerate(bubbles):
            for j, face in enumerate(faces):
                # Calculate edge features
                b_x1, b_y1, b_x2, b_y2 = bubble['bbox']
                f_x1, f_y1, f_x2, f_y2 = face['bbox']

                b_cx, b_cy = (b_x1 + b_x2) / (2 * W), (b_y1 + b_y2) / (2 * H)
                f_cx, f_cy = (f_x1 + f_x2) / (2 * W), (f_y1 + f_y2) / (2 * H)

                dx, dy = b_cx - f_cx, b_cy - f_cy
                dist = (dx**2 + dy**2)**0.5

                # Calculate IoU
                xx1, yy1 = max(b_x1, f_x1), max(b_y1, f_y1)
                xx2, yy2 = min(b_x2, f_x2), min(b_y2, f_y2)
                inter = max(0, xx2 - xx1) * max(0, yy2 - yy1)
                union = (b_x2 - b_x1) * (b_y2 - b_y1) + (f_x2 - f_x1) * (f_y2 - f_y1) - inter
                iou = inter / union if union > 0 else 0

                edge_indices.append([i, j])
                edge_features.append([dx, dy, dist, iou])
                edge_labels.append(1.0 if (i, j) in gt_links else 0.0)

        if len(edge_indices) == 0:
            return None

        # Create HeteroData object
        data = HeteroData()
        data['bubble'].x = torch.tensor(bubble_features, dtype=torch.float)
        data['face'].x = torch.tensor(face_features, dtype=torch.float)
        data['bubble', 'to', 'face'].edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
        data['bubble', 'to', 'face'].edge_attr = torch.tensor(edge_features, dtype=torch.float)
        data['bubble', 'to', 'face'].edge_label = torch.tensor(edge_labels, dtype=torch.float)

        # Add metadata
        data.panel_id = panel['panel_id']
        data.width = W
        data.height = H

        return data


class AssocGCN(nn.Module):
    """Graph Convolutional Network for Speech Bubble to Speaker Association"""

    def __init__(self, in_feats: int = 6, hid: int = 128):
        super().__init__()
        self.node_encoder = nn.Sequential(
            nn.Linear(in_feats, hid),
            nn.ReLU(),
            nn.Linear(hid, hid)
        )

        # Message passing layers
        self.conv1 = nn.Sequential(
            nn.Linear(hid * 2 + 4, hid),  # node features + edge features
            nn.ReLU(),
            nn.Linear(hid, hid)
        )

        self.conv2 = nn.Sequential(
            nn.Linear(hid * 2 + 4, hid),
            nn.ReLU(),
            nn.Linear(hid, hid)
        )

        self.conv3 = nn.Sequential(
            nn.Linear(hid * 2 + 4, hid),
            nn.ReLU(),
            nn.Linear(hid, hid)
        )

        # Edge classifier
        self.edge_mlp = nn.Sequential(
            nn.Linear(2 * hid + 4, hid),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hid, 1)
        )

    def forward(self, data):
        bubble_x = self.node_encoder(data['bubble'].x)
        face_x = self.node_encoder(data['face'].x)

        edge_index = data['bubble', 'to', 'face'].edge_index
        edge_attr = data['bubble', 'to', 'face'].edge_attr
        src_idx, dst_idx = edge_index[0], edge_index[1]

        # Apply message passing
        for conv in [self.conv1, self.conv2, self.conv3]:
            src_features = bubble_x[src_idx]
            dst_features = face_x[dst_idx]

            edge_input = torch.cat([src_features, dst_features, edge_attr], dim=1)
            edge_updates = conv(edge_input)

            # Update node features (simplified aggregation)
            bubble_updates = torch.zeros_like(bubble_x)
            face_updates = torch.zeros_like(face_x)

            for i in range(len(src_idx)):
                s, d = src_idx[i].item(), dst_idx[i].item()
                bubble_updates[s] += edge_updates[i]
                face_updates[d] += edge_updates[i]

            # Normalize by degree
            bubble_degrees = torch.bincount(src_idx, minlength=bubble_x.size(0)).float().clamp(min=1)
            face_degrees = torch.bincount(dst_idx, minlength=face_x.size(0)).float().clamp(min=1)

            bubble_updates = bubble_updates / bubble_degrees.unsqueeze(1)
            face_updates = face_updates / face_degrees.unsqueeze(1)

            # Residual connection
            bubble_x = bubble_x + bubble_updates
            face_x = face_x + face_updates

        # Final edge prediction
        src_final = bubble_x[src_idx]
        dst_final = face_x[dst_idx]
        edge_input = torch.cat([src_final, dst_final, edge_attr], dim=1)
        logits = self.edge_mlp(edge_input).squeeze(-1)

        return logits


def hungarian_matching(scores: torch.Tensor, src_indices, dst_indices):
    """Apply the Hungarian algorithm for optimal bipartite matching"""
    if len(scores) == 0:
        return {}

    num_bubbles = src_indices.max().item() + 1 if len(src_indices) > 0 else 0
    num_faces = dst_indices.max().item() + 1 if len(dst_indices) > 0 else 0

    cost_matrix = np.full((num_bubbles, num_faces), 1e6, dtype=np.float32)

    scores_np = scores.detach().cpu().sigmoid().numpy()
    for i, (s, d, score) in enumerate(zip(src_indices.cpu(), dst_indices.cpu(), scores_np)):
        cost_matrix[s, d] = -score  # Negative for minimization

    row_indices, col_indices = linear_sum_assignment(cost_matrix)

    mapping = {}
    for r, c in zip(row_indices, col_indices):
        if cost_matrix[r, c] < 0:  # Valid assignment
            mapping[int(r)] = int(c)

    return mapping

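# Worked example (illustrative scores): with two bubbles and two faces where
# sigmoid scores are {(0,0): 0.9, (0,1): 0.2, (1,0): 0.8, (1,1): 0.3}, the
# cost matrix is [[-0.9, -0.2], [-0.8, -0.3]]; linear_sum_assignment picks
# (0 -> 0) and (1 -> 1) with total cost -1.2, i.e. each bubble gets a distinct
# face even though face 0 scores highest for both bubbles.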
| 252 |
+
|
| 253 |
+
def train_gcn(dataset: List[HeteroData], epochs: int = 200, batch_size: int = 16, lr: float = 1e-4):
|
| 254 |
+
"""Train the GCN model on the dataset"""
|
| 255 |
+
if len(dataset) == 0:
|
| 256 |
+
raise ValueError("Dataset is empty!")
|
| 257 |
+
|
| 258 |
+
print(f"Training on {len(dataset)} panels...")
|
| 259 |
+
|
| 260 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 261 |
+
model = AssocGCN().to(device)
|
| 262 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
|
| 263 |
+
|
| 264 |
+
# Calculate class weights
|
| 265 |
+
total_positive = sum(data['bubble', 'to', 'face'].edge_label.sum().item() for data in dataset)
|
| 266 |
+
total_edges = sum(len(data['bubble', 'to', 'face'].edge_label) for data in dataset)
|
| 267 |
+
pos_weight = (total_edges - total_positive) / total_positive if total_positive > 0 else 9.0
|
| 268 |
+
|
| 269 |
+
print(f"Positive edges: {total_positive}/{total_edges} ({100*total_positive/total_edges:.1f}%)")
|
| 270 |
+
print(f"Using pos_weight: {pos_weight:.2f}")
|
| 271 |
+
|
| 272 |
+
loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))
|
| 273 |
+
|
| 274 |
+
model.train()
|
| 275 |
+
best_loss = float("inf")
|
| 276 |
+
for epoch in range(epochs):
|
| 277 |
+
total_correct = 0
|
| 278 |
+
total_samples = 0
|
| 279 |
+
total_tp = 0
|
| 280 |
+
total_fp = 0
|
| 281 |
+
total_fn = 0
|
| 282 |
+
# FIXED: Properly shuffle the dataset (it's a list, not a dict)
|
| 283 |
+
shuffled_dataset = dataset.copy()
|
| 284 |
+
random.shuffle(shuffled_dataset)
|
| 285 |
+
|
| 286 |
+
total_loss = 0.0
|
| 287 |
+
num_batches = (len(shuffled_dataset) + batch_size - 1) // batch_size
|
| 288 |
+
|
| 289 |
+
for batch_idx in range(num_batches):
|
| 290 |
+
start_idx = batch_idx * batch_size
|
| 291 |
+
end_idx = min(start_idx + batch_size, len(shuffled_dataset))
|
| 292 |
+
batch_data = shuffled_dataset[start_idx:end_idx]
|
| 293 |
+
|
| 294 |
+
# Create batch
|
| 295 |
+
batch = Batch.from_data_list(batch_data).to(device)
|
| 296 |
+
|
| 297 |
+
# Forward pass
|
| 298 |
+
logits = model(batch)
|
| 299 |
+
labels = batch['bubble', 'to', 'face'].edge_label
|
| 300 |
+
|
| 301 |
+
# Compute loss
|
| 302 |
+
loss = loss_fn(logits, labels)
|
| 303 |
+
|
| 304 |
+
# Backward pass
|
| 305 |
+
optimizer.zero_grad()
|
| 306 |
+
loss.backward()
|
| 307 |
+
optimizer.step()
|
| 308 |
+
|
| 309 |
+
total_loss += loss.item() * len(batch_data)
|
| 310 |
+
|
| 311 |
+
# Accuracy computation
|
| 312 |
+
probs = torch.sigmoid(logits)
|
| 313 |
+
preds = (probs > 0.5).float()
|
| 314 |
+
correct = (preds == labels).sum().item()
|
| 315 |
+
total_correct += correct
|
| 316 |
+
total_samples += labels.numel()
|
| 317 |
+
|
| 318 |
+
avg_loss = total_loss / len(shuffled_dataset)
|
| 319 |
+
print(f"Epoch {epoch+1:02d}/{epochs}: Loss = {avg_loss:.4f}")
|
| 320 |
+
|
| 321 |
+
avg_loss = total_loss / len(shuffled_dataset)
|
| 322 |
+
accuracy = total_correct / total_samples
|
| 323 |
+
if avg_loss < best_loss:
|
| 324 |
+
best_loss = avg_loss
|
| 325 |
+
save_checkpoint(model, epoch+1, best_loss) # epoch is 0-indexed
|
| 326 |
+
|
| 327 |
+
# Compute recall and F1
|
| 328 |
+
recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
|
| 329 |
+
precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
|
| 330 |
+
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
|
| 331 |
+
|
| 332 |
+
print(f"Epoch {epoch+1:02d}/{epochs}: Loss = {avg_loss:.4f}, Accuracy = {accuracy:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
print("Training completed!")
|
| 336 |
+
return model
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def infer_associations(model, data):
|
| 340 |
+
"""Infer speech bubble to speaker associations"""
|
| 341 |
+
device = next(model.parameters()).device
|
| 342 |
+
data = data.to(device)
|
| 343 |
+
|
| 344 |
+
model.eval()
|
| 345 |
+
with torch.no_grad():
|
| 346 |
+
logits = model(data)
|
| 347 |
+
src, dst = data['bubble', 'to', 'face'].edge_index
|
| 348 |
+
mapping = hungarian_matching(logits, src, dst)
|
| 349 |
+
|
| 350 |
+
return mapping
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# Example usage and testing
|
| 354 |
+
def train_speaker(config):
|
| 355 |
+
# Test with sample data
|
| 356 |
+
dataset = []
|
| 357 |
+
for panel_data_file in os.listdir(os.path.join(config["root"]+"panel_data/")):
|
| 358 |
+
try:
|
| 359 |
+
print(panel_data_file)
|
| 360 |
+
# Load your converted dataset
|
| 361 |
+
dataset += DatasetLoader.load_converted_dataset(os.path.join(config["root"]+"panel_data/",panel_data_file))
|
| 362 |
+
|
| 363 |
+
# if len(dataset) == 0:
|
| 364 |
+
# print("No valid panels found in dataset!")
|
| 365 |
+
# else:
|
| 366 |
+
# # Train the model
|
| 367 |
+
# model = train_gcn(dataset, epochs=10, batch_size=16) # Reduced epochs for testing
|
| 368 |
+
|
| 369 |
+
# # Test inference on first panel
|
| 370 |
+
# test_data = dataset[0]
|
| 371 |
+
# print(test_data)
|
| 372 |
+
# mapping = infer_associations(model, test_data)
|
| 373 |
+
|
| 374 |
+
# print("\nInference Results:")
|
| 375 |
+
# for bubble_id, face_id in mapping.items():
|
| 376 |
+
# print(f"Bubble {bubble_id} → Face {face_id}")
|
| 377 |
+
|
| 378 |
+
except FileNotFoundError:
|
| 379 |
+
print("Error: ./output.json not found!")
|
| 380 |
+
print("Please ensure your converted dataset file exists.")
|
| 381 |
+
except Exception as e:
|
| 382 |
+
print(f"Error: {e}")
|
| 383 |
+
print("Please check your dataset format and file paths.")
|
| 384 |
+
|
| 385 |
+
model = train_gcn(dataset, epochs=30, batch_size=16) # Reduced epochs for testing
|
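
A quick sanity check for hungarian_matching, as a hedged sketch: the logits and indices are toy values, and the import path is an assumption based on this file's location.

# Toy check for hungarian_matching (hypothetical values).
import torch
from utils.train_speaker import hungarian_matching  # assumed import path

scores = torch.tensor([2.0, -1.0, 0.5])  # raw logits for three candidate edges
src = torch.tensor([0, 0, 1])            # bubble index of each edge
dst = torch.tensor([0, 1, 1])            # face index of each edge
print(hungarian_matching(scores, src, dst))  # expected: {0: 0, 1: 1}

Each bubble gets at most one face, so the strong 0-to-0 edge forces bubble 1 onto face 1 even though that edge's own score is modest.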
utils/transcript_page.py
ADDED
@@ -0,0 +1,37 @@
from utils.japanese_ocr import MangaOcr
import cv2
from PIL import Image

def transcript_page(test_image, translator, predictions):
    image = cv2.imread(test_image)
    mocr = MangaOcr()

    japanese_transcript = []
    english_transcript = []
    for i in range(len(predictions.panels)):
        cur_panel = [b for b in predictions.panels if b['order'] == i]
        # If subpanels exist, the number of detected panels can exceed the number of
        # reading-order slots, so some order values may have no match; skip those.
        if len(cur_panel) > 0:
            panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == cur_panel[0]['id']]
            panel_bubbles.sort(key=lambda b: b['seq'])
            for bubble in panel_bubbles:
                xmin, ymin, xmax, ymax = bubble['bbox']
                char_id = bubble.get('char_id')
                # OCR the bubble crop (converted BGR -> RGB for PIL)
                text = mocr(Image.fromarray(cv2.cvtColor(image[ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB)))
                print("Character", char_id, ":", text)
                japanese_transcript.append({"char_id": char_id, "text": text})
                result = translator(text)
                print("Character", char_id, ":", result[0]['translation_text'])
                english_transcript.append({"char_id": char_id, "text": result[0]['translation_text']})

    return japanese_transcript, english_transcript
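
A minimal driver sketch for transcript_page: the translation model name is an assumption (any Japanese-to-English translation pipeline with the same interface works), and predictions comes from the detection stage.

# Hypothetical usage of transcript_page.
from transformers import pipeline
from utils.transcript_page import transcript_page

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en")  # assumed model
ja, en = transcript_page("page.jpg", translator, predictions)  # predictions from the YOLO stage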
utils/utilities.py
ADDED
@@ -0,0 +1,306 @@
# ✨ utilities.py ── reusable helpers
import torch
from pathlib import Path
from typing import Dict
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from utils.identify_speaker import DetectionPredictions


def load_model(path: Path,
               device: str = "cpu") -> "AssocGCN":
    """
    Re-create the network and load the saved weights.
    """
    from train_speaker import AssocGCN  # import lazily to avoid a circular reference
    ckpt = torch.load(path, map_location=device)
    model = AssocGCN().to(device)
    model.load_state_dict(ckpt["model_state"])
    model.eval()  # switch to inference mode
    print(f"🔄 Loaded checkpoint from {path} (epoch {ckpt['epoch']})")
    return model

def show_prediction_per_page(img_path, dets):
    # Load the manga page image
    image = cv2.imread(img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # convert BGR to RGB for display

    # Draw bounding boxes for bodies
    for box in dets.bodies:
        x_min, y_min, x_max, y_max = box['bbox']
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(255, 0, 0), thickness=2)

    # Draw bounding boxes for faces
    for box in dets.faces:
        x_min, y_min, x_max, y_max = box['bbox']
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

    # Draw bounding boxes for panels
    for box in dets.panels:
        x_min, y_min, x_max, y_max = box['bbox']
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 0, 255), thickness=2)

    # Draw bounding boxes for speech bubbles
    for box in dets.speech_bubbles:
        x_min, y_min, x_max, y_max = box['bbox']
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(255, 255, 0), thickness=2)

    # Display the image
    plt.imshow(image)
    plt.axis('off')
    plt.title('Manga Page with Bounding Boxes')
    plt.show()

def show_panel_order(img_path, boxOrderEstimator):
    image = cv2.imread(img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # convert BGR to RGB for display

    def show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
        fontsize = 12
        if ax is None:
            ax = plt.gca()

        rect = patches.Rectangle(
            (panel.xmin, panel.ymin),
            panel.width,
            panel.height,
            linewidth=2,
            linestyle=linestyle,
            edgecolor=edgecolor,
            facecolor="none"
        )
        ax.add_patch(rect)

        label_x = panel.xmin + panel.width / 2
        label_y = panel.ymin + panel.height / 2
        ax.text(label_x, label_y, labeltext,
                fontsize=fontsize, color=edgecolor,
                ha='center', va='center')

    fig, ax = plt.subplots(figsize=(10, 7))  # a single figure/axes pair for the page
    # Show the image (grayscale pages have no channel dimension)
    if len(np.array(image).shape) == 2:
        ax.imshow(image, cmap="gray")
    else:
        ax.imshow(image)

    # Draw bounding boxes in reading order
    for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
        order = i_panel + 1
        if len(panel.panels) == 1:
            show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
        else:
            for subpanel in panel.panels:
                show_panel(subpanel, edgecolor="orange", linestyle="dotted", labeltext=f"({order})", ax=ax)

    plt.axis('off')  # cleaner output
    plt.tight_layout()
    plt.show()


# ============================================================================
# VISUALIZATION
# ============================================================================

def visualize_speaker_results(image_path: str, predictions: DetectionPredictions,
                              associations: Dict[str, Dict[int, int]], output_path: str = None):
    """
    Visualize detection results and speech-to-speaker associations.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"❌ Could not load image for visualization: {image_path}")
        return

    # Colors for the different elements (BGR format for OpenCV)
    colors = {
        'panel': (255, 0, 0),         # Blue
        'bubble': (0, 255, 0),        # Green
        'face': (0, 0, 255),          # Red
        'body': (255, 255, 0),        # Cyan
        'association': (255, 0, 255)  # Magenta
    }

    # Draw panels/frames
    for panel in predictions.panels:
        bbox = [int(x) for x in panel['bbox']]
        cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), colors['panel'], 2)
        cv2.putText(img, f"Panel {panel['id']}", (bbox[0], bbox[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, colors['panel'], 2)

    # Draw speech bubbles/text
    for bubble in predictions.speech_bubbles:
        bbox = [int(x) for x in bubble['bbox']]
        cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), colors['bubble'], 2)
        cv2.putText(img, f"T{bubble['id']}", (bbox[0], bbox[1] - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['bubble'], 2)

    # Draw faces
    for face in predictions.faces:
        bbox = [int(x) for x in face['bbox']]
        cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), colors['face'], 2)
        cv2.putText(img, f"F{face['id']}", (bbox[0], bbox[1] - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['face'], 2)

    # Draw bodies (optional)
    for body in predictions.bodies:
        bbox = [int(x) for x in body['bbox']]
        cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), colors['body'], 1)
        cv2.putText(img, f"B{body['id']}", (bbox[0], bbox[1] - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, colors['body'], 1)

    # Draw associations
    for panel_id, mapping in associations.items():
        # Extract the panel number from the panel_id string
        try:
            panel_num = int(panel_id.split('_')[-1])
        except ValueError:
            panel_num = 0

        panel_bubbles = [b for b in predictions.speech_bubbles
                         if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces
                       if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
                b_bbox = panel_bubbles[bubble_idx]['bbox']
                f_bbox = panel_faces[face_idx]['bbox']

                b_center = (int((b_bbox[0] + b_bbox[2]) / 2),
                            int((b_bbox[1] + b_bbox[3]) / 2))
                f_center = (int((f_bbox[0] + f_bbox[2]) / 2),
                            int((f_bbox[1] + f_bbox[3]) / 2))

                # Draw the association line
                cv2.line(img, b_center, f_center, colors['association'], 3)
                cv2.circle(img, b_center, 5, colors['association'], -1)
                cv2.circle(img, f_center, 5, colors['association'], -1)

                # Add the association label
                mid_point = (int((b_center[0] + f_center[0]) / 2),
                             int((b_center[1] + f_center[1]) / 2))
                cv2.putText(img, f"{bubble_idx}→{face_idx}", mid_point,
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['association'], 2)

    # Save or display
    if output_path:
        cv2.imwrite(output_path, img)
        print(f"📷 Visualization saved to {output_path}")
    else:
        # Display the image (convert BGR to RGB so matplotlib shows true colors)
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('Speech-to-Speaker Associations')
        plt.show()

def process_results(predictions, boxOrderEstimator, pairs, labels, results):
    # Assign a reading order to each panel by exact bbox match
    for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
        x1, y1, x2, y2 = panel.xmin, panel.ymin, panel.xmax, panel.ymax
        for pred_panel in predictions.panels:
            x3, y3, x4, y4 = pred_panel['bbox']
            if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
                pred_panel['order'] = i_panel

        if len(panel.panels) > 1:
            for subpanel in panel.panels:
                x1, y1, x2, y2 = subpanel.xmin, subpanel.ymin, subpanel.xmax, subpanel.ymax
                for pred_panel in predictions.panels:
                    x3, y3, x4, y4 = pred_panel['bbox']
                    if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
                        pred_panel['order'] = i_panel

    # Assign a character id to each face
    for i, (face, body) in enumerate(pairs):
        x1, y1, x2, y2 = face
        for pred_face in predictions.faces:
            x3, y3, x4, y4 = pred_face['bbox']
            if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
                pred_face['char_id'] = int(labels[i])

    # Assign a character id to each text bubble
    for panel_id, mapping in results.items():
        try:
            panel_num = int(panel_id.split('_')[-1])
        except ValueError:
            panel_num = 0

        panel_bubbles = [b for b in predictions.speech_bubbles
                         if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces
                       if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
                bubble_id = panel_bubbles[bubble_idx]['id']
                face_id = panel_faces[face_idx]['id']

                # Safely add char_id to the bubble by matching IDs
                # (in case panel_bubbles and predictions.speech_bubbles are different lists)
                for b in predictions.speech_bubbles:
                    if b['id'] == bubble_id:
                        for f in predictions.faces:
                            if f['id'] == face_id:
                                b['char_id'] = f.get('char_id', None)
                        break  # stop after finding the bubble

    return predictions

def get_speakers(gender_list):
    # Speaker presets: [name, gender (m/f), voice age (presumably y = younger, o = older)]
    speakers = [['Alexandra_Hisakawa','f','o'],['Annmarie_Nele','f','o'],['Luis_Moray','m','y'],['Baldur_Sanjin','m','o'],['Ige_Behringer','m','o'],['Eugenio_Mataracı','m','y'],['Camilla_Holmström','f','y'],['Uta_Obando','f','o'],['Alma_María','f','o'],['Gitta_Nikolina','f','o'],['Kazuhiko_Atallah','m','y'],['Tammie_Ema','f','o'],['Barbora_MacLean','f','y'],['Rosemary_Okafor','f','o'],['Craig_Gutsy','m','o'],['Gilberto_Mathias','m','o'],['Suad_Qasim','f','o'],['Chandra_MacFarland','f','o'],['Marcos_Rudaski','m','o'],['Brenda_Stern','f','y'],['Ana_Florence','f','y'],['Royston_Min','m','y'],['Tammy_Grit','f','y'],['Tanja_Adelina','f','y'],['Kumar_Dahl','m','o'],['Abrahan_Mack','m','o'],['Daisy_Studious','f','o'],['Alison_Dietlinde','f','o'],['Ilkin_Urbano','m','o'],['Dionisio_Schuyler','m','o'],['Narelle_Moon','f','o'],['Asya_Anara','f','o'],['Viktor_Eka','m','o'],['Vjollca_Johnnie','f','o'],['Adde_Michal','m','y'],['Viktor_Menelaos','m','y']]
    new_speakers = {"Male": [], "Female": []}
    # Queue younger voices first, then older ones
    for speaker in speakers:
        if speaker[1] == 'm' and speaker[2] == 'y':
            new_speakers['Male'].append(speaker[0])
        elif speaker[1] == 'f' and speaker[2] == 'y':
            new_speakers['Female'].append(speaker[0])
    for speaker in speakers:
        if speaker[1] == 'm' and speaker[2] == 'o':
            new_speakers['Male'].append(speaker[0])
        elif speaker[1] == 'f' and speaker[2] == 'o':
            new_speakers['Female'].append(speaker[0])

    speaker_list = []
    for gender in gender_list:
        speaker_list.append(new_speakers[gender[0]].pop(0))
        speaker_list.append(new_speakers['Male'].pop(0))

    return speaker_list
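
A usage sketch for the helpers above; the image path and the upstream predictions/associations objects are assumptions, while the checkpoint path matches the one shipped in this commit.

# Hypothetical glue code.
from pathlib import Path
from utils.utilities import load_model, visualize_speaker_results

model = load_model(Path("checkpoints/assoc_gcn.pt"), device="cpu")
visualize_speaker_results("page.jpg", predictions, associations, output_path="annotated.jpg")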
utils/visual_llm_gender.py
ADDED
@@ -0,0 +1,79 @@
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import cv2
from PIL import Image

def load_llm():
    # Default: load the model on the available device(s)
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
    )
    min_pixels = 256 * 28 * 28
    max_pixels = 1280 * 28 * 28
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
    return model, processor

def check_gender(model, processor, img_list, test_image):
    content = [
        {"type": "text", "text": "These images belong to the same person. Identify the gender and respond only Male or Female"},
    ]
    # Load the page image and crop up to three faces of the character
    img = cv2.imread(test_image)
    for image in img_list[:3]:
        x1, y1, x2, y2 = image['bbox']
        face_crop = img[y1:y2, x1:x2]
        # Convert the BGR crop to an RGB PIL image for the processor
        face_pil = Image.fromarray(cv2.cvtColor(face_crop, cv2.COLOR_BGR2RGB))
        content.append({"type": "image", "image": face_pil})
    messages = [
        {
            "role": "user",
            "content": content
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text)
    return output_text

def visual_llm_gender(unique_chars, predictions, test_image):
    model, processor = load_llm()
    gender_list = []
    for i in range(unique_chars):
        char_faces = [p for p in predictions.faces if p['char_id'] == i]
        gender = check_gender(model, processor, char_faces, test_image)
        gender_list.append(gender)
    return gender_list
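
Since check_gender returns the raw batch_decode output (a list of strings), each entry of gender_list is itself a list, which is why downstream code indexes it as gender[0]. A hedged usage sketch, with all inputs as assumptions:

# Hypothetical usage; predictions must already carry char_id on faces.
genders = visual_llm_gender(unique_chars=2, predictions=predictions, test_image="page.jpg")
print(genders)  # e.g. [['Male'], ['Female']], one single-element list per character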
utils/xml_to_json.py
ADDED
@@ -0,0 +1,45 @@
# Extract speaker info and add it to the parsed-XML JSON object
import xml.etree.ElementTree as ET
import manga109api
import os
import json

def xml_to_json(config):
    manga109_root_dir = config["manga109_root_dir"]

    def create_json(book):
        def parse_xml(xml_path):
            tree = ET.parse(xml_path)
            return tree.getroot()

        root_speaker = parse_xml(manga109_root_dir + "annotations_Manga109Dialog/" + book + ".xml")

        print(len(root_speaker.find('pages')))
        # Per page: map each text id to the speaker (body) id from Manga109Dialog
        fullPage = []
        for page in root_speaker.find('pages'):
            curpage = {}
            for speaker in page.findall('speaker_to_text'):
                text_id = speaker.attrib.get('text_id')
                speaker_id = speaker.attrib.get('speaker_id')
                curpage[text_id] = speaker_id
            fullPage.append(curpage)

        p = manga109api.Parser(root_dir=manga109_root_dir)

        annotation = p.get_annotation(book=book)

        # Attach the speaker's body id to each text annotation
        for page in annotation['page']:
            index = page['@index']
            for obj in page['text']:
                if obj['@id'] in fullPage[index]:
                    obj['@bodyid'] = fullPage[index][obj['@id']]

        with open(config["root"] + "json/" + book + ".json", "w", encoding='utf-8') as f:
            json.dump(annotation, f, ensure_ascii=False, indent=2)

    for book in os.listdir(os.path.join(manga109_root_dir, "images")):
        print(book)
        create_json(book)
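
The function reads only two keys from config; a minimal sketch, with placeholder paths as assumptions:

# Hypothetical config; paths are placeholders.
from utils.xml_to_json import xml_to_json

config = {
    "manga109_root_dir": "./Manga109/",  # must contain images/ and annotations_Manga109Dialog/
    "root": "./data/",                   # output is written to <root>json/<book>.json
}
xml_to_json(config)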
yolo11m/runs/detect/train/args.yaml
ADDED
@@ -0,0 +1,105 @@
task: detect
mode: train
model: yolo11m.pt
data: ./data.yaml
epochs: 100
time: null
patience: 100
batch: 16
imgsz: 640
save: true
save_period: -1
cache: false
device: '0'
workers: 8
project: null
name: train
exist_ok: false
pretrained: true
optimizer: auto
verbose: true
seed: 0
deterministic: true
single_cls: false
rect: false
cos_lr: false
close_mosaic: 10
resume: false
amp: true
fraction: 1.0
profile: false
freeze: null
multi_scale: false
overlap_mask: true
mask_ratio: 4
dropout: 0.0
val: true
split: val
save_json: false
conf: null
iou: 0.7
max_det: 300
half: false
dnn: false
plots: true
source: null
vid_stride: 1
stream_buffer: false
visualize: false
augment: false
agnostic_nms: false
classes: null
retina_masks: false
embed: null
show: false
save_frames: false
save_txt: false
save_conf: false
save_crop: false
show_labels: true
show_conf: true
show_boxes: true
line_width: null
format: torchscript
keras: false
optimize: false
int8: false
dynamic: false
simplify: true
opset: null
workspace: null
nms: false
lr0: 0.01
lrf: 0.01
momentum: 0.937
weight_decay: 0.0005
warmup_epochs: 3.0
warmup_momentum: 0.8
warmup_bias_lr: 0.1
box: 7.5
cls: 0.5
dfl: 1.5
pose: 12.0
kobj: 1.0
nbs: 64
hsv_h: 0.015
hsv_s: 0.7
hsv_v: 0.4
degrees: 0.0
translate: 0.1
scale: 0.5
shear: 0.0
perspective: 0.0
flipud: 0.0
fliplr: 0.5
bgr: 0.0
mosaic: 1.0
mixup: 0.0
cutmix: 0.0
copy_paste: 0.0
copy_paste_mode: flip
auto_augment: randaugment
erasing: 0.4
cfg: null
tracker: botsort.yaml
save_dir: runs/detect/train
yolo11m/runs/detect/train2/args.yaml
ADDED
@@ -0,0 +1,105 @@
task: detect
mode: train
model: yolo11m.pt
data: ./data.yaml
epochs: 30
time: null
patience: 100
batch: 16
imgsz: 640
save: true
save_period: -1
cache: false
device: '0'
workers: 8
project: null
name: train2
exist_ok: false
pretrained: true
optimizer: auto
verbose: true
seed: 0
deterministic: true
single_cls: false
rect: false
cos_lr: false
close_mosaic: 10
resume: false
amp: true
fraction: 1.0
profile: false
freeze: null
multi_scale: false
overlap_mask: true
mask_ratio: 4
dropout: 0.0
val: true
split: val
save_json: false
conf: null
iou: 0.7
max_det: 300
half: false
dnn: false
plots: true
source: null
vid_stride: 1
stream_buffer: false
visualize: false
augment: false
agnostic_nms: false
classes: null
retina_masks: false
embed: null
show: false
save_frames: false
save_txt: false
save_conf: false
save_crop: false
show_labels: true
show_conf: true
show_boxes: true
line_width: null
format: torchscript
keras: false
optimize: false
int8: false
dynamic: false
simplify: true
opset: null
workspace: null
nms: false
lr0: 0.01
lrf: 0.01
momentum: 0.937
weight_decay: 0.0005
warmup_epochs: 3.0
warmup_momentum: 0.8
warmup_bias_lr: 0.1
box: 7.5
cls: 0.5
dfl: 1.5
pose: 12.0
kobj: 1.0
nbs: 64
hsv_h: 0.015
hsv_s: 0.7
hsv_v: 0.4
degrees: 0.0
translate: 0.1
scale: 0.5
shear: 0.0
perspective: 0.0
flipud: 0.0
fliplr: 0.5
bgr: 0.0
mosaic: 1.0
mixup: 0.0
cutmix: 0.0
copy_paste: 0.0
copy_paste_mode: flip
auto_augment: randaugment
erasing: 0.4
cfg: null
tracker: botsort.yaml
save_dir: runs/detect/train2
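
These args files record standard Ultralytics runs; a sketch of reproducing the train2 configuration, assuming the same data.yaml is present:

# Hypothetical reproduction of the train2 run via the Ultralytics API.
from ultralytics import YOLO

model = YOLO("yolo11m.pt")
model.train(data="./data.yaml", epochs=30, batch=16, imgsz=640, device=0, name="train2")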
yolo11m/runs/detect/train2/results.csv
ADDED
@@ -0,0 +1,31 @@
epoch,time,train/box_loss,train/cls_loss,train/dfl_loss,metrics/precision(B),metrics/recall(B),metrics/mAP50(B),metrics/mAP50-95(B),val/box_loss,val/cls_loss,val/dfl_loss,lr/pg0,lr/pg1,lr/pg2
1,364.701,0.99528,0.88434,1.10876,0.84374,0.81339,0.86346,0.61416,0.9399,0.67931,1.06036,0.000415851,0.000415851,0.000415851
2,717.67,0.93907,0.68799,1.07341,0.8555,0.81756,0.8734,0.6256,0.91886,0.66057,1.05007,0.000805045,0.000805045,0.000805045
3,1070.28,0.9246,0.66399,1.06682,0.82583,0.78783,0.84902,0.58408,1.00997,0.71844,1.10452,0.00116674,0.00116674,0.00116674
4,1415.94,0.89772,0.63638,1.05457,0.87076,0.83342,0.88502,0.6424,0.88072,0.59905,1.0277,0.00112625,0.00112625,0.00112625
5,1761.7,0.87716,0.61356,1.04405,0.88033,0.83517,0.89177,0.6547,0.87207,0.57414,1.0236,0.001085,0.001085,0.001085
6,2110.53,0.86309,0.59526,1.03721,0.88965,0.8458,0.90099,0.66986,0.83239,0.53844,1.01,0.00104375,0.00104375,0.00104375
7,2458.47,0.84727,0.5812,1.02879,0.88126,0.84073,0.89735,0.65739,0.86915,0.56387,1.01918,0.0010025,0.0010025,0.0010025
8,2806.22,0.84261,0.57434,1.02717,0.89677,0.84979,0.90884,0.67838,0.81902,0.52912,0.99507,0.00096125,0.00096125,0.00096125
9,3154.13,0.83402,0.56153,1.02006,0.89845,0.85878,0.91456,0.68555,0.81409,0.51218,0.98894,0.00092,0.00092,0.00092
10,3505.97,0.8254,0.55079,1.01678,0.89708,0.86145,0.91757,0.68555,0.83014,0.51276,0.9874,0.00087875,0.00087875,0.00087875
11,3850.26,0.81667,0.54212,1.01165,0.89666,0.87261,0.92286,0.69549,0.79841,0.49455,0.97811,0.0008375,0.0008375,0.0008375
12,4198.48,0.8088,0.53515,1.0076,0.90241,0.86783,0.92325,0.69524,0.80291,0.49307,0.98062,0.00079625,0.00079625,0.00079625
13,4544.49,0.80596,0.53055,1.0057,0.89774,0.86735,0.92057,0.69194,0.80916,0.50261,0.97499,0.000755,0.000755,0.000755
14,4890.59,0.79576,0.52245,1.00188,0.90424,0.86295,0.92337,0.69801,0.79489,0.49061,0.97536,0.00071375,0.00071375,0.00071375
15,5238.24,0.79264,0.51495,0.99893,0.90668,0.87894,0.92957,0.70712,0.78733,0.47507,0.96654,0.0006725,0.0006725,0.0006725
16,5583.58,0.78354,0.50664,0.99404,0.90396,0.87658,0.9299,0.70605,0.78008,0.47593,0.96723,0.00063125,0.00063125,0.00063125
17,5929.19,0.78022,0.50345,0.99207,0.90272,0.87945,0.93169,0.71304,0.76638,0.46915,0.96346,0.00059,0.00059,0.00059
18,6280.28,0.77585,0.4996,0.98929,0.9089,0.87976,0.93364,0.71495,0.76637,0.46142,0.95967,0.00054875,0.00054875,0.00054875
19,6631.13,0.77146,0.49338,0.98748,0.90486,0.8868,0.9333,0.71573,0.76155,0.45881,0.96285,0.0005075,0.0005075,0.0005075
20,6977.4,0.76437,0.48726,0.98545,0.91121,0.87741,0.93274,0.71797,0.75895,0.45138,0.95755,0.00046625,0.00046625,0.00046625
21,7309.74,0.76783,0.47192,0.98514,0.90617,0.88781,0.93546,0.72133,0.75577,0.45381,0.95819,0.000425,0.000425,0.000425
22,7644.28,0.75947,0.46115,0.97943,0.90908,0.88789,0.9366,0.72428,0.74821,0.44937,0.95353,0.00038375,0.00038375,0.00038375
23,7976.83,0.75231,0.45512,0.97455,0.90643,0.89093,0.93685,0.72362,0.74938,0.4485,0.95206,0.0003425,0.0003425,0.0003425
24,8308.26,0.74468,0.44702,0.97117,0.90852,0.88729,0.93576,0.72268,0.75086,0.44312,0.95287,0.00030125,0.00030125,0.00030125
25,8638.72,0.73652,0.44072,0.96737,0.91305,0.89204,0.94021,0.73021,0.7399,0.43814,0.94657,0.00026,0.00026,0.00026
26,8970.02,0.73089,0.43389,0.96468,0.91071,0.89248,0.93932,0.73064,0.7382,0.43585,0.94573,0.00021875,0.00021875,0.00021875
27,9302.82,0.72381,0.42706,0.96115,0.91403,0.89302,0.94056,0.73294,0.73565,0.42938,0.94517,0.0001775,0.0001775,0.0001775
28,9635.86,0.71664,0.4207,0.95751,0.91457,0.8922,0.94121,0.73333,0.73483,0.42626,0.94493,0.00013625,0.00013625,0.00013625
29,9971.96,0.71077,0.41469,0.95334,0.91221,0.89692,0.94196,0.7358,0.72995,0.42248,0.94099,9.5e-05,9.5e-05,9.5e-05
30,10303.7,0.70721,0.4106,0.95196,0.91199,0.8969,0.94201,0.7353,0.73065,0.42129,0.94193,5.375e-05,5.375e-05,5.375e-05
yolo11m/runs/detect/train2/weights/best.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:472dcc9fcf237d5d9ede9afa30eb7da4ffbad514d5a1bca537918bcd2aa7120a
size 40502700
yolo11m/runs/detect/train2/weights/last.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:755404e647007f3dfb3dac583bea81b176114b2c6c706ae43a177e968a29e98e
size 40502700