|
|
|
import torch
|
|
from pathlib import Path
|
|
from typing import List, Dict
|
|
import cv2
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.patches as patches
|
|
import cv2
|
|
from utils.identify_speaker import DetectionPredictions
|
|
|
|
|
|
|
|
def load_model(path: Path, device: str = "cpu") -> "AssocGCN":
    """
    Rebuild an ``AssocGCN`` network and restore its weights from a checkpoint.

    Args:
        path: Checkpoint file handed to ``torch.load``. The loaded object must
            provide the keys ``"model_state"`` and ``"epoch"``.
        device: Device the checkpoint is mapped to and the network is moved to.

    Returns:
        The network with the stored weights loaded, switched to eval mode.
    """
    # Imported here rather than at module level, as in the original code.
    from train_speaker import AssocGCN

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(path, map_location=device)

    network = AssocGCN()
    network = network.to(device)
    network.load_state_dict(checkpoint["model_state"])
    network.eval()

    print(f"🔄 Loaded checkpoint from {path} (epoch {checkpoint['epoch']})")
    return network
|
|
|
|
def show_prediction_per_page(img_path, dets):
    """
    Draw every detection box on a page image and display it with matplotlib.

    Args:
        img_path: Location of the page image; converted with ``str`` before
            being given to ``cv2.imread``.
        dets: Object exposing ``bodies``, ``faces``, ``panels`` and
            ``speech_bubbles`` iterables. Each item is indexed with
            ``['bbox']`` and must unpack to ``(x_min, y_min, x_max, y_max)``.

    Returns:
        None. If the image cannot be read, a message is printed and nothing
        is shown.
    """
    image = cv2.imread(str(img_path))
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        print(f"❌ Could not load image for visualization: {img_path}")
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The image is RGB from here on, so the colour tuples below are RGB too.
    layers = (
        (dets.bodies, (255, 0, 0)),
        (dets.faces, (0, 255, 0)),
        (dets.panels, (0, 0, 255)),
        (dets.speech_bubbles, (255, 255, 0)),
    )
    for boxes, color in layers:
        for box in boxes:
            # cv2.rectangle needs integer pixel coordinates.
            x_min, y_min, x_max, y_max = (int(v) for v in box['bbox'])
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max),
                          color=color, thickness=2)

    plt.imshow(image)
    plt.axis('off')
    plt.title('Manga Panel with Bounding Boxes')
    plt.show()
|
|
|
|
def show_panel_order(img_path, boxOrderEstimator):
    """
    Display a page image with its panels outlined and labelled by reading order.

    Args:
        img_path: Location of the page image; converted with ``str`` before
            being given to ``cv2.imread``.
        boxOrderEstimator: Object exposing ``ordered_bbs``. Each entry needs
            ``xmin``, ``ymin``, ``width``, ``height`` and a ``panels``
            sequence. An entry whose ``panels`` has exactly one element is
            drawn as a single solid red box labelled ``N``; otherwise each
            element of ``panels`` is drawn dotted in orange and labelled
            ``(N)``, where N is the 1-based position in ``ordered_bbs``.

    Returns:
        None. If the image cannot be read, a message is printed and nothing
        is shown.
    """
    image = cv2.imread(str(img_path))
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        print(f"❌ Could not load image for visualization: {img_path}")
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
        """Outline one panel on *ax* and write *labeltext* at its centre."""
        fontsize = 12
        if ax is None:
            ax = plt.gca()

        outline = patches.Rectangle(
            (panel.xmin, panel.ymin),
            panel.width,
            panel.height,
            linewidth=2,
            linestyle=linestyle,
            edgecolor=edgecolor,
            facecolor="none",
        )
        ax.add_patch(outline)

        center_x = panel.xmin + panel.width / 2
        center_y = panel.ymin + panel.height / 2
        ax.text(center_x, center_y, labeltext,
                fontsize=fontsize, color=edgecolor,
                ha='center', va='center')

    fig, ax = plt.subplots(figsize=(10, 7))

    # Kept as a guard for single-channel images.
    if image.ndim == 2:
        ax.imshow(image, cmap="gray")
    else:
        ax.imshow(image)

    for order, panel in enumerate(boxOrderEstimator.ordered_bbs, start=1):
        if len(panel.panels) == 1:
            show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
        else:
            for subpanel in panel.panels:
                show_panel(subpanel, edgecolor="orange", linestyle="dotted",
                           labeltext=f"({order})", ax=ax)

    ax.axis('off')
    fig.tight_layout()
    plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def visualize_speaker_results(image_path: str, predictions: DetectionPredictions,
                              associations: Dict[str, Dict[int, int]], output_path: str = None):
    """
    Visualize detection results and speech-to-speaker associations.

    Draws labelled boxes for panels, speech bubbles, faces and bodies, then
    connects each associated bubble/face pair with a line between the box
    centres.

    Args:
        image_path: Image file read with ``cv2.imread``.
        predictions: Object exposing ``panels``, ``speech_bubbles``, ``faces``
            and ``bodies``. Every item is read through ``['bbox']`` and
            ``['id']``; bubbles and faces also through ``['panel_id']``.
        associations: Maps a panel key to ``{bubble_idx: face_idx}``. The
            panel number is the integer after the last ``'_'`` of the key
            (0 when it cannot be parsed). Both indices are positions within
            that panel's bubbles/faces, in ``predictions`` order.
        output_path: When truthy, the annotated image is written there with
            ``cv2.imwrite``; otherwise it is shown with matplotlib.

    Returns:
        None. If the image cannot be read, a message is printed and nothing
        is drawn.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"❌ Could not load image for visualization: {image_path}")
        return

    # img stays in the channel order cv2.imread produced (BGR), so these
    # tuples are interpreted as BGR by the cv2 drawing calls.
    colors = {
        'panel': (255, 0, 0),
        'bubble': (0, 255, 0),
        'face': (0, 0, 255),
        'body': (255, 255, 0),
        'association': (255, 0, 255),
    }

    # (detections, colour key, label prefix, label lift in px, font scale, thickness)
    layers = (
        (predictions.panels, 'panel', 'Panel ', 10, 0.6, 2),
        (predictions.speech_bubbles, 'bubble', 'T', 5, 0.5, 2),
        (predictions.faces, 'face', 'F', 5, 0.5, 2),
        (predictions.bodies, 'body', 'B', 5, 0.4, 1),
    )
    for detections, kind, prefix, lift, scale, thickness in layers:
        color = colors[kind]
        for det in detections:
            bbox = [int(x) for x in det['bbox']]
            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                          color, thickness)
            cv2.putText(img, f"{prefix}{det['id']}", (bbox[0], bbox[1] - lift),
                        cv2.FONT_HERSHEY_SIMPLEX, scale, color, thickness)

    link_color = colors['association']
    for panel_id, mapping in associations.items():
        try:
            panel_num = int(panel_id.split('_')[-1])
        except (AttributeError, ValueError):
            # Key is not a string or has no numeric suffix: fall back to panel 0.
            panel_num = 0

        panel_bubbles = [b for b in predictions.speech_bubbles
                         if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces
                       if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            # Reject negative indices too: they would otherwise wrap around
            # and silently pick an element from the end of the list.
            if not (0 <= bubble_idx < len(panel_bubbles)
                    and 0 <= face_idx < len(panel_faces)):
                continue

            b_bbox = panel_bubbles[bubble_idx]['bbox']
            f_bbox = panel_faces[face_idx]['bbox']

            b_center = (int((b_bbox[0] + b_bbox[2]) / 2),
                        int((b_bbox[1] + b_bbox[3]) / 2))
            f_center = (int((f_bbox[0] + f_bbox[2]) / 2),
                        int((f_bbox[1] + f_bbox[3]) / 2))

            cv2.line(img, b_center, f_center, link_color, 3)
            cv2.circle(img, b_center, 5, link_color, -1)
            cv2.circle(img, f_center, 5, link_color, -1)

            mid_point = (int((b_center[0] + f_center[0]) / 2),
                         int((b_center[1] + f_center[1]) / 2))
            # ASCII arrow: the Hershey fonts cannot render non-ASCII symbols
            # and would draw question marks instead.
            cv2.putText(img, f"{bubble_idx}->{face_idx}", mid_point,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, link_color, 2)

    if output_path:
        # cv2.imwrite reports failure through its return value.
        if cv2.imwrite(output_path, img):
            print(f"📷 Visualization saved to {output_path}")
        else:
            print(f"❌ Could not save visualization to {output_path}")
    else:
        # matplotlib expects RGB; convert so colours are not channel-swapped.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('Speech-to-Speaker Associations')
        plt.show()
|
|
|
|
def _same_box(bbox, x1, y1, x2, y2):
    """Return True when *bbox* unpacks to exactly the corners (x1, y1, x2, y2)."""
    bx1, by1, bx2, by2 = bbox
    return bx1 == x1 and by1 == y1 and bx2 == x2 and by2 == y2


def _mark_panel_order(box, order, pred_panels):
    """Store *order* on every predicted panel whose bbox equals *box*'s extent."""
    x1, y1, x2, y2 = box.xmin, box.ymin, box.xmax, box.ymax
    for pred_panel in pred_panels:
        if _same_box(pred_panel['bbox'], x1, y1, x2, y2):
            pred_panel['order'] = order


def process_results(predictions, boxOrderEstimator, pairs, labels, results):
    """
    Write panel order, character ids and bubble speakers into ``predictions``.

    The prediction dicts are modified in place:

    * ``predictions.panels[i]['order']`` is set to the 0-based position in
      ``boxOrderEstimator.ordered_bbs`` of the entry whose corners match the
      panel bbox. When an entry has more than one element in ``panels``,
      those elements are matched as well and receive the same order.
    * ``predictions.faces[i]['char_id']`` is set to ``int(labels[k])`` for the
      k-th ``(face, body)`` pair whose face box matches the face bbox.
    * ``predictions.speech_bubbles[i]['char_id']`` is copied from the face
      that ``results`` associates with the bubble (``None`` if that face has
      no ``'char_id'``).

    Boxes are matched by exact coordinate equality.

    Args:
        predictions: Object exposing ``panels``, ``faces`` and
            ``speech_bubbles`` lists of dicts.
        boxOrderEstimator: Object exposing ``ordered_bbs``; entries need
            ``xmin``, ``ymin``, ``xmax``, ``ymax`` and ``panels``.
        pairs: Sequence of ``(face_box, body_box)``; only ``face_box``
            (4 coordinates) is used.
        labels: Indexable by pair position; each value is passed to ``int``.
        results: Maps a panel key to ``{bubble_idx: face_idx}``. The panel
            number is the integer after the last ``'_'`` of the key (0 when it
            cannot be parsed); indices are positions within that panel's
            bubbles/faces, in ``predictions`` order.

    Returns:
        The same ``predictions`` object that was passed in.
    """
    # 1) Reading order of the panels.
    for order, panel in enumerate(boxOrderEstimator.ordered_bbs):
        _mark_panel_order(panel, order, predictions.panels)
        if len(panel.panels) > 1:
            for subpanel in panel.panels:
                _mark_panel_order(subpanel, order, predictions.panels)

    # 2) Character id of every face that appears in `pairs`.
    for i, (face_box, _body_box) in enumerate(pairs):
        x1, y1, x2, y2 = face_box
        for pred_face in predictions.faces:
            if _same_box(pred_face['bbox'], x1, y1, x2, y2):
                pred_face['char_id'] = int(labels[i])

    # 3) Speaker (character id) of every associated speech bubble.
    for panel_id, mapping in results.items():
        try:
            panel_num = int(panel_id.split('_')[-1])
        except (AttributeError, ValueError):
            # Key is not a string or has no numeric suffix: fall back to panel 0.
            panel_num = 0

        panel_bubbles = [b for b in predictions.speech_bubbles
                         if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces
                       if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            # Reject negative indices too: they would otherwise wrap around
            # and silently pick an element from the end of the list.
            if not (0 <= bubble_idx < len(panel_bubbles)
                    and 0 <= face_idx < len(panel_faces)):
                continue

            bubble_id = panel_bubbles[bubble_idx]['id']
            face_id = panel_faces[face_idx]['id']

            # The first face carrying that id supplies the character id.
            speaker = next(
                (f for f in predictions.faces if f['id'] == face_id), None)
            if speaker is None:
                continue
            char_id = speaker.get('char_id', None)
            for bubble in predictions.speech_bubbles:
                if bubble['id'] == bubble_id:
                    bubble['char_id'] = char_id

    return predictions
|
|
|
|
def get_speakers(gender_list):
    """
    Pick one voice name per entry of ``gender_list`` plus one extra male voice.

    Voices are grouped into a ``'Male'`` and a ``'Female'`` pool. Within each
    pool, voices flagged ``'y'`` come before voices flagged ``'o'``
    (presumably young/old), each group in table order. Entries draw from
    their pool in that order; when a pool is used up it starts over from its
    first voice, so the function never runs out of names.

    Args:
        gender_list: Iterable whose items are indexed with ``[0]`` to obtain
            the pool key, ``'Male'`` or ``'Female'``. Any other key raises
            ``KeyError``.

    Returns:
        A list with one voice name per input entry, followed by one
        additional name drawn from the male pool.
        NOTE(review): the trailing male voice looks like a narrator/fallback
        voice - confirm against the caller.
    """
    # (voice name, sex flag, age flag)
    voices = (
        ('Alexandra_Hisakawa', 'f', 'o'),
        ('Annmarie_Nele', 'f', 'o'),
        ('Luis_Moray', 'm', 'y'),
        ('Baldur_Sanjin', 'm', 'o'),
        ('Ige_Behringer', 'm', 'o'),
        ('Eugenio_Mataracı', 'm', 'y'),
        ('Camilla_Holmström', 'f', 'y'),
        ('Uta_Obando', 'f', 'o'),
        ('Alma_María', 'f', 'o'),
        ('Gitta_Nikolina', 'f', 'o'),
        ('Kazuhiko_Atallah', 'm', 'y'),
        ('Tammie_Ema', 'f', 'o'),
        ('Barbora_MacLean', 'f', 'y'),
        ('Rosemary_Okafor', 'f', 'o'),
        ('Craig_Gutsy', 'm', 'o'),
        ('Gilberto_Mathias', 'm', 'o'),
        ('Suad_Qasim', 'f', 'o'),
        ('Chandra_MacFarland', 'f', 'o'),
        ('Marcos_Rudaski', 'm', 'o'),
        ('Brenda_Stern', 'f', 'y'),
        ('Ana_Florence', 'f', 'y'),
        ('Royston_Min', 'm', 'y'),
        ('Tammy_Grit', 'f', 'y'),
        ('Tanja_Adelina', 'f', 'y'),
        ('Kumar_Dahl', 'm', 'o'),
        ('Abrahan_Mack', 'm', 'o'),
        ('Daisy_Studious', 'f', 'o'),
        ('Alison_Dietlinde', 'f', 'o'),
        ('Ilkin_Urbano', 'm', 'o'),
        ('Dionisio_Schuyler', 'm', 'o'),
        ('Narelle_Moon', 'f', 'o'),
        ('Asya_Anara', 'f', 'o'),
        ('Viktor_Eka', 'm', 'o'),
        ('Vjollca_Johnnie', 'f', 'o'),
        ('Adde_Michal', 'm', 'y'),
        ('Viktor_Menelaos', 'm', 'y'),
    )

    pool_of = {'m': 'Male', 'f': 'Female'}
    pools = {"Male": [], "Female": []}
    # 'y' voices are queued ahead of 'o' voices within each pool.
    for wanted_age in ('y', 'o'):
        for name, sex, age in voices:
            if age == wanted_age and sex in pool_of:
                pools[pool_of[sex]].append(name)

    # Number of voices handed out per pool so far.
    taken = {"Male": 0, "Female": 0}

    def next_voice(gender):
        pool = pools[gender]  # KeyError for unknown gender keys
        # Wrap around so a page with more characters than voices still works.
        name = pool[taken[gender] % len(pool)]
        taken[gender] += 1
        return name

    speaker_list = [next_voice(gender[0]) for gender in gender_list]
    speaker_list.append(next_voice('Male'))
    return speaker_list