# ✨ utils.py ── reusable helpers
from itertools import cycle
from pathlib import Path
from typing import Dict, List, Optional

import cv2
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import torch

from utils.identify_speaker import DetectionPredictions


def load_model(path: Path, device: str = "cpu") -> "AssocGCN":
    """
    Re-create the network and load the saved weights.

    Args:
        path: Checkpoint file; it must contain the keys ``"model_state"``
            and ``"epoch"``.
        device: Device the tensors are mapped to and the model is moved to.

    Returns:
        An ``AssocGCN`` instance with the weights loaded, in eval mode.
    """
    from train_speaker import AssocGCN  # import lazily to avoid circular ref

    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    ckpt = torch.load(path, map_location=device)
    model = AssocGCN().to(device)
    model.load_state_dict(ckpt["model_state"])
    model.eval()  # switch to inference mode
    print(f"🔄 Loaded checkpoint from {path} (epoch {ckpt['epoch']})")
    return model


def _read_image(image_path):
    """Read an image with OpenCV (BGR); print a message and return None on failure."""
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns None.
        print(f"❌ Could not load image for visualization: {image_path}")
    return img


def _int_bbox(item):
    """Return the item's ``'bbox'`` as a list of ints (OpenCV needs int pixels)."""
    return [int(v) for v in item["bbox"]]


def _draw_boxes(image, items, color, thickness=2):
    """Draw the ``'bbox'`` rectangle of every item onto ``image`` in place."""
    for item in items:
        x_min, y_min, x_max, y_max = _int_bbox(item)
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max),
                      color=color, thickness=thickness)


def show_prediction_per_page(img_path, dets):
    """
    Display a page with the detected bodies, faces, panels and speech bubbles.

    Args:
        img_path: Path of the page image.
        dets: Object exposing ``bodies``, ``faces``, ``panels`` and
            ``speech_bubbles``, each an iterable of dicts with a ``'bbox'``
            entry ``(x_min, y_min, x_max, y_max)``.

    Prints a message and returns without plotting if the image cannot be read.
    """
    image = _read_image(img_path)
    if image is None:
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for display

    # Colors are RGB here because the image was converted before drawing.
    _draw_boxes(image, dets.bodies, (255, 0, 0))            # bodies: red
    _draw_boxes(image, dets.faces, (0, 255, 0))             # faces: green
    _draw_boxes(image, dets.panels, (0, 0, 255))            # panels: blue
    _draw_boxes(image, dets.speech_bubbles, (255, 255, 0))  # speech bubbles: yellow

    # Display the image
    plt.imshow(image)
    plt.axis('off')
    plt.title('Manga Panel with Bounding Boxes')
    plt.show()


def _show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
    """Outline ``panel`` on ``ax`` and write ``labeltext`` at its center."""
    fontsize = 12
    if ax is None:
        ax = plt.gca()
    rect = patches.Rectangle(
        (panel.xmin, panel.ymin), panel.width, panel.height,
        linewidth=2, linestyle=linestyle,
        edgecolor=edgecolor, facecolor="none",
    )
    ax.add_patch(rect)
    center_x = panel.xmin + panel.width / 2
    center_y = panel.ymin + panel.height / 2
    ax.text(center_x, center_y, labeltext, fontsize=fontsize,
            color=edgecolor, ha='center', va='center')


def show_panel_order(img_path, boxOrderEstimator):
    """
    Display a page with every panel labelled by its 1-based reading order.

    Entries of ``boxOrderEstimator.ordered_bbs`` that hold exactly one panel
    are outlined in solid red and labelled ``"N"``; otherwise each of their
    sub-panels is outlined in dotted orange and labelled ``"(N)"``.

    Prints a message and returns without plotting if the image cannot be read.
    """
    image = _read_image(img_path)
    if image is None:
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for display

    fig, ax = plt.subplots(figsize=(10, 7))  # This avoids multiple axes

    # Show the image
    if np.ndim(image) == 2:
        ax.imshow(image, cmap="gray")
    else:
        ax.imshow(image)

    # Draw bounding boxes
    for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
        order = i_panel + 1
        if len(panel.panels) == 1:
            _show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
        else:
            for subpanel in panel.panels:
                _show_panel(subpanel, edgecolor="orange", linestyle="dotted",
                            labeltext=f"({order})", ax=ax)

    plt.axis('off')  # cleaner output
    plt.tight_layout()
    plt.show()


# ============================================================================
# VISUALIZATION
# ============================================================================

def _panel_number(panel_id) -> int:
    """Parse the trailing ``_<int>`` of a panel id string; fall back to 0."""
    try:
        return int(panel_id.split('_')[-1])
    except (AttributeError, TypeError, ValueError):
        return 0


def _bbox_center(bbox):
    """Return the integer (x, y) center of an ``(x1, y1, x2, y2)`` box."""
    return (int((bbox[0] + bbox[2]) / 2), int((bbox[1] + bbox[3]) / 2))


def _draw_labeled_boxes(img, items, prefix, color, thickness, font_scale, label_dy):
    """Draw each item's box plus a ``<prefix><id>`` label just above its top-left corner."""
    for item in items:
        bbox = _int_bbox(item)
        cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, thickness)
        cv2.putText(img, f"{prefix}{item['id']}", (bbox[0], bbox[1] - label_dy),
                    cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, thickness)


def visualize_speaker_results(image_path: str,
                              predictions: DetectionPredictions,
                              associations: Dict[str, Dict[int, int]],
                              output_path: Optional[str] = None):
    """
    Visualize detection results and speech-to-speaker associations

    Args:
        image_path: Path of the page image.
        predictions: Detections exposing ``panels``, ``speech_bubbles``,
            ``faces`` and ``bodies`` (dicts with ``'bbox'`` and ``'id'``;
            bubbles and faces also carry ``'panel_id'``).
        associations: Maps a panel id string (its trailing ``_<int>`` is used
            as the panel number) to ``{bubble_index: face_index}``, where the
            indices are positions among that panel's bubbles / faces.
        output_path: If given, the annotated image is written there;
            otherwise it is shown with matplotlib.
    """
    img = _read_image(image_path)
    if img is None:
        return

    # Colors for different elements (BGR format for OpenCV)
    colors = {
        'panel': (255, 0, 0),         # Blue
        'bubble': (0, 255, 0),        # Green
        'face': (0, 0, 255),          # Red
        'body': (255, 255, 0),        # Cyan
        'association': (255, 0, 255)  # Magenta
    }

    # Draw panels/frames, speech bubbles/text, faces and bodies (optional)
    _draw_labeled_boxes(img, predictions.panels, "Panel ", colors['panel'], 2, 0.6, 10)
    _draw_labeled_boxes(img, predictions.speech_bubbles, "T", colors['bubble'], 2, 0.5, 5)
    _draw_labeled_boxes(img, predictions.faces, "F", colors['face'], 2, 0.5, 5)
    _draw_labeled_boxes(img, predictions.bodies, "B", colors['body'], 1, 0.4, 5)

    # Draw associations
    for panel_id, mapping in associations.items():
        # Extract panel number from panel_id string
        panel_num = _panel_number(panel_id)
        panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            # Reject negative indices too: they would silently wrap around
            # and link the bubble to the wrong element.
            if not (0 <= bubble_idx < len(panel_bubbles)
                    and 0 <= face_idx < len(panel_faces)):
                continue

            b_center = _bbox_center(panel_bubbles[bubble_idx]['bbox'])
            f_center = _bbox_center(panel_faces[face_idx]['bbox'])

            # Draw association line
            cv2.line(img, b_center, f_center, colors['association'], 3)
            cv2.circle(img, b_center, 5, colors['association'], -1)
            cv2.circle(img, f_center, 5, colors['association'], -1)

            # Add association number. ASCII "->" is used because OpenCV's
            # Hershey fonts render unsupported characters as question marks.
            mid_point = (int((b_center[0] + f_center[0]) / 2),
                         int((b_center[1] + f_center[1]) / 2))
            cv2.putText(img, f"{bubble_idx}->{face_idx}", mid_point,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['association'], 2)

    # Save or display
    if output_path:
        cv2.imwrite(output_path, img)
        print(f"📷 Visualization saved to {output_path}")
    else:
        # matplotlib expects RGB while img is BGR; convert so the displayed
        # colors match the saved file.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('Speech-to-Speaker Associations')
        plt.show()


def _items_with_bbox(items, bbox):
    """Return the items whose ``'bbox'`` equals ``bbox`` exactly, element by element."""
    target = list(bbox)
    return [item for item in items if list(item['bbox']) == target]


def process_results(predictions, boxOrderEstimator, pairs, labels, results):
    """
    Annotate ``predictions`` in place with panel order and character ids.

    * Each predicted panel whose bbox exactly equals an ordered panel (or one
      of its sub-panels, when it has more than one) gets ``'order'`` set to
      the 0-based position in ``boxOrderEstimator.ordered_bbs``.
    * Each predicted face whose bbox exactly equals the face box of
      ``pairs[i]`` gets ``'char_id' = int(labels[i])``.
    * Each speech bubble associated with a face in ``results`` gets that
      face's ``'char_id'`` (``None`` if the face has none).

    Args:
        predictions: Detections exposing ``panels``, ``faces`` and
            ``speech_bubbles`` as lists of dicts.
        boxOrderEstimator: Object whose ``ordered_bbs`` items expose
            ``xmin/ymin/xmax/ymax`` and ``panels``.
        pairs: Sequence of ``(face_bbox, body)`` tuples; only the face box
            is used here.
        labels: Per-pair character labels, indexed like ``pairs``.
        results: ``{panel_id_str: {bubble_index: face_index}}`` with indices
            relative to the bubbles / faces of that panel.

    Returns:
        The same ``predictions`` object, mutated.
    """
    # Assign order to each panel
    for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
        boxes = [panel]
        if len(panel.panels) > 1:
            boxes.extend(panel.panels)
        for box in boxes:
            corners = (box.xmin, box.ymin, box.xmax, box.ymax)
            for pred_panel in _items_with_bbox(predictions.panels, corners):
                pred_panel['order'] = i_panel

    # Assign character id to each face
    for i, (face, _body) in enumerate(pairs):
        for pred_face in _items_with_bbox(predictions.faces, face):
            pred_face['char_id'] = int(labels[i])

    # Assign character id to each text
    for panel_id, mapping in results.items():
        panel_num = _panel_number(panel_id)
        panel_bubbles = [b for b in predictions.speech_bubbles if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            # Reject negative indices too: they would silently wrap around.
            if not (0 <= bubble_idx < len(panel_bubbles)
                    and 0 <= face_idx < len(panel_faces)):
                continue

            bubble_id = panel_bubbles[bubble_idx]['id']
            face_id = panel_faces[face_idx]['id']

            # Resolve by id over the full lists (ids are assumed unique;
            # the first match wins).
            bubble = next((b for b in predictions.speech_bubbles
                           if b['id'] == bubble_id), None)
            face = next((f for f in predictions.faces
                         if f['id'] == face_id), None)
            if bubble is not None and face is not None:
                bubble['char_id'] = face.get('char_id', None)

    return predictions


# Voice catalog entries: (voice name, sex 'm'/'f', age group 'y'/'o').
_VOICE_CATALOG = (
    ('Alexandra_Hisakawa', 'f', 'o'), ('Annmarie_Nele', 'f', 'o'),
    ('Luis_Moray', 'm', 'y'), ('Baldur_Sanjin', 'm', 'o'),
    ('Ige_Behringer', 'm', 'o'), ('Eugenio_Mataracı', 'm', 'y'),
    ('Camilla_Holmström', 'f', 'y'), ('Uta_Obando', 'f', 'o'),
    ('Alma_María', 'f', 'o'), ('Gitta_Nikolina', 'f', 'o'),
    ('Kazuhiko_Atallah', 'm', 'y'), ('Tammie_Ema', 'f', 'o'),
    ('Barbora_MacLean', 'f', 'y'), ('Rosemary_Okafor', 'f', 'o'),
    ('Craig_Gutsy', 'm', 'o'), ('Gilberto_Mathias', 'm', 'o'),
    ('Suad_Qasim', 'f', 'o'), ('Chandra_MacFarland', 'f', 'o'),
    ('Marcos_Rudaski', 'm', 'o'), ('Brenda_Stern', 'f', 'y'),
    ('Ana_Florence', 'f', 'y'), ('Royston_Min', 'm', 'y'),
    ('Tammy_Grit', 'f', 'y'), ('Tanja_Adelina', 'f', 'y'),
    ('Kumar_Dahl', 'm', 'o'), ('Abrahan_Mack', 'm', 'o'),
    ('Daisy_Studious', 'f', 'o'), ('Alison_Dietlinde', 'f', 'o'),
    ('Ilkin_Urbano', 'm', 'o'), ('Dionisio_Schuyler', 'm', 'o'),
    ('Narelle_Moon', 'f', 'o'), ('Asya_Anara', 'f', 'o'),
    ('Viktor_Eka', 'm', 'o'), ('Vjollca_Johnnie', 'f', 'o'),
    ('Adde_Michal', 'm', 'y'), ('Viktor_Menelaos', 'm', 'y'),
)

_GENDER_BY_SEX = {'m': 'Male', 'f': 'Female'}


def get_speakers(gender_list):
    """
    Pick one voice name per entry of ``gender_list``, plus one extra male voice.

    Voices are handed out per gender in catalog order, all ``'y'`` entries
    first and then the ``'o'`` entries. When a gender's pool is exhausted it
    starts over from the beginning, so requesting more voices than the
    catalog holds reuses voices instead of raising ``IndexError``.

    Args:
        gender_list: Iterable of sequences whose first element is ``"Male"``
            or ``"Female"``.

    Returns:
        A list with one voice name per entry, followed by one additional
        ``"Male"`` voice.
        NOTE(review): the purpose of the trailing voice is not visible in
        this file (presumably a narrator) - confirm with the caller.

    Raises:
        KeyError: If an entry's first element is not ``"Male"``/``"Female"``.
    """
    ordered = {"Male": [], "Female": []}
    for age_group in ('y', 'o'):  # 'y' voices are preferred over 'o' ones
        for name, sex, age in _VOICE_CATALOG:
            if age == age_group:
                ordered[_GENDER_BY_SEX[sex]].append(name)

    # cycle() restarts a pool once every voice in it has been used.
    pools = {gender: cycle(names) for gender, names in ordered.items()}

    speaker_list = [next(pools[gender[0]]) for gender in gender_list]
    speaker_list.append(next(pools['Male']))
    return speaker_list