Sonofica / utils /utilities.py
janmayjay's picture
Add application file
39a7537
raw
history blame
13.1 kB
# ✨ utilities.py ── reusable helpers
import torch
from pathlib import Path
from typing import List, Dict
import cv2
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
from utils.identify_speaker import DetectionPredictions
def load_model(path: Path,
               device: str = "cpu") -> "AssocGCN":
    """
    Rebuild an ``AssocGCN`` network and restore its weights from a checkpoint.

    Args:
        path: Location of the checkpoint file passed to ``torch.load``.
        device: Device string used both as ``map_location`` when reading the
            checkpoint and as the target of ``.to(...)`` for the network.

    Returns:
        The network with the checkpoint's ``"model_state"`` loaded, already
        switched to evaluation mode.

    Raises:
        KeyError: If the checkpoint has no ``"model_state"`` or ``"epoch"``
            entry.
    """
    # Deferred import: pulling this in at module level would create a
    # circular reference between the two modules.
    from train_speaker import AssocGCN

    checkpoint = torch.load(path, map_location=device)

    network = AssocGCN().to(device)
    network.load_state_dict(checkpoint["model_state"])
    network.eval()  # inference mode

    print(f"🔄 Loaded checkpoint from {path} (epoch {checkpoint['epoch']})")
    return network
def show_prediction_per_page(img_path, dets):
    """
    Display a page image with every detected region outlined.

    Args:
        img_path: Path of the page image, handed to ``cv2.imread``.
        dets: Detection container exposing ``bodies``, ``faces``, ``panels``
            and ``speech_bubbles``. Each of these is iterated and every item
            is indexed with ``'bbox'`` to obtain
            ``(x_min, y_min, x_max, y_max)``.

    Returns:
        None. The annotated image is shown with matplotlib. If the image
        cannot be read, a message is printed and nothing is displayed.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports an unreadable/missing file by returning None;
        # without this guard the failure surfaces as an opaque cvtColor error.
        print(f"❌ Could not load image for visualization: {img_path}")
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR -> RGB for matplotlib

    # Colours are RGB triples because the image was converted above.
    layers = (
        (dets.bodies, (255, 0, 0)),            # bodies: red
        (dets.faces, (0, 255, 0)),             # faces: green
        (dets.panels, (0, 0, 255)),            # panels: blue
        (dets.speech_bubbles, (255, 255, 0)),  # speech bubbles: yellow
    )
    for boxes, colour in layers:
        for box in boxes:
            # cv2.rectangle needs integer pixel coordinates; cast the same way
            # visualize_speaker_results does so float boxes are accepted too.
            x_min, y_min, x_max, y_max = (int(v) for v in box['bbox'])
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max),
                          color=colour, thickness=2)

    # Display the image
    plt.imshow(image)
    plt.axis('off')
    plt.title('Manga Panel with Bounding Boxes')
    plt.show()
def show_panel_order(img_path, boxOrderEstimator):
    """
    Display a page image with each panel outlined and labelled by its order.

    Args:
        img_path: Path of the page image, handed to ``cv2.imread``.
        boxOrderEstimator: Object exposing ``ordered_bbs``, an iterable of
            panel boxes. Each box provides ``xmin``, ``ymin``, ``width``,
            ``height`` and a ``panels`` sequence.

    Returns:
        None. The figure is shown with matplotlib. If the image cannot be
        read, a message is printed and nothing is displayed.

    A box whose ``panels`` has exactly one entry is drawn as a solid red
    rectangle labelled ``N``. Otherwise each entry of ``panels`` is drawn as
    a dotted orange rectangle labelled ``(N)``, where N is the 1-based
    position of the box in ``ordered_bbs``.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports an unreadable/missing file by returning None.
        print(f"❌ Could not load image for visualization: {img_path}")
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR -> RGB for matplotlib

    def show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
        """Outline *panel* on *ax* and write *labeltext* at its centre."""
        fontsize = 12
        if ax is None:
            ax = plt.gca()
        rect = patches.Rectangle(
            (panel.xmin, panel.ymin),
            panel.width,
            panel.height,
            linewidth=2,
            linestyle=linestyle,
            edgecolor=edgecolor,
            facecolor="none",
        )
        ax.add_patch(rect)
        centre_x = panel.xmin + panel.width / 2
        centre_y = panel.ymin + panel.height / 2
        ax.text(centre_x, centre_y, labeltext,
                fontsize=fontsize, color=edgecolor,
                ha='center', va='center')

    # A single explicit Axes so every rectangle lands on the same plot.
    _, ax = plt.subplots(figsize=(10, 7))

    # After the BGR->RGB conversion the array always has three channels, so
    # no grayscale colormap handling is needed here.
    ax.imshow(image)

    for order, panel in enumerate(boxOrderEstimator.ordered_bbs, start=1):
        if len(panel.panels) == 1:
            show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
        else:
            for subpanel in panel.panels:
                show_panel(subpanel, edgecolor="orange", linestyle="dotted",
                           labeltext=f"({order})", ax=ax)

    ax.axis('off')  # cleaner output
    plt.tight_layout()
    plt.show()
# ============================================================================
# VISUALIZATION
# ============================================================================
def visualize_speaker_results(image_path: str, predictions: DetectionPredictions,
                              associations: Dict[str, Dict[int, int]], output_path: str = None):
    """
    Visualize detection results and speech-to-speaker associations.

    Args:
        image_path: Path of the page image, handed to ``cv2.imread``.
        predictions: Detection container exposing ``panels``,
            ``speech_bubbles``, ``faces`` and ``bodies``. Every item is
            indexed with ``'bbox'`` and ``'id'``; bubbles and faces are also
            indexed with ``'panel_id'``.
        associations: Maps a panel key to ``{bubble_idx: face_idx}``. The
            panel number is taken from the text after the last ``'_'`` in
            the key (0 if that cannot be parsed). The indices address the
            bubbles / faces of that panel in the order they appear in
            ``predictions``.
        output_path: If truthy, the annotated image is written there with
            ``cv2.imwrite``; otherwise it is shown with matplotlib.

    Returns:
        None. If the image cannot be read, a message is printed and the
        function returns early.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"❌ Could not load image for visualization: {image_path}")
        return

    # Colors for different elements (BGR format for OpenCV)
    colors = {
        'panel': (255, 0, 0),         # Blue
        'bubble': (0, 255, 0),        # Green
        'face': (0, 0, 255),          # Red
        'body': (255, 255, 0),        # Cyan
        'association': (255, 0, 255)  # Magenta
    }

    # One row per element type:
    # (items, color key, label prefix, label lift above the box in px,
    #  font scale, line/text thickness)
    layers = (
        (predictions.panels, 'panel', "Panel ", 10, 0.6, 2),
        (predictions.speech_bubbles, 'bubble', "T", 5, 0.5, 2),
        (predictions.faces, 'face', "F", 5, 0.5, 2),
        (predictions.bodies, 'body', "B", 5, 0.4, 1),
    )
    for items, color_key, prefix, lift, font_scale, thickness in layers:
        color = colors[color_key]
        for item in items:
            bbox = [int(v) for v in item['bbox']]
            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                          color, thickness)
            cv2.putText(img, f"{prefix}{item['id']}", (bbox[0], bbox[1] - lift),
                        cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, thickness)

    # Draw associations
    for panel_id, mapping in associations.items():
        # Extract panel number from the panel_id string; fall back to panel 0
        # when the key is not a string or has no numeric suffix.
        try:
            panel_num = int(panel_id.split('_')[-1])
        except (AttributeError, TypeError, ValueError):
            panel_num = 0

        panel_bubbles = [b for b in predictions.speech_bubbles
                         if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces
                       if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            # Reject negative indices as well: they would silently wrap
            # around and connect the wrong bubble/face.
            if not (0 <= bubble_idx < len(panel_bubbles)
                    and 0 <= face_idx < len(panel_faces)):
                continue

            b_bbox = panel_bubbles[bubble_idx]['bbox']
            f_bbox = panel_faces[face_idx]['bbox']
            b_center = (int((b_bbox[0] + b_bbox[2]) / 2),
                        int((b_bbox[1] + b_bbox[3]) / 2))
            f_center = (int((f_bbox[0] + f_bbox[2]) / 2),
                        int((f_bbox[1] + f_bbox[3]) / 2))

            # Draw association line with a dot on each end
            cv2.line(img, b_center, f_center, colors['association'], 3)
            cv2.circle(img, b_center, 5, colors['association'], -1)
            cv2.circle(img, f_center, 5, colors['association'], -1)

            # Label the link at its midpoint. An ASCII separator keeps
            # e.g. (1, 2) distinguishable from (12, ...); Hershey fonts
            # cannot render non-ASCII arrows.
            mid_point = (int((b_center[0] + f_center[0]) / 2),
                         int((b_center[1] + f_center[1]) / 2))
            cv2.putText(img, f"{bubble_idx}->{face_idx}", mid_point,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors['association'], 2)

    # Save or display
    if output_path:
        # cv2.imwrite returns False when the file could not be written.
        if cv2.imwrite(output_path, img):
            print(f"📷 Visualization saved to {output_path}")
        else:
            print(f"❌ Could not save visualization to {output_path}")
    else:
        # The image was drawn in OpenCV's BGR order; matplotlib expects RGB,
        # so convert before showing or red and blue appear swapped.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('Speech-to-Speaker Associations')
        plt.show()
def _items_with_bbox(items, corners):
    """Return the items whose ``'bbox'`` equals *corners* coordinate-for-coordinate."""
    x1, y1, x2, y2 = corners
    matches = []
    for item in items:
        x3, y3, x4, y4 = item['bbox']
        if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
            matches.append(item)
    return matches


def process_results(predictions, boxOrderEstimator, pairs, labels, results):
    """
    Merge reading order, character ids and speaker ids into ``predictions``.

    The detection dicts inside ``predictions`` are updated in place:

    * ``predictions.panels[i]['order']`` is set to the 0-based position of
      the matching box in ``boxOrderEstimator.ordered_bbs``. When an ordered
      box has more than one entry in its ``panels``, those entries are
      matched too and receive the same order as their parent.
    * ``predictions.faces[i]['char_id']`` is set to ``int(labels[k])`` for
      the face whose bbox equals the face box of ``pairs[k]``.
    * ``predictions.speech_bubbles[i]['char_id']`` is set to the ``char_id``
      of the face it is associated with in ``results`` (``None`` if that
      face has no ``char_id``).

    Boxes are matched by exact coordinate equality.

    Args:
        predictions: Container exposing ``panels``, ``faces`` and
            ``speech_bubbles`` (lists of dicts with ``'bbox'``; faces and
            bubbles also carry ``'panel_id'``).
        boxOrderEstimator: Object exposing ``ordered_bbs``; each box has
            ``xmin``, ``ymin``, ``xmax``, ``ymax`` and ``panels``.
        pairs: Sequence of ``(face_box, body_box)`` tuples; only the face box
            is used here.
        labels: Sequence indexed in parallel with ``pairs``.
        results: Maps a panel key to ``{bubble_idx: face_idx}``. The panel
            number is the text after the last ``'_'`` in the key (0 if it
            cannot be parsed); the indices address that panel's bubbles /
            faces in the order they appear in ``predictions``.

    Returns:
        The same ``predictions`` object that was passed in.
    """
    # Assign order to each panel (and to sub-panels of merged boxes).
    for order, panel in enumerate(boxOrderEstimator.ordered_bbs):
        boxes = [panel]
        if len(panel.panels) > 1:
            boxes.extend(panel.panels)
        for box in boxes:
            corners = (box.xmin, box.ymin, box.xmax, box.ymax)
            for pred_panel in _items_with_bbox(predictions.panels, corners):
                pred_panel['order'] = order

    # Assign character id to each face.
    for i, (face, _body) in enumerate(pairs):
        for pred_face in _items_with_bbox(predictions.faces, face):
            pred_face['char_id'] = int(labels[i])

    # Assign character id to each speech bubble.
    for panel_id, mapping in results.items():
        # Fall back to panel 0 when the key is not a string or has no
        # numeric suffix.
        try:
            panel_num = int(panel_id.split('_')[-1])
        except (AttributeError, TypeError, ValueError):
            panel_num = 0

        panel_bubbles = [b for b in predictions.speech_bubbles
                         if b['panel_id'] == panel_num]
        panel_faces = [f for f in predictions.faces
                       if f['panel_id'] == panel_num]

        for bubble_idx, face_idx in mapping.items():
            # Negative indices are rejected too: they would silently wrap
            # around to the end of the list.
            if not (0 <= bubble_idx < len(panel_bubbles)
                    and 0 <= face_idx < len(panel_faces)):
                continue
            # The filtered lists hold the very same dict objects as
            # ``predictions``, so writing through them updates the
            # predictions directly and cannot hit a different bubble/face
            # that happens to share an id.
            speaker = panel_faces[face_idx]
            panel_bubbles[bubble_idx]['char_id'] = speaker.get('char_id', None)

    return predictions
def get_speakers(gender_list):
    """
    Pick one speaker name per entry of ``gender_list``, plus one extra male.

    Args:
        gender_list: Iterable whose entries are indexed with ``[0]`` to get
            the pool key, which must be ``"Male"`` or ``"Female"``
            (presumably ``(label, score)`` pairs — confirm with the caller).

    Returns:
        A list with one name per entry of ``gender_list`` in the same order,
        followed by one additional name from the male pool (presumably the
        narrator voice — confirm with the caller).

    Raises:
        KeyError: If an entry's first element is neither ``"Male"`` nor
            ``"Female"``.

    Within each gender, names tagged ``'y'`` are handed out before names
    tagged ``'o'``, each group in catalogue order. When more names of a
    gender are requested than the catalogue holds, assignment wraps around
    and reuses names from the start of that pool instead of failing.
    """
    from itertools import cycle

    # (name, gender code, age tag)
    catalog = (
        ('Alexandra_Hisakawa', 'f', 'o'), ('Annmarie_Nele', 'f', 'o'),
        ('Luis_Moray', 'm', 'y'), ('Baldur_Sanjin', 'm', 'o'),
        ('Ige_Behringer', 'm', 'o'), ('Eugenio_Mataracı', 'm', 'y'),
        ('Camilla_Holmström', 'f', 'y'), ('Uta_Obando', 'f', 'o'),
        ('Alma_María', 'f', 'o'), ('Gitta_Nikolina', 'f', 'o'),
        ('Kazuhiko_Atallah', 'm', 'y'), ('Tammie_Ema', 'f', 'o'),
        ('Barbora_MacLean', 'f', 'y'), ('Rosemary_Okafor', 'f', 'o'),
        ('Craig_Gutsy', 'm', 'o'), ('Gilberto_Mathias', 'm', 'o'),
        ('Suad_Qasim', 'f', 'o'), ('Chandra_MacFarland', 'f', 'o'),
        ('Marcos_Rudaski', 'm', 'o'), ('Brenda_Stern', 'f', 'y'),
        ('Ana_Florence', 'f', 'y'), ('Royston_Min', 'm', 'y'),
        ('Tammy_Grit', 'f', 'y'), ('Tanja_Adelina', 'f', 'y'),
        ('Kumar_Dahl', 'm', 'o'), ('Abrahan_Mack', 'm', 'o'),
        ('Daisy_Studious', 'f', 'o'), ('Alison_Dietlinde', 'f', 'o'),
        ('Ilkin_Urbano', 'm', 'o'), ('Dionisio_Schuyler', 'm', 'o'),
        ('Narelle_Moon', 'f', 'o'), ('Asya_Anara', 'f', 'o'),
        ('Viktor_Eka', 'm', 'o'), ('Vjollca_Johnnie', 'f', 'o'),
        ('Adde_Michal', 'm', 'y'), ('Viktor_Menelaos', 'm', 'y'),
    )

    # Build the per-gender pools: all 'y' entries first, then all 'o'.
    pool_key = {'m': 'Male', 'f': 'Female'}
    pools = {"Male": [], "Female": []}
    for age_tag in ('y', 'o'):
        for name, gender_code, age in catalog:
            if age == age_tag:
                pools[pool_key[gender_code]].append(name)

    # cycle() hands names out in pool order and restarts once a pool is
    # exhausted, so a long cast reuses voices rather than raising IndexError.
    next_voice = {key: cycle(names) for key, names in pools.items()}

    speaker_list = [next(next_voice[gender[0]]) for gender in gender_list]
    speaker_list.append(next(next_voice['Male']))
    return speaker_list