Spaces:

janmayjay
/

Sonofica

Running

App Files Files Community

Sonofica / utils /utilities.py

janmayjay

Add application file

39a7537 about 1 month ago

raw

history blame

13.1 kB

	# ✨ utilities.py ── reusable helpers
	import torch
	from pathlib import Path
	from typing import List, Dict
	import cv2
	import matplotlib.pyplot as plt
	import numpy as np
	import matplotlib.pyplot as plt
	import matplotlib.patches as patches
	import cv2
	from utils.identify_speaker import DetectionPredictions



	def load_model(path: Path,
	               device: str = "cpu") -> "AssocGCN":
	    """
	    Re-create the network and load the saved weights.

	    Args:
	        path: Checkpoint file handed to ``torch.load``.
	        device: Device the checkpoint tensors are mapped to and the model
	            is moved to.

	    Returns:
	        An ``AssocGCN`` instance carrying the checkpoint's
	        ``"model_state"`` weights, already switched to evaluation mode.
	    """
	    from train_speaker import AssocGCN  # import lazily to avoid circular ref

	    ckpt = torch.load(path, map_location=device)
	    model = AssocGCN().to(device)
	    model.load_state_dict(ckpt["model_state"])
	    model.eval()  # switch to inference mode

	    # The epoch is only reported in the log line; a checkpoint saved
	    # without it must not abort an otherwise successful load.
	    epoch = ckpt.get("epoch", "unknown")
	    print(f"🔄 Loaded checkpoint from {path} (epoch {epoch})")
	    return model

	def show_prediction_per_page(img_path, dets):
	    """
	    Display a page image with every detected region outlined.

	    Args:
	        img_path: Path of the image, read with ``cv2.imread``.
	        dets: Object exposing ``bodies``, ``faces``, ``panels`` and
	            ``speech_bubbles``; each entry is indexed with ``'bbox'`` to
	            obtain ``(x_min, y_min, x_max, y_max)``.

	    Prints a message and returns without plotting when the image cannot
	    be loaded.
	    """
	    image = cv2.imread(img_path)
	    if image is None:
	        # cv2.imread reports failure by returning None rather than raising.
	        print(f"❌ Could not load image for visualization: {img_path}")
	        return
	    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for display

	    # One (detections, colour) pair per kind of region. The colours are RGB
	    # because the image has already been converted above.
	    layers = (
	        (dets.bodies, (255, 0, 0)),            # bodies: red
	        (dets.faces, (0, 255, 0)),             # faces: green
	        (dets.panels, (0, 0, 255)),            # panels: blue
	        (dets.speech_bubbles, (255, 255, 0)),  # speech bubbles: yellow
	    )
	    for boxes, colour in layers:
	        for box in boxes:
	            # cv2.rectangle needs integer pixel coordinates.
	            x_min, y_min, x_max, y_max = (int(v) for v in box['bbox'])
	            cv2.rectangle(image, (x_min, y_min), (x_max, y_max),
	                          color=colour, thickness=2)

	    # Display the image
	    plt.imshow(image)
	    plt.axis('off')
	    plt.title('Manga Panel with Bounding Boxes')
	    plt.show()

	def show_panel_order(img_path, boxOrderEstimator):
	    """
	    Display a page image with its panels outlined and numbered in order.

	    Args:
	        img_path: Path of the image, read with ``cv2.imread``.
	        boxOrderEstimator: Object whose ``ordered_bbs`` is iterated in
	            order; each entry exposes ``xmin``, ``ymin``, ``width``,
	            ``height`` and a ``panels`` sequence.

	    An entry whose ``panels`` has exactly one element is drawn as a solid
	    red box labelled ``N``. Otherwise each element of ``panels`` is drawn
	    as a dotted orange box labelled ``(N)``. ``N`` is the 1-based position
	    in ``ordered_bbs``.

	    Prints a message and returns without plotting when the image cannot
	    be loaded.
	    """
	    image = cv2.imread(img_path)
	    if image is None:
	        # cv2.imread reports failure by returning None rather than raising.
	        print(f"❌ Could not load image for visualization: {img_path}")
	        return
	    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for display

	    def show_panel(panel, labeltext, linestyle="solid", edgecolor="red", ax=None):
	        """Outline *panel* on *ax* and write *labeltext* at its centre."""
	        fontsize = 12
	        if ax is None:
	            ax = plt.gca()

	        rect = patches.Rectangle(
	            (panel.xmin, panel.ymin),
	            panel.width,
	            panel.height,
	            linewidth=2,
	            linestyle=linestyle,
	            edgecolor=edgecolor,
	            facecolor="none",
	        )
	        ax.add_patch(rect)

	        centre_x = panel.xmin + panel.width / 2
	        centre_y = panel.ymin + panel.height / 2
	        ax.text(centre_x, centre_y, labeltext,
	                fontsize=fontsize, color=edgecolor,
	                ha='center', va='center')

	    _, ax = plt.subplots(figsize=(10, 7))  # This avoids multiple axes
	    # The image is always three-channel after the BGR->RGB conversion, so
	    # no grayscale colormap is needed.
	    ax.imshow(image)

	    # Draw bounding boxes
	    for order, panel in enumerate(boxOrderEstimator.ordered_bbs, start=1):
	        if len(panel.panels) == 1:
	            show_panel(panel, labeltext=f"{order}", edgecolor="red", ax=ax)
	        else:
	            for subpanel in panel.panels:
	                show_panel(subpanel, edgecolor="orange", linestyle="dotted",
	                           labeltext=f"({order})", ax=ax)

	    ax.axis('off')  # cleaner output
	    plt.tight_layout()
	    plt.show()


	# ============================================================================
	# VISUALIZATION
	# ============================================================================

	def visualize_speaker_results(image_path: str, predictions: DetectionPredictions,
	                              associations: Dict[str, Dict[int, int]], output_path: str = None):
	    """
	    Visualize detection results and speech-to-speaker associations.

	    Args:
	        image_path: Path of the image, read with ``cv2.imread``.
	        predictions: Object exposing ``panels``, ``speech_bubbles``,
	            ``faces`` and ``bodies``; every entry is indexed with
	            ``'bbox'`` and ``'id'``, and bubbles/faces also with
	            ``'panel_id'``.
	        associations: Maps a panel key to ``{bubble_idx: face_idx}``. The
	            panel number is the integer after the last ``'_'`` of the key
	            (0 when it cannot be parsed); the indices address the bubbles
	            and faces whose ``'panel_id'`` equals that number.
	        output_path: When truthy the annotated image is written there with
	            ``cv2.imwrite``; otherwise it is shown with matplotlib.

	    Prints a message and returns when the image cannot be loaded.
	    """
	    img = cv2.imread(image_path)
	    if img is None:
	        print(f"❌ Could not load image for visualization: {image_path}")
	        return

	    # Colors for different elements (BGR format for OpenCV)
	    colors = {
	        'panel': (255, 0, 0),         # Blue
	        'bubble': (0, 255, 0),        # Green
	        'face': (0, 0, 255),          # Red
	        'body': (255, 255, 0),        # Cyan
	        'association': (255, 0, 255)  # Magenta
	    }

	    # (detections, color key, label prefix, label lift in px, font scale,
	    #  line thickness) -- drawn in this order so later layers sit on top.
	    layers = (
	        (predictions.panels, 'panel', "Panel ", 10, 0.6, 2),
	        (predictions.speech_bubbles, 'bubble', "T", 5, 0.5, 2),
	        (predictions.faces, 'face', "F", 5, 0.5, 2),
	        (predictions.bodies, 'body', "B", 5, 0.4, 1),
	    )
	    for items, color_key, prefix, lift, font_scale, thickness in layers:
	        color = colors[color_key]
	        for item in items:
	            bbox = [int(x) for x in item['bbox']]
	            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
	                          color, thickness)
	            cv2.putText(img, f"{prefix}{item['id']}", (bbox[0], bbox[1] - lift),
	                        cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, thickness)

	    # Draw associations
	    link_color = colors['association']
	    for panel_id, mapping in associations.items():
	        # Extract panel number from panel_id string
	        try:
	            panel_num = int(panel_id.split('_')[-1])
	        except (AttributeError, TypeError, ValueError):
	            # Unparseable key: fall back to panel 0 (best effort).
	            panel_num = 0

	        panel_bubbles = [b for b in predictions.speech_bubbles
	                         if b['panel_id'] == panel_num]
	        panel_faces = [f for f in predictions.faces
	                       if f['panel_id'] == panel_num]

	        for bubble_idx, face_idx in mapping.items():
	            if not (bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces)):
	                continue
	            b_bbox = panel_bubbles[bubble_idx]['bbox']
	            f_bbox = panel_faces[face_idx]['bbox']

	            b_center = (int((b_bbox[0] + b_bbox[2]) / 2),
	                        int((b_bbox[1] + b_bbox[3]) / 2))
	            f_center = (int((f_bbox[0] + f_bbox[2]) / 2),
	                        int((f_bbox[1] + f_bbox[3]) / 2))

	            # Draw association line
	            cv2.line(img, b_center, f_center, link_color, 3)
	            cv2.circle(img, b_center, 5, link_color, -1)
	            cv2.circle(img, f_center, 5, link_color, -1)

	            # Add association number. ASCII arrow: the Hershey fonts used
	            # by cv2.putText cannot render "→" and would draw "?" instead.
	            mid_point = (int((b_center[0] + f_center[0]) / 2),
	                         int((b_center[1] + f_center[1]) / 2))
	            cv2.putText(img, f"{bubble_idx}->{face_idx}", mid_point,
	                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, link_color, 2)

	    # Save or display
	    if output_path:
	        # cv2.imwrite returns False when nothing was written.
	        if cv2.imwrite(output_path, img):
	            print(f"📷 Visualization saved to {output_path}")
	        else:
	            print(f"❌ Could not save visualization to {output_path}")
	    else:
	        # The image is BGR (OpenCV order); matplotlib expects RGB, so
	        # convert before showing or red and blue appear swapped.
	        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
	        plt.axis('off')
	        plt.title('Speech-to-Speaker Associations')
	        plt.show()

	def process_results(predictions, boxOrderEstimator, pairs, labels, results):
	    """
	    Annotate ``predictions`` in place with reading order and character ids.

	    Args:
	        predictions: Object exposing ``panels``, ``faces`` and
	            ``speech_bubbles`` lists of dicts. Every dict has a ``'bbox'``
	            of four values; faces and bubbles also carry ``'panel_id'``.
	        boxOrderEstimator: Object whose ``ordered_bbs`` entries expose
	            ``xmin``, ``ymin``, ``xmax``, ``ymax`` and ``panels``.
	        pairs: Sequence of ``(face_bbox, body_bbox)``; only the face box
	            is used.
	        labels: ``labels[i]`` is the character id for ``pairs[i]``.
	        results: Maps a panel key to ``{bubble_idx: face_idx}``. The panel
	            number is the integer after the last ``'_'`` of the key (0 when
	            it cannot be parsed); the indices address the bubbles and faces
	            whose ``'panel_id'`` equals that number.

	    Returns:
	        The same ``predictions`` object, where:
	        * each panel whose bbox exactly equals an ordered box (or one of
	          its sub-panels when it has more than one) gets ``'order'``, the
	          0-based position of that ordered box;
	        * each face whose bbox exactly equals a pair's face box gets
	          ``'char_id'`` = ``int(labels[i])``;
	        * each associated bubble gets ``'char_id'`` copied from its face
	          (``None`` when the face has none).
	    """
	    # Assign order to each panel
	    for i_panel, panel in enumerate(boxOrderEstimator.ordered_bbs):
	        boxes = [panel]
	        if len(panel.panels) > 1:
	            # Sub-panels of a merged box share the parent's order.
	            boxes.extend(panel.panels)
	        for box in boxes:
	            corners = (box.xmin, box.ymin, box.xmax, box.ymax)
	            for pred_panel in _items_with_bbox(predictions.panels, corners):
	                pred_panel['order'] = i_panel

	    # Assign character id to each face
	    for i, (face, _body) in enumerate(pairs):
	        for pred_face in _items_with_bbox(predictions.faces, face):
	            pred_face['char_id'] = int(labels[i])

	    # Assign character id to each text
	    for panel_id, mapping in results.items():
	        try:
	            panel_num = int(panel_id.split('_')[-1])
	        except (AttributeError, TypeError, ValueError):
	            # Unparseable key: fall back to panel 0 (best effort).
	            panel_num = 0

	        panel_bubbles = [b for b in predictions.speech_bubbles
	                         if b['panel_id'] == panel_num]
	        panel_faces = [f for f in predictions.faces
	                       if f['panel_id'] == panel_num]

	        for bubble_idx, face_idx in mapping.items():
	            if bubble_idx < len(panel_bubbles) and face_idx < len(panel_faces):
	                # The filtered lists hold the very same dict objects as
	                # ``predictions``, so writing through them updates the
	                # predictions directly; no lookup by id is required.
	                speaker = panel_faces[face_idx]
	                panel_bubbles[bubble_idx]['char_id'] = speaker.get('char_id', None)

	    return predictions


	def _items_with_bbox(items, corners):
	    """Return the entries of *items* whose ``'bbox'`` equals *corners* exactly."""
	    x1, y1, x2, y2 = corners
	    matches = []
	    for item in items:
	        x3, y3, x4, y4 = item['bbox']
	        if x1 == x3 and y1 == y3 and x2 == x4 and y2 == y4:
	            matches.append(item)
	    return matches

	def get_speakers(gender_list):
	    """
	    Pick one voice name per entry of ``gender_list`` plus one extra male voice.

	    Args:
	        gender_list: Sequence whose items are indexed with ``[0]`` to get
	            the pool key, ``"Male"`` or ``"Female"``.

	    Returns:
	        A list with one voice name per input entry, in input order,
	        followed by one more name taken from the male pool. Within each
	        pool, voices tagged ``'y'`` are handed out before those tagged
	        ``'o'``, each group in table order, and no voice is returned twice.
	    """
	    # (voice name, gender tag, age tag)
	    voice_table = (
	        ('Alexandra_Hisakawa', 'f', 'o'), ('Annmarie_Nele', 'f', 'o'),
	        ('Luis_Moray', 'm', 'y'), ('Baldur_Sanjin', 'm', 'o'),
	        ('Ige_Behringer', 'm', 'o'), ('Eugenio_Mataracı', 'm', 'y'),
	        ('Camilla_Holmström', 'f', 'y'), ('Uta_Obando', 'f', 'o'),
	        ('Alma_María', 'f', 'o'), ('Gitta_Nikolina', 'f', 'o'),
	        ('Kazuhiko_Atallah', 'm', 'y'), ('Tammie_Ema', 'f', 'o'),
	        ('Barbora_MacLean', 'f', 'y'), ('Rosemary_Okafor', 'f', 'o'),
	        ('Craig_Gutsy', 'm', 'o'), ('Gilberto_Mathias', 'm', 'o'),
	        ('Suad_Qasim', 'f', 'o'), ('Chandra_MacFarland', 'f', 'o'),
	        ('Marcos_Rudaski', 'm', 'o'), ('Brenda_Stern', 'f', 'y'),
	        ('Ana_Florence', 'f', 'y'), ('Royston_Min', 'm', 'y'),
	        ('Tammy_Grit', 'f', 'y'), ('Tanja_Adelina', 'f', 'y'),
	        ('Kumar_Dahl', 'm', 'o'), ('Abrahan_Mack', 'm', 'o'),
	        ('Daisy_Studious', 'f', 'o'), ('Alison_Dietlinde', 'f', 'o'),
	        ('Ilkin_Urbano', 'm', 'o'), ('Dionisio_Schuyler', 'm', 'o'),
	        ('Narelle_Moon', 'f', 'o'), ('Asya_Anara', 'f', 'o'),
	        ('Viktor_Eka', 'm', 'o'), ('Vjollca_Johnnie', 'f', 'o'),
	        ('Adde_Michal', 'm', 'y'), ('Viktor_Menelaos', 'm', 'y'),
	    )
	    pool_key = {'m': 'Male', 'f': 'Female'}

	    # Fill the pools in two passes so every 'y' voice precedes every 'o'
	    # voice of the same gender.
	    pools = {"Male": [], "Female": []}
	    for age_tag in ('y', 'o'):
	        for name, gender_tag, age in voice_table:
	            if age == age_tag and gender_tag in pool_key:
	                pools[pool_key[gender_tag]].append(name)

	    # Hand out the next unused voice of the requested pool for each entry.
	    chosen = [pools[entry[0]].pop(0) for entry in gender_list]
	    # One additional male voice always closes the list.
	    chosen.append(pools['Male'].pop(0))
	    return chosen