def compute_iou(boxA, boxB):
    """
    Compute the overlap between two boxes given as (x1, y1, x2, y2).

    Note: this returns the intersection area divided by boxA's area
    (a containment ratio), not a symmetric IoU. That is what the
    matching step needs: it measures how much of a face box (boxA)
    lies inside a body box (boxB).
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interW = max(0, xB - xA)
    interH = max(0, yB - yA)
    interArea = interW * interH

    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    return interArea / boxAArea if boxAArea > 0 else 0.0


def match_faces_to_bodies(face_boxes, body_boxes, iou_threshold=0.9):
    """
    Greedily match each body box to the unused face box with the highest
    overlap above `iou_threshold`.

    Returns a list of (face_box or None, body_box). If no face meets the
    threshold for a body, face_box is None.
    """
    matches = []
    used_faces = set()
    for b in body_boxes:
        best_iou = 0.0
        best_f = None
        for i, f in enumerate(face_boxes):
            if i in used_faces:
                continue
            iou = compute_iou(f, b)
            if iou > best_iou:
                best_iou = iou
                best_f = i
        if best_f is not None and best_iou >= iou_threshold:
            matches.append((face_boxes[best_f], b))
            used_faces.add(best_f)
        else:
            # No face sufficiently inside this body → treat body alone
            matches.append((None, b))
    return matches

# matches = match_faces_to_bodies(predictions_face, predictions_body)
# for face_box, body_box in matches:
#     print(face_box, body_box)

import cv2
import numpy as np


def crop_and_prepare_embeddings(image_path, face_boxes, body_boxes, model, fuse_features):
    """
    For each matched (face, body) pair:
      - If both are present: crop both and fuse their embeddings.
      - If only one is present: fuse its embedding with itself, so every
        output has the same dimensionality (np.vstack requires that later).

    Returns (embeddings, pairs), where pairs[i] is the (face_box, body_box)
    tuple that produced embeddings[i].
    """
    img = cv2.imread(image_path)
    pairs = match_faces_to_bodies(face_boxes, body_boxes)
    embeddings = []
    new_pairs = []
    for face_box, body_box in pairs:
        if face_box is not None:
            x1, y1, x2, y2 = map(int, face_box)  # YOLO coords may be floats
            face_crop = img[y1:y2, x1:x2]
            face_emb = model(face_crop)
        else:
            face_emb = None

        if body_box is not None:
            x1, y1, x2, y2 = map(int, body_box)
            body_crop = img[y1:y2, x1:x2]
            body_emb = model(body_crop)
        else:
            body_emb = None

        if face_emb is not None and body_emb is not None:
            emb = fuse_features(face_emb, body_emb)
        elif face_emb is not None:
            emb = fuse_features(face_emb, face_emb)
        else:
            emb = fuse_features(body_emb, body_emb)

        embeddings.append(emb)
        new_pairs.append((face_box, body_box))
    return embeddings, new_pairs


"""
Prerequisites
-------------
pip install torch torchvision scikit-learn opencv-python pillow tqdm

Download a backbone checkpoint that outputs 128-D embeddings. Here we
assume `./models/model.pth` is a ResNet-50 fine-tuned on manga faces/bodies.
"""

import cv2
import torch
import torch.nn as nn
import torchvision.transforms as T
import numpy as np
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from pathlib import Path

# -------------------------------------------------------------------
# 1. Detection post-processing: crop_faces() is given; add crop_bodies()
# -------------------------------------------------------------------
def crop_bodies(image_path, boxes):
    """Crop body regions given YOLO boxes (x1, y1, x2, y2)."""
    img = cv2.imread(image_path)
    crops = []
    for (x1, y1, x2, y2) in boxes:
        body = img[int(y1):int(y2), int(x1):int(x2)]
        crops.append(body)
    return crops


def crop_faces(image_path, boxes):
    """Crop face regions given YOLO boxes (x1, y1, x2, y2)."""
    image = cv2.imread(image_path)
    faces = []
    for (x1, y1, x2, y2) in boxes:
        face = image[int(y1):int(y2), int(x1):int(x2)]  # crop the face
        faces.append(face)
    return faces
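# -------------------------------------------------------------------
# Sanity check for the matching step (a minimal sketch; the box
# coordinates below are made up for illustration, not real detections).
# A face fully inside a body gives a containment ratio of 1.0, so it
# clears the 0.9 threshold; a body with no overlapping face gets None.
# -------------------------------------------------------------------
def _demo_matching():
    face = (40, 20, 80, 60)            # lies fully inside the body box
    body = (30, 10, 120, 200)
    stray_body = (300, 10, 400, 200)   # a body with no detected face

    assert compute_iou(face, body) == 1.0  # face fully contained in body
    for face_box, body_box in match_faces_to_bodies([face], [body, stray_body]):
        print(face_box, body_box)
    # Expected output:
    # (40, 20, 80, 60) (30, 10, 120, 200)
    # None (300, 10, 400, 200)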
# -------------------------------------------------------------------
# 2. Embedding network wrapper
# -------------------------------------------------------------------
class ReIDNet(nn.Module):
    """Backbone that outputs a 128-D L2-normalised embedding."""

    def __init__(self, ckpt="./models/model.pth", dim=128):
        super().__init__()
        base = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', weights=None)
        base.fc = nn.Linear(base.fc.in_features, dim)  # dim must match the checkpoint
        self.backbone = base

        def add_prefix(state_dict, prefix="backbone."):
            # Checkpoints saved from the bare ResNet lack the "backbone."
            # prefix this wrapper expects, so patch the keys before loading.
            return {f"{prefix}{k}" if not k.startswith(prefix) else k: v
                    for k, v in state_dict.items()}

        raw = torch.load(ckpt, map_location="cpu")
        patched = add_prefix(raw)
        self.load_state_dict(patched)
        self.backbone.eval()

        # Normalisation identical to training. Note: cv2 delivers BGR;
        # convert crops to RGB first if the backbone was trained on RGB.
        self.preprocess = T.Compose([
            T.ToPILImage(),
            T.Resize((112, 112)),
            T.ToTensor(),
            T.Normalize(mean=[0.5] * 3, std=[0.5] * 3)
        ])

    @torch.inference_mode()
    def forward(self, np_image):
        # 1×3×112×112, on the same device as the backbone weights
        tensor = self.preprocess(np_image).unsqueeze(0).to(
            next(self.backbone.parameters()).device)
        feat = self.backbone(tensor)
        return nn.functional.normalize(feat, p=2, dim=1).cpu().numpy().ravel()

# -------------------------------------------------------------------
# 3. Utility to fuse face & body features
# -------------------------------------------------------------------
def fuse_features(face_f, body_f, w_face=0.6):
    """Simple weighted late fusion: two 128-D inputs yield one 256-D output."""
    return np.concatenate([w_face * face_f, (1 - w_face) * body_f])

# -------------------------------------------------------------------
# 4. Main pipeline for a single page
# -------------------------------------------------------------------
def count_characters_single_page(img_path, predictions, eps=0.3,
                                 min_samples=3, device='cpu'):
    model = ReIDNet().to(device)
    face_boxes = [face['bbox'] for face in predictions.faces]
    body_boxes = [body['bbox'] for body in predictions.bodies]

    embeddings, pairs = crop_and_prepare_embeddings(
        img_path, face_boxes, body_boxes, model, fuse_features)
    if not embeddings:  # nothing detected on this page
        return 0, np.array([]), pairs
    fused_embeds = np.vstack(embeddings)   # N×256

    # ----------------------------------------------------------------
    # 5. DBSCAN clustering
    # ----------------------------------------------------------------
    db = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
    labels = db.fit_predict(fused_embeds)  # -1 => noise

    # Count unique non-noise cluster IDs
    unique_chars = len(set(l for l in labels if l != -1))
    return unique_chars, labels, pairs
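# -------------------------------------------------------------------
# 6. Example usage (a minimal sketch, assuming a checkpoint exists at
#    ./models/model.pth and that the detector output exposes .faces /
#    .bodies as lists of {'bbox': (x1, y1, x2, y2)} dicts; the page
#    path and boxes below are placeholders, not real data)
# -------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    predictions = SimpleNamespace(
        faces=[{'bbox': (40, 20, 80, 60)}],
        bodies=[{'bbox': (30, 10, 120, 200)}, {'bbox': (300, 10, 400, 200)}],
    )
    # min_samples=1 lets a character seen only once still form a cluster;
    # the default of 3 would mark such singletons as noise (-1).
    n_chars, labels, pairs = count_characters_single_page(
        "page_001.jpg", predictions, eps=0.3, min_samples=1)
    print(f"Estimated distinct characters: {n_chars}")
    print("Cluster labels per crop:", labels)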