def compute_iou(boxA, boxB):
    """
    Compute the overlap between two boxes given as (x1, y1, x2, y2).

    Note: this returns the intersection area divided by boxA's area
    (a containment ratio), not a symmetric IoU. That is what the
    matching step needs: it measures how much of a face box (boxA)
    lies inside a body box (boxB).
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interW = max(0, xB - xA)
    interH = max(0, yB - yA)
    interArea = interW * interH

    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    return interArea / boxAArea if boxAArea > 0 else 0.0


def match_faces_to_bodies(face_boxes, body_boxes, iou_threshold=0.9):
    """
    Greedily match each body box to the unused face box with the highest
    overlap above `iou_threshold`.

    Returns a list of (face_box or None, body_box). If no face meets the
    threshold for a body, face_box is None.
    """
    matches = []
    used_faces = set()
    for b in body_boxes:
        best_iou = 0.0
        best_f = None
        for i, f in enumerate(face_boxes):
            if i in used_faces:
                continue
            iou = compute_iou(f, b)
            if iou > best_iou:
                best_iou = iou
                best_f = i
        if best_f is not None and best_iou >= iou_threshold:
            matches.append((face_boxes[best_f], b))
            used_faces.add(best_f)
        else:
            # No face sufficiently inside this body → treat body alone
            matches.append((None, b))
    return matches

# matches = match_faces_to_bodies(predictions_face, predictions_body)
# for face_box, body_box in matches:
#     print(face_box, body_box)

import cv2
import numpy as np


def crop_and_prepare_embeddings(image_path, face_boxes, body_boxes, model, fuse_features):
    """
    For each matched (face, body) pair:
      - If both are present: crop both and fuse their embeddings.
      - If only one is present: fuse its embedding with itself, so every
        output has the same dimensionality (np.vstack requires that later).

    Returns (embeddings, pairs), where pairs[i] is the (face_box, body_box)
    tuple that produced embeddings[i].
    """
    img = cv2.imread(image_path)
    pairs = match_faces_to_bodies(face_boxes, body_boxes)
    embeddings = []
    new_pairs = []
    for face_box, body_box in pairs:
        if face_box is not None:
            x1, y1, x2, y2 = map(int, face_box)  # YOLO coords may be floats
            face_crop = img[y1:y2, x1:x2]
            face_emb = model(face_crop)
        else:
            face_emb = None

        if body_box is not None:
            x1, y1, x2, y2 = map(int, body_box)
            body_crop = img[y1:y2, x1:x2]
            body_emb = model(body_crop)
        else:
            body_emb = None

        if face_emb is not None and body_emb is not None:
            emb = fuse_features(face_emb, body_emb)
        elif face_emb is not None:
            emb = fuse_features(face_emb, face_emb)
        else:
            emb = fuse_features(body_emb, body_emb)

        embeddings.append(emb)
        new_pairs.append((face_box, body_box))
    return embeddings, new_pairs


"""
Prerequisites
-------------
pip install torch torchvision scikit-learn opencv-python pillow tqdm

Download a backbone checkpoint that outputs 128-D embeddings. Here we
assume `./models/model.pth` is a ResNet-50 fine-tuned on manga faces/bodies.
"""

import cv2
import torch
import torch.nn as nn
import torchvision.transforms as T
import numpy as np
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from pathlib import Path

# -------------------------------------------------------------------
# 1. Detection post-processing: crop_faces() is given; add crop_bodies()
# -------------------------------------------------------------------
def crop_bodies(image_path, boxes):
    """Crop body regions given YOLO boxes (x1, y1, x2, y2)."""
    img = cv2.imread(image_path)
    crops = []
    for (x1, y1, x2, y2) in boxes:
        body = img[int(y1):int(y2), int(x1):int(x2)]
        crops.append(body)
    return crops


def crop_faces(image_path, boxes):
    """Crop face regions given YOLO boxes (x1, y1, x2, y2)."""
    image = cv2.imread(image_path)
    faces = []
    for (x1, y1, x2, y2) in boxes:
        face = image[int(y1):int(y2), int(x1):int(x2)]  # crop the face
        faces.append(face)
    return faces
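# -------------------------------------------------------------------
# Sanity check for the matching step (a minimal sketch; the box
# coordinates below are made up for illustration, not real detections).
# A face fully inside a body gives a containment ratio of 1.0, so it
# clears the 0.9 threshold; a body with no overlapping face gets None.
# -------------------------------------------------------------------
def _demo_matching():
    face = (40, 20, 80, 60)            # lies fully inside the body box
    body = (30, 10, 120, 200)
    stray_body = (300, 10, 400, 200)   # a body with no detected face

    assert compute_iou(face, body) == 1.0  # face fully contained in body
    for face_box, body_box in match_faces_to_bodies([face], [body, stray_body]):
        print(face_box, body_box)
    # Expected output:
    # (40, 20, 80, 60) (30, 10, 120, 200)
    # None (300, 10, 400, 200)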
# -------------------------------------------------------------------
# 2. Embedding network wrapper
# -------------------------------------------------------------------
class ReIDNet(nn.Module):
    """Backbone that outputs a 128-D L2-normalised embedding."""

    def __init__(self, ckpt="./models/model.pth", dim=128):
        super().__init__()
        base = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', weights=None)
        base.fc = nn.Linear(base.fc.in_features, dim)  # dim must match the checkpoint
        self.backbone = base

        def add_prefix(state_dict, prefix="backbone."):
            # Checkpoints saved from the bare ResNet lack the "backbone."
            # prefix this wrapper expects, so patch the keys before loading.
            return {f"{prefix}{k}" if not k.startswith(prefix) else k: v
                    for k, v in state_dict.items()}

        raw = torch.load(ckpt, map_location="cpu")
        patched = add_prefix(raw)
        self.load_state_dict(patched)
        self.backbone.eval()

        # Normalisation identical to training. Note: cv2 delivers BGR;
        # convert crops to RGB first if the backbone was trained on RGB.
        self.preprocess = T.Compose([
            T.ToPILImage(),
            T.Resize((112, 112)),
            T.ToTensor(),
            T.Normalize(mean=[0.5] * 3, std=[0.5] * 3)
        ])

    @torch.inference_mode()
    def forward(self, np_image):
        # 1×3×112×112, on the same device as the backbone weights
        tensor = self.preprocess(np_image).unsqueeze(0).to(
            next(self.backbone.parameters()).device)
        feat = self.backbone(tensor)
        return nn.functional.normalize(feat, p=2, dim=1).cpu().numpy().ravel()

# -------------------------------------------------------------------
# 3. Utility to fuse face & body features
# -------------------------------------------------------------------
def fuse_features(face_f, body_f, w_face=0.6):
    """Simple weighted late fusion: two 128-D inputs yield one 256-D output."""
    return np.concatenate([w_face * face_f, (1 - w_face) * body_f])

# -------------------------------------------------------------------
# 4. Main pipeline for a single page
# -------------------------------------------------------------------
def count_characters_single_page(img_path, predictions, eps=0.3,
                                 min_samples=3, device='cpu'):
    model = ReIDNet().to(device)
    face_boxes = [face['bbox'] for face in predictions.faces]
    body_boxes = [body['bbox'] for body in predictions.bodies]

    embeddings, pairs = crop_and_prepare_embeddings(
        img_path, face_boxes, body_boxes, model, fuse_features)
    if not embeddings:  # nothing detected on this page
        return 0, np.array([]), pairs
    fused_embeds = np.vstack(embeddings)   # N×256

    # ----------------------------------------------------------------
    # 5. DBSCAN clustering
    # ----------------------------------------------------------------
    db = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
    labels = db.fit_predict(fused_embeds)  # -1 => noise

    # Count unique non-noise cluster IDs
    unique_chars = len(set(l for l in labels if l != -1))
    return unique_chars, labels, pairs
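# -------------------------------------------------------------------
# 6. Example usage (a minimal sketch, assuming a checkpoint exists at
#    ./models/model.pth and that the detector output exposes .faces /
#    .bodies as lists of {'bbox': (x1, y1, x2, y2)} dicts; the page
#    path and boxes below are placeholders, not real data)
# -------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    predictions = SimpleNamespace(
        faces=[{'bbox': (40, 20, 80, 60)}],
        bodies=[{'bbox': (30, 10, 120, 200)}, {'bbox': (300, 10, 400, 200)}],
    )
    # min_samples=1 lets a character seen only once still form a cluster;
    # the default of 3 would mark such singletons as noise (-1).
    n_chars, labels, pairs = count_characters_single_page(
        "page_001.jpg", predictions, eps=0.3, min_samples=1)
    print(f"Estimated distinct characters: {n_chars}")
    print("Cluster labels per crop:", labels)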