import functools

import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification

from process_audio import create_mel_spectrograms

# Hugging Face checkpoint fine-tuned for audio-deepfake classification on
# mel-spectrogram images.
_MODEL_NAME = "kubinooo/convnext-tiny-224-audio-deepfake-classification"


@functools.lru_cache(maxsize=1)
def _load_model_and_processor():
    """Load (once) and cache the image processor and classification model.

    Loading from the hub is expensive; caching avoids re-instantiating the
    model for every spectrogram passed to ``predict_image``.
    """
    processor = AutoImageProcessor.from_pretrained(_MODEL_NAME)
    model = AutoModelForImageClassification.from_pretrained(_MODEL_NAME)
    # Fix: the original used "gpu", which is not a valid torch device string
    # ("cuda" is) and made model.to(device) raise on CUDA machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    return processor, model, device


def predict_image(image):
    """Classify a single mel-spectrogram PIL image as real or fake audio.

    Args:
        image: a PIL image (any mode; converted to RGB if needed).

    Returns:
        dict with keys "real" and "fake"; the predicted class gets 1.0 and
        the other 0.0 (hard one-hot confidence, as in the original code).
    """
    processor, model, device = _load_model_and_processor()

    if image.mode != 'RGB':
        image = image.convert('RGB')
    # Fix: PIL's resize() returns a new image — the original discarded the
    # result, so no resize actually happened. Assign it back. (The processor
    # also resizes to the model's expected 224x224 input, so this is a
    # belt-and-braces normalization.)
    image = image.resize((224, 224))

    pixel_values = processor(image, return_tensors="pt").pixel_values
    # Fix: move the input tensor to the model's device; the original left it
    # on CPU, which would fail against a CUDA-resident model.
    pixel_values = pixel_values.to(device)

    with torch.no_grad():
        outputs = model(pixel_values)

    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()
    prediction_label = model.config.id2label[predicted_class_idx]

    if prediction_label.lower() == "real":
        return {"real": 1.0, "fake": 0.0}  # 100% confidence for real
    else:  # prediction_label == "fake"
        return {"real": 0.0, "fake": 1.0}


def prediction(file_path):
    """Classify an audio file by averaging per-spectrogram predictions.

    The audio is split into mel-spectrogram images (2-second windows, no
    overlap — see create_mel_spectrograms), each image is classified, and
    the hard votes are averaged into fractional confidences.

    Args:
        file_path: path to the audio file to analyze.

    Returns:
        dict with "real" and "fake" average scores rounded to 2 decimals;
        both 0.0 when no spectrograms could be produced.
    """
    pil_images = create_mel_spectrograms(file_path, 2, 0)
    total = len(pil_images)
    if total == 0:
        # No analyzable audio — return neutral zeros rather than divide by 0.
        return {"real": 0.0, "fake": 0.0}

    total_real = 0.0
    total_fake = 0.0
    for image in pil_images:
        pred = predict_image(image)
        total_real += pred["real"]
        total_fake += pred["fake"]

    return {
        "real": round(total_real / total, 2),
        "fake": round(total_fake / total, 2)
    }