import os
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import timm

# -------------------------------
# Transformations for different model inputs
# -------------------------------
# For Model A and Model B, we use small images (50x50)
transform_small = transforms.Compose([
    transforms.Resize((50, 50)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3)
])

# For Model C, we use larger images (224x224)
transform_large = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3)
])


# --- Model A: CNN-based network for eye and nose regions (12 layers) ---
class ModelA(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelA, self).__init__()
        # Three convolutional blocks, each with 2 conv layers + BN, ReLU, pooling and dropout
        self.block1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.3)
        )
        self.block2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.3)
        )
        self.block3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.3)
        )
        # After three blocks, the feature map for a 50x50 input shrinks
        # 50 -> 25 -> 12 -> 6 (each MaxPool2d(2) floors odd sizes).
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 6 * 6, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.classifier(x)
        return x


# --- Model B: Simpler CNN-based network (6 layers) ---
class ModelB(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelB, self).__init__()
        # A lighter CNN architecture: three conv layers with pooling and dropout
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 6 * 6, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
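
# Sanity check (illustrative sketch, not part of the original pipeline):
# the 128 * 6 * 6 flatten size in ModelA/ModelB assumes 50x50 inputs,
# since each MaxPool2d(2) floors odd sizes: 50 -> 25 -> 12 -> 6. The
# helper name below is hypothetical.
def _check_feature_shape():
    with torch.no_grad():
        feats = ModelB(num_classes=2).features(torch.zeros(1, 3, 50, 50))
    assert feats.shape[1:] == (128, 6, 6), f"unexpected shape: {feats.shape}"
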
# --- Model C: CNN + ViT based network for the entire face ---
class ModelC(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelC, self).__init__()
        # Feature learning (FL) module: a deep CNN.
        # For demonstration, we use a simpler CNN here.
        self.cnn_feature_extractor = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # For a 224x224 input, three pooling stages reduce the feature map to 28x28.
        # Vision transformer backbone from the timm library
        # (install it with `pip install timm`).
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=True)
        # Replace the ViT head to match our number of classes.
        in_features = self.vit.head.in_features
        self.vit.head = nn.Linear(in_features, num_classes)

    def forward(self, x):
        # Extract lower-level CNN features. NOTE: `features` is computed but
        # not used below; it only marks where a CNN/ViT fusion would plug in.
        # Remove this line (or actually fuse the features) in a real implementation.
        features = self.cnn_feature_extractor(x)
        # For this demonstration, we feed the original image straight to the ViT.
        out = self.vit(x)
        return out
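
# ----------------------------------------------------------------------
# Usage sketch (assumptions not in the original: binary classification,
# batch size 4; ModelC needs network access the first time it runs so
# timm can download the pretrained ViT weights).
# ----------------------------------------------------------------------
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_a = ModelA(num_classes=2).to(device).eval()
    model_b = ModelB(num_classes=2).to(device).eval()
    model_c = ModelC(num_classes=2).to(device).eval()

    small = torch.randn(4, 3, 50, 50, device=device)    # eye/nose crops
    large = torch.randn(4, 3, 224, 224, device=device)  # full-face images

    with torch.no_grad():
        print(model_a(small).shape)  # torch.Size([4, 2])
        print(model_b(small).shape)  # torch.Size([4, 2])
        print(model_c(large).shape)  # torch.Size([4, 2])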