import os
import numpy as np
from PIL import Image
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import timm
# -------------------------------
# Transformations for different model inputs
# -------------------------------
# For Model A and Model B, we use small images (50x50).
transform_small = transforms.Compose([
    transforms.Resize((50, 50)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3)
])
# For Model C, we use larger images (224x224).
transform_large = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3)
])
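# Illustrative sanity check (not part of the training pipeline): both
# transforms accept a PIL image of any size and return a normalized
# 3-channel float tensor. The blank 80x60 image is a stand-in for a real crop.
def _check_transforms():
    img = Image.new("RGB", (80, 60))
    assert transform_small(img).shape == (3, 50, 50)
    assert transform_large(img).shape == (3, 224, 224)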
# --- Model A: CNN-based network for eye and nose regions (12 layers) ---
class ModelA(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelA, self).__init__()
        # Three convolutional blocks, each with two conv layers plus
        # batch norm, ReLU, pooling, and dropout.
        self.block1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.3)
        )
        self.block2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.3)
        )
        self.block3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.3)
        )
        # After three 2x2 poolings, a 50x50 input shrinks 50 -> 25 -> 12 -> 6
        # (MaxPool2d floors odd sizes), hence the 128 * 6 * 6 flatten below.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 6 * 6, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.classifier(x)
        return x
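# Quick shape check for ModelA (illustrative only): with eval() the batch-norm
# layers use running statistics, so a tiny random batch is enough to confirm
# that the 128 * 6 * 6 flatten matches a 50x50 input.
def _check_model_a():
    model = ModelA().eval()
    with torch.no_grad():
        out = model(torch.randn(2, 3, 50, 50))
    assert out.shape == (2, 2)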
# --- Model B: Simpler CNN-based network (6 layers) ---
class ModelB(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelB, self).__init__()
        # A lighter CNN architecture: three conv layers with pooling and dropout.
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3)
        )
        # Same 50x50 -> 6x6 spatial reduction as ModelA, so the classifier
        # input size matches.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 6 * 6, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
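# ModelB drops the paired conv layers and batch norm of ModelA, so it has
# noticeably fewer parameters; this illustrative helper makes the comparison
# concrete, e.g. _count_params(ModelB()) < _count_params(ModelA()).
def _count_params(model):
    return sum(p.numel() for p in model.parameters())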
# --- Model C: CNN + ViT based network for the entire face ---
class ModelC(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelC, self).__init__()
        # Feature learning (FL) module: a deep CNN.
        # For demonstration, we use a simpler CNN here.
        self.cnn_feature_extractor = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # After three 2x2 poolings, a 224x224 input yields a 28x28 feature map.
        # Vision transformer backbone from the timm library
        # (install with: pip install timm).
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=True)
        # Replace the ViT head to match our number of classes.
        in_features = self.vit.head.in_features
        self.vit.head = nn.Linear(in_features, num_classes)

    def forward(self, x):
        # For this demonstration, only the ViT branch is used; the CNN
        # features are not computed here, to avoid wasted work. A more
        # advanced implementation could fuse the CNN features with the ViT
        # (see the sketch below).
        out = self.vit(x)
        return out
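# One possible way to realize the CNN/ViT fusion mentioned above (a sketch,
# not the implementation used in this file): creating the timm model with
# num_classes=0 makes it return pooled features instead of logits, which can
# then be concatenated with globally pooled CNN features before a joint head.
class ModelCFused(nn.Module):
    def __init__(self, num_classes=2):
        super(ModelCFused, self).__init__()
        self.cnn_feature_extractor = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.pool = nn.AdaptiveAvgPool2d(1)  # 256x28x28 -> 256x1x1
        self.vit = timm.create_model('vit_base_patch16_224',
                                     pretrained=True, num_classes=0)
        # num_features is 768 for vit_base_patch16_224.
        self.head = nn.Linear(self.vit.num_features + 256, num_classes)

    def forward(self, x):
        cnn_feat = self.pool(self.cnn_feature_extractor(x)).flatten(1)  # (B, 256)
        vit_feat = self.vit(x)                                          # (B, 768)
        return self.head(torch.cat([cnn_feat, vit_feat], dim=1))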