---
language:
- en
metrics:
- accuracy
- f1
- precision
- recall
pipeline_tag: video-classification
tags:
- i3d
- pytorch
- crime-detection
---

# Smart Surveillance System

We leveraged a pre-trained I3D model and fine-tuned it using two strategies:

- **Block-level tuning:** adjusting and retraining groups of layers (blocks) to adapt the model to the new dataset.
- **Layer-level tuning:** fine-tuning specific layers for more granular control over feature learning.

The final classification layer of the I3D model was removed and replaced with a custom output layer tailored to our binary classification task: predicting whether an activity represents a crime (1) or non-crime (0).

## How to Run

```python
import torch
import torch.nn as nn


class UCFModel(nn.Module):
    """I3D video backbone whose final projection is replaced by a 2-class head."""

    def __init__(self, model_name="i3d_r50"):
        super().__init__()
        self.model_name = model_name
        self.model = torch.hub.load("facebookresearch/pytorchvideo", model_name, pretrained=True)
        # Replace the pre-trained classification layer with a binary
        # (non-crime = 0 / crime = 1) output layer of the same input width.
        in_features = self.model.blocks[-1].proj.in_features
        self.model.blocks[-1].proj = nn.Linear(in_features, 2)

    def forward(self, frames):
        return self.model(frames)
```

```python
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from torchvision import transforms

# Per-frame preprocessing: resize to 224x224, convert to a float tensor and
# normalize each channel with the mean/std values below.
inference_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)


class UCFInferenceByFrames:
    """Download the fine-tuned weights from the Hub and classify a list of frames."""

    def __init__(self, repo_id):
        self.repo_id = repo_id
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.load_model()

    def load_model(self):
        """Fetch `ucf_model.pth` from `repo_id` and return the model in eval mode."""
        model_path = hf_hub_download(repo_id=self.repo_id, filename="ucf_model.pth")
        # map_location remaps the stored tensors onto the selected device, so a
        # checkpoint saved on a GPU still loads on a CPU-only machine.
        state_dict = torch.load(model_path, map_location=self.device)
        model = UCFModel().to(device=self.device)
        model.load_state_dict(state_dict)
        model.eval()
        return model

    def inference(self, frames):
        """Return the predicted class index tensor for a sequence of image arrays."""
        video_tensor_list = []
        for frame in frames:
            frame_pil = Image.fromarray(frame)
            frame_tensor = inference_transform(frame_pil)
            video_tensor_list.append(frame_tensor)

        # Stack to (T, C, H, W), reorder to (C, T, H, W) and add a batch axis.
        video_tensor = torch.stack(video_tensor_list)
        video_tensor = video_tensor.permute(1, 0, 2, 3).unsqueeze(0).float()
        video_tensor = video_tensor.to(self.device)

        with torch.no_grad():
            output = self.model(video_tensor)

        return output.argmax(1)
```

```python
import cv2 as cv
import numpy as np

ucf = UCFInferenceByFrames("amjad-awad/ucf-i3d-model-by-3-block-lr-0.001")


def inference(ucf_model, video_path, max_frames=16):
    """Classify a video file as "Crime" or "No-Crime".

    Reads every frame of `video_path`, keeps `max_frames` evenly spaced
    frames and passes them to `ucf_model.inference`. Prints "No video" and
    returns None when the file cannot be opened or contains no frames.
    """
    cap = cv.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            print("No video")
            return None

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # NOTE(review): OpenCV returns frames in BGR order while PIL assumes
            # RGB; no conversion is done here. Confirm how the training frames
            # were loaded before adding cv.cvtColor(frame, cv.COLOR_BGR2RGB).
            frames.append(frame)
    finally:
        # Always free the capture handle, including on early return or error.
        cap.release()

    if not frames:
        # The file opened but yielded no frames; sampling would index an empty list.
        print("No video")
        return None

    length = len(frames)
    # Evenly spaced frame indices across the whole clip (repeats when the clip
    # has fewer than max_frames frames).
    indices = np.linspace(0, length - 1, max_frames, dtype=int)
    frames = [frames[i] for i in indices]

    predict = ucf_model.inference(frames)

    return "Crime" if int(predict) == 1 else "No-Crime"
```

```python
predict = inference(ucf_model=ucf, video_path="YOUR_VIDEO_PATH.mp4")
print(predict)
```