---
language:
- en
metrics:
- accuracy
- f1
- precision
- recall
pipeline_tag: video-classification
tags:
- i3d
- pytorch
- crime-detection
---
|
|
|
|
|
# Smart Surveillance System |
|
|
|
We leveraged a pre-trained I3D model and fine-tuned it using two strategies:
|
|
|
- **Block-level tuning:** Adjusting and retraining groups of layers (blocks) to adapt the model to the new dataset.
|
|
|
- **Layer-level tuning:** Fine-tuning specific layers for more granular control over feature learning.
|
|
|
The final classification layer of the I3D model was removed and replaced with a custom output layer tailored to our binary classification task: predicting whether an activity represents a crime (1) or non-crime (0). |
|
|
|
## How to Run
|
|
|
```python |
|
import torch |
|
import torch.nn as nn |
|
|
|
|
|
class UCFModel(nn.Module):
    """Hub-loaded video backbone whose final projection is replaced by a new head.

    The backbone is fetched with ``torch.hub.load`` from
    ``facebookresearch/pytorchvideo``. The ``proj`` linear layer of its last
    block is then swapped for a fresh ``nn.Linear`` that keeps the same input
    width and emits ``num_classes`` logits.

    Args:
        model_name: Hub entry point to load. Defaults to ``"i3d_r50"``.
        num_classes: Number of output logits. Defaults to 2, the binary
            crime (1) / non-crime (0) task.
        pretrained: Forwarded to ``torch.hub.load``. Pass ``False`` to skip
            fetching the hub weights when a fine-tuned ``state_dict`` is going
            to be loaded immediately afterwards.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """

    def __init__(self, model_name="i3d_r50", num_classes=2, pretrained=True):
        super().__init__()

        # Fail fast, before any download is attempted.
        if num_classes < 1:
            raise ValueError(f"num_classes must be >= 1, got {num_classes}")

        self.model_name = model_name
        self.num_classes = num_classes

        self.model = torch.hub.load(
            "facebookresearch/pytorchvideo", model_name, pretrained=pretrained
        )

        # Reuse the input width of the existing projection so only the output
        # dimension of the classification head changes.
        head = self.model.blocks[-1]
        head.proj = nn.Linear(head.proj.in_features, num_classes)

    def forward(self, frames):
        """Return the wrapped model's output (logits) for ``frames``."""
        return self.model(frames)
|
|
|
``` |
|
|
|
```python |
|
import torch |
|
from PIL import Image |
|
from huggingface_hub import hf_hub_download |
|
from torchvision import transforms |
|
|
|
|
|
# Per-frame preprocessing applied to each PIL image before stacking:
# resize to 224x224, convert to a tensor, then normalize every channel.
# The mean/std values match the commonly used ImageNet statistics.
_FRAME_SIZE = (224, 224)
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_frame_steps = [
    transforms.Resize(_FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
inference_transform = transforms.Compose(_frame_steps)
|
|
|
|
|
class UCFInferenceByFrames:
    """Load the fine-tuned checkpoint from the Hugging Face Hub and classify frames.

    Args:
        repo_id: Hub repository that contains ``ucf_model.pth``.
    """

    def __init__(self, repo_id):
        self.repo_id = repo_id

        # Use the GPU when one is visible, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.load_model()

    def load_model(self):
        """Download ``ucf_model.pth`` and return a ``UCFModel`` in eval mode."""
        model_path = hf_hub_download(repo_id=self.repo_id, filename="ucf_model.pth")

        # map_location remaps the stored tensors onto the device in use, so a
        # checkpoint saved from a GPU run still loads on a CPU-only machine.
        state_dict = torch.load(model_path, map_location=self.device)

        model = UCFModel().to(device=self.device)
        model.load_state_dict(state_dict)
        model.eval()

        return model

    def inference(self, frames):
        """Predict the class index for one clip given as a sequence of frames.

        Args:
            frames: Iterable of image arrays accepted by ``Image.fromarray``.

        Returns:
            The tensor ``output.argmax(1)``: the index of the largest logit
            for the single clip in the batch.
        """
        # NOTE(review): frames are used exactly as given, with no channel
        # reordering. If they come from OpenCV they are BGR; confirm this
        # matches how the model was trained.
        frame_tensors = [
            inference_transform(Image.fromarray(frame)) for frame in frames
        ]

        # Stacking gives (T, C, H, W); the permute swaps the first two axes to
        # (C, T, H, W) and unsqueeze adds a leading batch dimension of 1.
        video_tensor = torch.stack(frame_tensors)
        video_tensor = video_tensor.permute(1, 0, 2, 3).unsqueeze(0).float()
        video_tensor = video_tensor.to(self.device)

        with torch.no_grad():
            output = self.model(video_tensor)

        return output.argmax(1)
|
``` |
|
|
|
```python |
|
import cv2 as cv |
|
import numpy as np |
|
|
|
# Hub repository that hosts the fine-tuned checkpoint.
MODEL_REPO_ID = "amjad-awad/ucf-i3d-model-by-block-lr-0.01"
ucf = UCFInferenceByFrames(MODEL_REPO_ID)
|
|
|
def inference(ucf_model, video_path, max_frames=16):
    """Classify a video file as ``"Crime"`` or ``"No-Crime"``.

    Every frame of the video is decoded, ``max_frames`` of them are picked at
    evenly spaced positions (positions repeat when the video is shorter than
    ``max_frames``), and the selection is passed to ``ucf_model.inference``.

    Args:
        ucf_model: Object exposing ``inference(frames)`` whose result can be
            converted with ``int()``.
        video_path: Path handed to ``cv.VideoCapture``.
        max_frames: Number of frames sampled from the video.

    Returns:
        ``"Crime"`` when the prediction equals 1, ``"No-Crime"`` otherwise.
        ``None`` (after printing ``"No video"``) when the video cannot be
        opened or yields no frames.
    """
    cap = cv.VideoCapture(video_path)

    try:
        if not cap.isOpened():
            print("No video")
            return None

        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        # Always release the capture handle, including on the early return.
        cap.release()

    if not frames:
        # Sampling indices over an empty list would raise IndexError, so an
        # empty video is treated the same way as an unopenable one.
        print("No video")
        return None

    indices = np.linspace(0, len(frames) - 1, max_frames, dtype=int)
    sampled = [frames[i] for i in indices]
    predict = ucf_model.inference(sampled)

    return "Crime" if int(predict) == 1 else "No-Crime"
|
|
|
``` |
|
|
|
```python |
|
# Replace the placeholder with the path of the clip to classify.
video_path = "YOUR_VIDEO_PATH.mp4"
predict = inference(ucf_model=ucf, video_path=video_path)
print(predict)
|
``` |