import numpy as np
import torch
from PIL import Image
import cv2
import face_recognition
from transformers import ProcessorMixin


class DeepFakeProcessor(ProcessorMixin):
    """Processor for DeepFake detection model."""

    def __init__(self, im_size=112, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
        # Default mean/std are the standard ImageNet normalization statistics.
        self.im_size = im_size
        self.mean = mean
        self.std = std
    def preprocess_frame(self, frame):
        """
        Preprocess a single frame.

        Args:
            frame: PIL Image or numpy array

        Returns:
            torch.Tensor: Processed frame tensor of shape (3, im_size, im_size)
        """
        # Convert to PIL Image if it's a numpy array
        if isinstance(frame, np.ndarray):
            frame = Image.fromarray(frame)
        # Ensure three channels so the normalization below broadcasts correctly
        frame = frame.convert("RGB")
        # Resize to the model input size
        frame = frame.resize((self.im_size, self.im_size))
        # Scale to [0, 1], normalize, and convert to a CHW tensor
        frame = np.array(frame).astype(np.float32) / 255.0
        frame = (frame - np.array(self.mean)) / np.array(self.std)
        frame = frame.transpose(2, 0, 1)  # HWC -> CHW
        frame = torch.tensor(frame, dtype=torch.float32)
        return frame
    def extract_frames(self, video_path, sequence_length=20, extract_faces=True):
        """
        Extract frames from a video file.

        Args:
            video_path: Path to the video file
            sequence_length: Number of frames to extract
            extract_faces: Whether to extract faces from frames

        Returns:
            torch.Tensor: Tensor of shape (1, sequence_length, 3, im_size, im_size)
        """
        frames = []
        # Open video file
        vidObj = cv2.VideoCapture(video_path)
        # Calculate frame interval so frames are sampled evenly across the video
        total_frames = int(vidObj.get(cv2.CAP_PROP_FRAME_COUNT))
        interval = max(1, total_frames // sequence_length)
        # Extract frames
        count = 0
        success = True
        while success and len(frames) < sequence_length:
            success, image = vidObj.read()
            if success and count % interval == 0:
                # Convert BGR (OpenCV default) to RGB
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                # Extract face if requested
                if extract_faces:
                    face_locations = face_recognition.face_locations(image)
                    if face_locations:
                        top, right, bottom, left = face_locations[0]
                        # Add padding around the detected face, clipped to the frame bounds
                        padding = 40
                        h, w = image.shape[:2]
                        top = max(0, top - padding)
                        bottom = min(h, bottom + padding)
                        left = max(0, left - padding)
                        right = min(w, right + padding)
                        image = image[top:bottom, left:right]
                # Preprocess frame
                processed_frame = self.preprocess_frame(image)
                frames.append(processed_frame)
            count += 1
        vidObj.release()
        # If we couldn't extract enough frames, duplicate the last one
        # (or pad with zeros if no frames were read at all)
        while len(frames) < sequence_length:
            frames.append(frames[-1] if frames else torch.zeros((3, self.im_size, self.im_size)))
        # Stack frames and add a batch dimension
        frames = torch.stack(frames)
        frames = frames.unsqueeze(0)
        return frames
    def __call__(self, video_path=None, frames=None, return_tensors="pt", **kwargs):
        """
        Process video for the model.

        Args:
            video_path: Path to the video file
            frames: List of frames (PIL Images or numpy arrays)
            return_tensors: Return format (only "pt" supported)

        Returns:
            dict: Processed inputs for the model
        """
        if return_tensors != "pt":
            raise ValueError("Only 'pt' return tensors are supported")
        if video_path is not None:
            # Extract frames from video
            sequence_length = kwargs.get("sequence_length", 20)
            extract_faces = kwargs.get("extract_faces", True)
            processed_frames = self.extract_frames(
                video_path,
                sequence_length=sequence_length,
                extract_faces=extract_faces,
            )
        elif frames is not None:
            # Process provided frames
            processed_frames = torch.stack([self.preprocess_frame(frame) for frame in frames])
            processed_frames = processed_frames.unsqueeze(0)  # Add batch dimension
        else:
            raise ValueError("Either video_path or frames must be provided")
        return {"pixel_values": processed_frames}