import torch
from torch import nn
from torchvision import models, transforms
from torch.utils.data.dataset import Dataset
import numpy as np
import matplotlib.pyplot as plt
import cv2
import face_recognition

# Run on the GPU when one is available; fall back to CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Model(nn.Module):
  def __init__(self, num_classes, latent_dim=2048, lstm_layers=1, hidden_dim=2048, bidirectional=False):
    super(Model, self).__init__()
    # ResNeXt-50 backbone with the final avgpool and fc layers removed,
    # leaving a (2048, h, w) feature map per frame.
    model = models.resnext50_32x4d(pretrained=True)
    self.model = nn.Sequential(*list(model.children())[:-2])
    # batch_first=True because the input below is (batch, seq, feature).
    # bidirectional must be passed by keyword: the fourth positional
    # argument of nn.LSTM is bias, not bidirectional.
    self.lstm = nn.LSTM(latent_dim, hidden_dim, lstm_layers,
                        batch_first=True, bidirectional=bidirectional)
    self.relu = nn.LeakyReLU()
    self.dp = nn.Dropout(0.4)
    self.linear1 = nn.Linear(hidden_dim, num_classes)  # would need 2*hidden_dim if bidirectional
    self.avgpool = nn.AdaptiveAvgPool2d(1)

  def forward(self, x):
    # Fold the sequence into the batch dimension so every frame passes
    # through the CNN backbone in a single call.
    batch_size, seq_length, c, h, w = x.shape
    x = x.view(batch_size * seq_length, c, h, w)
    fmap = self.model(x)
    x = self.avgpool(fmap)
    x = x.view(batch_size, seq_length, 2048)
    # Classify from the LSTM output at the last time step.
    x_lstm, _ = self.lstm(x, None)
    return fmap, self.dp(self.linear1(x_lstm[:, -1, :]))
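
# A minimal shape-check sketch (an assumption added here, not part of the
# original file): with 112x112 inputs the ResNeXt backbone emits a 4x4
# feature map per frame, and the classifier emits one logit row per clip.
#
#   model = Model(num_classes=2).to(device).eval()
#   clip = torch.randn(1, 20, 3, 112, 112).to(device)   # one clip of 20 frames
#   with torch.no_grad():
#     fmap, logits = model(clip)
#   print(fmap.shape, logits.shape)  # (20, 2048, 4, 4) and (1, 2)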
  
im_size = 112
mean = [0.485, 0.456, 0.406]      # ImageNet mean
std = [0.229, 0.224, 0.225]       # ImageNet std
sm = nn.Softmax(dim=1)            # softmax over the class dimension
# Undo the ImageNet normalisation so tensors can be rendered as images.
inv_normalize = transforms.Normalize(mean=-1*np.divide(mean, std), std=np.divide([1, 1, 1], std))

def im_convert(tensor):
  """Convert a normalised tensor back to a displayable HWC image in [0, 1]."""
  image = tensor.to('cpu').clone().detach()
  image = image.squeeze()
  image = inv_normalize(image)
  image = image.numpy()
  image = image.transpose(1, 2, 0)
  image = image.clip(0, 1)
  cv2.imwrite('./2.png', image*255)  # side effect: save a debug copy of the frame
  return image

def predict(model, img, path='./'):
  fmap, logits = model(img.to(device))
  weight_softmax = model.linear1.weight.detach().cpu().numpy()
  logits = sm(logits)
  _, prediction = torch.max(logits, 1)
  confidence = logits[:, int(prediction.item())].item()*100
  print('confidence of prediction', confidence)
  # Class activation map: project the last frame's feature map onto the
  # classifier weights of the predicted class.
  idx = np.argmax(logits.detach().cpu().numpy())
  bz, nc, h, w = fmap.shape
  out = np.dot(fmap[-1].detach().cpu().numpy().reshape((nc, h*w)).T, weight_softmax[idx, :].T)
  cam = out.reshape(h, w)
  cam = cam - np.min(cam)
  cam_img = cam / np.max(cam)
  cam_img = np.uint8(255*cam_img)
  out = cv2.resize(cam_img, (im_size, im_size))
  heatmap = cv2.applyColorMap(out, cv2.COLORMAP_JET)
  img = im_convert(img[:, -1, :, :, :])
  # Overlay the heatmap on the last frame and save the result.
  result = heatmap * 0.5 + img*0.8*255
  cv2.imwrite('./1.png', result)
  result1 = heatmap * 0.5/255 + img*0.8
  b, g, r = cv2.split(result1)     # applyColorMap returns BGR;
  result1 = cv2.merge((r, g, b))   # reorder to RGB for matplotlib
  plt.imshow(result1)
  plt.show()
  return [int(prediction.item()), confidence]

class validation_dataset(Dataset):
  def __init__(self, video_names, sequence_length=60, transform=None):
    self.video_names = video_names
    self.transform = transform
    self.count = sequence_length

  def __len__(self):
    return len(self.video_names)

  def __getitem__(self, idx):
    video_path = self.video_names[idx]
    frames = []
    for i, frame in enumerate(self.frame_extract(video_path)):
      # Crop to the first detected face; fall back to the full frame
      # when no face is found in this frame.
      faces = face_recognition.face_locations(frame)
      if faces:
        top, right, bottom, left = faces[0]
        frame = frame[top:bottom, left:right, :]
      if self.transform:
        frames.append(self.transform(frame))
      if len(frames) == self.count:
        break
    if len(frames) == 0:
      print('no face found in the video')
      return None
    frames = torch.stack(frames)
    frames = frames[:self.count]
    return frames.unsqueeze(0)

  def frame_extract(self, path):
    # Yield frames from the video one at a time until the stream ends.
    vidObj = cv2.VideoCapture(path)
    success = True
    while success:
      success, image = vidObj.read()
      if success:
        yield image


# Standalone helper: de-normalise a tensor with the Kinetics mean/std
# and display it with matplotlib.
def im_plot(tensor):
  image = tensor.cpu().numpy().transpose(1, 2, 0)
  b, g, r = cv2.split(image)       # BGR -> RGB for matplotlib
  image = cv2.merge((r, g, b))
  image = image*[0.22803, 0.22145, 0.216989] + [0.43216, 0.394666, 0.37645]
  image = image*255.0
  plt.imshow(image.astype(int))
  plt.show()
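
# A minimal end-to-end sketch of how these pieces fit together. The video
# path and checkpoint path below are hypothetical placeholders, and the
# mapping of label 1 to REAL and 0 to FAKE is an assumption, not something
# this file confirms.
if __name__ == '__main__':
  eval_transforms = transforms.Compose([
      transforms.ToPILImage(),
      transforms.Resize((im_size, im_size)),
      transforms.ToTensor(),
      transforms.Normalize(mean, std)])
  dataset = validation_dataset(['./example.mp4'],        # hypothetical video path
                               sequence_length=20,
                               transform=eval_transforms)
  model = Model(num_classes=2).to(device)
  model.load_state_dict(torch.load('./checkpoint.pt',    # hypothetical checkpoint
                                   map_location=device))
  model.eval()
  prediction = predict(model, dataset[0])
  print('REAL' if prediction[0] == 1 else 'FAKE',
        '({:.2f}% confidence)'.format(prediction[1]))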