Rahulk2197 committed
Commit ee94b36 · verified · 1 Parent(s): 78e78e3

Upload 18 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ models/res10_300x300_ssd_iter_140000.caffemodel filter=lfs diff=lfs merge=lfs -text
+ models/shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
functions/__pycache__/audio.cpython-312.pyc ADDED
Binary file (5.15 kB)

functions/__pycache__/fer.cpython-312.pyc ADDED
Binary file (7.79 kB)

functions/__pycache__/helper.cpython-312.pyc ADDED
Binary file (3.09 kB)

functions/__pycache__/models.cpython-312.pyc ADDED
Binary file (2.76 kB)

functions/__pycache__/valence_arousal.cpython-312.pyc ADDED
Binary file (4.86 kB)

functions/__pycache__/video.cpython-312.pyc ADDED
Binary file (4.89 kB)
functions/audio.py ADDED
@@ -0,0 +1,97 @@
+ import librosa
+ import numpy as np
+ import torch
+ from collections import Counter
+ import nltk
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe):
+     y, sr = librosa.load(audio_path, sr=16000)
+     inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
+     inputs = inputs.to(device, dtype=torch_dtype)
+     with torch.no_grad():
+         generated_ids = asrmodel.generate(inputs)
+     transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     # Sound intensity (RMS)
+     rms = librosa.feature.rms(y=y)
+     sound_intensity = np.mean(rms)
+
+     # Fundamental frequency (F0)
+     f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
+     fundamental_frequency = np.nanmean(f0)
+
+     # Spectral energy (based on STFT)
+     S = np.abs(librosa.stft(y))
+     spectral_energy = np.mean(np.sum(S ** 2, axis=0))
+
+     # Spectral centroid
+     spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+     avg_spectral_centroid = np.mean(spectral_centroid)
+
+     # Zero-crossing rate
+     zcr = librosa.feature.zero_crossing_rate(y)
+     zero_crossing_rate = np.mean(zcr)
+
+     # Pause detection (librosa.effects.split returns the *non-silent* spans)
+     silence_threshold = 40  # top_db must be positive; energy 40 dB below the peak counts as silence
+     non_silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
+     voiced_duration = 0
+     for start, end in non_silent_intervals:
+         voiced_duration += (end - start) / sr  # voiced time in seconds
+
+     total_duration = librosa.get_duration(y=y, sr=sr)
+     pause_rate = ((total_duration - voiced_duration) / total_duration) * 60  # pause seconds per minute of audio
+
+     # Transcript processing
+     words = nltk.word_tokenize(transcript)
+     num_words = len(words)
+     unique_words = len(set(words))
+     word_frequencies = Counter(words)
+
+     duration_minutes = total_duration / 60
+     avg_words_per_minute = num_words / duration_minutes
+     avg_unique_words_per_minute = unique_words / duration_minutes
+
+     # Count of unique words
+     unique_word_count = unique_words
+
+     # Filler word detection
+     filler_words = [
+         'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
+         'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
+         'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
+         'totally', 'honestly', 'seriously', 'alright'
+     ]
+     filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
+     filler_words_per_minute = filler_word_count / duration_minutes
+
+     # POS tagging
+     pos_tags = nltk.pos_tag(words)
+     nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
+     adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
+     verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
+
+     # Sentiment analysis
+     sentiment = sentipipe(transcript)
+
+     print("Nouns: ", nouns)
+     print("Adjectives: ", adjectives)
+     print("Verbs: ", verbs)
+
+     return {
+         "transcript": transcript,
+         "sentiment": sentiment,
+         "sound_intensity": float(sound_intensity),  # numpy scalars cast to plain Python types
+         "fundamental_frequency": float(fundamental_frequency),
+         "spectral_energy": float(spectral_energy),
+         "spectral_centroid": float(avg_spectral_centroid),
+         "zero_crossing_rate": float(zero_crossing_rate),
+         "avg_words_per_minute": float(avg_words_per_minute),
+         "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
+         "unique_word_count": int(unique_word_count),
+         "filler_words_per_minute": float(filler_words_per_minute),
+         "noun_count": len(nouns),
+         "adjective_count": len(adjectives),
+         "verb_count": len(verbs),
+         "pause_rate": float(pause_rate),
+     }
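
For context (not part of this commit): a minimal sketch of how extract_audio_features is wired to the Whisper model, processor and sentiment pipeline that functions/models.py builds; "sample.wav" is a placeholder path.

from functions.models import models_dict
from functions.audio import extract_audio_features

# Any audio file librosa can decode works; it is resampled to 16 kHz inside the function.
features = extract_audio_features(
    "sample.wav",
    models_dict["asrmodel"],    # Whisper-small seq2seq model
    models_dict["asrproc"],     # matching AutoProcessor
    models_dict["sentipipe"],   # cardiffnlp sentiment pipeline
)
print(features["transcript"])
print(features["avg_words_per_minute"], features["pause_rate"])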
functions/fer.py ADDED
@@ -0,0 +1,137 @@
+ import cv2
+ import torch
+ import torchvision.transforms as transforms
+ from PIL import Image
+ import numpy as np
+ import timm
+ from tqdm import tqdm
+ import torch.nn as nn
+ import os
+ import matplotlib.pyplot as plt
+ import matplotlib
+ matplotlib.use('Agg')
+
+ import torch.nn.functional as F
+ import pandas as pd
+
+ class Model:
+     def __init__(self, fps, fer_model):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.transform = transforms.Compose([transforms.Resize((224, 224)),
+                                              transforms.ToTensor(),
+                                              transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
+                                             )
+         self.fermodel = timm.create_model("tf_efficientnet_b0_ns", pretrained=False)
+         self.fermodel.classifier = torch.nn.Identity()
+         self.fermodel.classifier = nn.Sequential(
+             nn.Linear(in_features=1280, out_features=7)
+         )
+         self.fermodel = torch.load(
+             fer_model,
+             map_location=self.device)
+         self.fermodel.to(self.device)
+
+         self.class_labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprised"]
+         self.emotion_reorder = {
+             0: 6,
+             1: 5,
+             2: 4,
+             3: 1,
+             4: 0,
+             5: 2,
+             6: 3,
+         }
+         self.label_dict = {
+             0: "angry",
+             1: "disgust",
+             2: "fear",
+             3: "happy",
+             4: "neutral",
+             5: "sad",
+             6: "surprised",
+         }
+         self.class_wise_frame_count = None
+         self.emotion_count = [0] * 7
+         self.frame_count = 0
+         self.fps = fps
+         self.df = None
+         self.faces_ = 0
+     def predict(self, frames):
+         emotion_list = []
+         emt = []
+         for frame in tqdm(frames):
+             if frame is not None:
+                 frame = np.copy(frame)
+                 face_pil = Image.fromarray(
+                     cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                 )
+                 face_tensor = self.transform(face_pil).unsqueeze(0).to(self.device)
+                 with torch.no_grad():
+                     output = self.fermodel(face_tensor)
+                     _, predicted = torch.max(output, 1)
+                 emotion = self.emotion_reorder[predicted.item()]
+                 if isinstance(emotion, np.ndarray):
+                     emotion = (
+                         emotion.astype(float).item()
+                         if emotion.size == 1
+                         else emotion.tolist()
+                     )
+                 emotion = torch.tensor(
+                     [emotion], dtype=torch.float32
+                 )  # Ensures it's a tensor
+                 emotion = emotion.to(self.device)
+                 emt.append(emotion)
+                 self.emotion_count[predicted.item()] += 1
+                 label = f"{self.label_dict[predicted.item()]}"
+                 emotion_list.append(label)
+             else:
+                 emt.append('frame error')
+                 emotion_list.append('frame error')
+         return emotion_list, emt
+
+     def get_data(self, emotion_list, emt):
+         self.class_wise_frame_count = dict(zip(self.class_labels, self.emotion_count))
+         return emotion_list, self.class_wise_frame_count, emt
+
+ def fer_predict(video_frames, fps, model):
+     emotion_list, emt = model.predict(video_frames)
+     return model.get_data(emotion_list, emt)
+
+ def filter(list1, list2):
+     filtered_list1 = [x for i, x in enumerate(list1) if list2[i] != 'fnf']
+     filtered_list2 = [x for x in list2 if x != 'fnf']
+     return [filtered_list1, filtered_list2]
+
+ def plot_graph(x, y_vals, labels, path, calib_vals=None):
+     """
+     Plots multiple subplots (one for each variable) in one figure.
+
+     Parameters:
+     - x: List of timestamps or frame numbers.
+     - y_vals: List of y-values for valence, arousal, and stress (or other metrics).
+     - labels: List of variable names corresponding to y_vals (e.g., ['valence', 'arousal', 'stress']).
+     - path: Path to save the combined plot.
+     - calib_vals: List of calibration values for each variable (optional).
+     """
+     plt.figure(figsize=(12, 8))  # Create a figure of appropriate size
+
+     # Iterate over y-values, labels, and calibration values to create subplots
+     for i, (y, label) in enumerate(zip(y_vals, labels)):
+         y = [value if isinstance(value, (int, float)) else np.nan for value in y]
+
+         # Create a subplot (one row per variable, current subplot index)
+         plt.subplot(len(y_vals), 1, i + 1)
+         plt.plot(range(len(x)), y, linestyle='-')
+
+         # Plot calibration line if provided
+         if calib_vals and calib_vals[i] is not None:
+             plt.axhline(y=calib_vals[i], color='r', linestyle='--', label=f'{label} calibration = {calib_vals[i]}')
+
+         plt.xlabel('Frame')
+         plt.ylabel(label)
+         plt.title(f'{label} By Frames')
+         plt.legend()
+
+     plt.tight_layout()  # Adjust layout to prevent overlap
+     plt.savefig(path)
+     plt.clf()  # Clear the figure after saving
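
For context (not part of this commit): a sketch of per-frame emotion prediction, assuming `faces` is a list of BGR face crops (or None for frames where detection failed), such as the output of functions/helper.py.

from functions.models import models_dict
from functions.fer import fer_predict

# labels: per-frame emotion names; counts: frames per class; tensors: remapped indices fed to the VA model
labels, class_wise_counts, emotion_tensors = fer_predict(faces, 30, models_dict["fer"])
print(class_wise_counts)   # e.g. {'angry': 3, 'happy': 42, ...}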
functions/helper.py ADDED
@@ -0,0 +1,58 @@
+ import cv2
+ import numpy as np
+ import dlib
+ from tqdm import tqdm
+
+
+ def extract_face(image, net, predictor):
+     (h, w) = image.shape[:2]
+     blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
+     net.setInput(blob)
+     detections = net.forward()
+     for i in range(0, detections.shape[2]):
+         confidence = detections[0, 0, i, 2]
+
+         # Filter out weak detections
+         if confidence > 0.5:
+             box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
+             (startX, startY, endX, endY) = box.astype("int")
+
+             # Convert bounding box to dlib rectangle format
+             dlib_rect = dlib.rectangle(int(startX), int(startY), int(endX), int(endY))
+             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+             landmarks = predictor(gray, dlib_rect)
+             landmarks_np = np.array([[p.x, p.y] for p in landmarks.parts()])
+             x, y, w, h = cv2.boundingRect(landmarks_np)
+             x -= 25
+             y -= 25
+             w += 50
+             h += 50
+
+             x = max(0, x)
+             y = max(0, y)
+             w = min(w, image.shape[1] - x)
+             h = min(h, image.shape[0] - y)
+             face_crop = image[y:y + h, x:x + w]
+             # Crop and resize the face
+             try:
+                 face_crop = cv2.resize(face_crop, (224, 224))
+             except:
+                 face_crop = cv2.resize(image, (224, 224))
+             return face_crop, landmarks_np, (w, h)
+     return None, None, None
+
+ def extract_faces_from_frames(frames, net, predictor):
+     faces_list = []
+     landmarks_list = []
+     sizes_list = []
+
+     for image in tqdm(frames):
+         face_crop, landmarks_np, size = extract_face(image, net, predictor)
+
+         # Append the results to the respective lists
+         faces_list.append(face_crop)
+         landmarks_list.append(landmarks_np)
+         sizes_list.append(size)
+
+     return faces_list, landmarks_list, sizes_list
+
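
For context (not part of this commit): a sketch of the face-extraction step, assuming frames are BGR arrays read with OpenCV and the detector/predictor pair comes from functions/models.py; "interview.mp4" is a placeholder.

import cv2
from functions.models import models_dict
from functions.helper import extract_faces_from_frames

cap = cv2.VideoCapture("interview.mp4")   # placeholder video path
frames = []
while True:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(frame)
cap.release()

dnn_net, predictor = models_dict["face"]
faces, landmarks, sizes = extract_faces_from_frames(frames, dnn_net, predictor)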
functions/models.py ADDED
@@ -0,0 +1,56 @@
+ import nltk
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import os
+ from functions.fer import Model
+ import cv2
+ import dlib
+ from functions.valence_arousal import load_models
+ # Download necessary NLTK packages
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+
+
+ # Device setup
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ models_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models')
+ fer_model_path = os.path.join(models_folder, '22.6_AffectNet_10K_part2.pt')
+ val_ar_feat_path = os.path.join(models_folder, 'resnet_features.pt')
+ valence_arousal_model = os.path.join(models_folder, 'emotion_model.pt')
+
+
+ # Load Whisper model and processor
+ model_id = "openai/whisper-small"
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model.to(device)
+ processor = AutoProcessor.from_pretrained(model_id)
+ sentipipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
+
+
+
+
+ fer_model = Model(fps=30, fer_model=fer_model_path)
+ resnet, emotion_model = load_models(valence_arousal_model, val_ar_feat_path)
+
+ smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_smile.xml')
+
+
+ dnn_net = cv2.dnn.readNetFromCaffe("models/deploy.prototxt", "models/res10_300x300_ssd_iter_140000.caffemodel")
+
+ predictor = dlib.shape_predictor("models/shape_predictor_68_face_landmarks.dat")
+
+
+
+
+ models_dict = {
+     'asrmodel': model,
+     'asrproc': processor,
+     'sentipipe': sentipipe,
+     'fer': fer_model,
+     "valence_fer": (resnet, emotion_model),
+     'smile_cascade': smile_cascade,
+     'face': (dnn_net, predictor)
+ }
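
Note that everything in this module runs at import time: the first `import functions.models` downloads the NLTK data, loads Whisper and the sentiment pipeline, and deserializes the face, FER and valence-arousal weights, with the "models/..." paths resolved relative to the working directory. A sketch (not part of this commit) of how a caller unpacks the registry:

from functions.models import models_dict

asr_model = models_dict["asrmodel"]
asr_processor = models_dict["asrproc"]
sentiment_pipe = models_dict["sentipipe"]
fer = models_dict["fer"]                               # functions.fer.Model instance
resnet_backbone, va_head = models_dict["valence_fer"]
smile_cascade = models_dict["smile_cascade"]
face_net, landmark_predictor = models_dict["face"]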
functions/valence_arousal.py ADDED
@@ -0,0 +1,93 @@
+ from torchvision import models
+ import torch.nn as nn
+ import torch
+ import torch.nn.functional as F
+ import torchvision.transforms as transforms
+ from PIL import Image
+ import cv2
+
+ def create_emotion_model(num_ftrs, num_emotions):
+     return nn.Sequential(
+         nn.Linear(num_ftrs + num_emotions, 128),
+         nn.ReLU(),
+         nn.Linear(128, 64),
+         nn.ReLU(),
+         nn.Linear(64, 2),
+     )
+ def load_models(val_model_path, val_featmodel_path):
+     transform = transforms.Compose(
+         [
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ]
+     )
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     resnet = models.resnet18(pretrained=False)
+     num_ftrs = resnet.fc.in_features
+     resnet.fc = nn.Identity()
+     resnet.load_state_dict(
+         torch.load(
+             val_featmodel_path,
+             map_location=device
+         )
+     )
+     resnet = resnet.to(device)
+
+     # num_ftrs = resnet.fc.in_features
+     num_emotions = 1
+     emotion_model = create_emotion_model(num_ftrs, num_emotions).to(device)
+     emotion_model.load_state_dict(
+         torch.load(
+             val_model_path,
+             map_location=device
+         )
+     )
+     return resnet, emotion_model
+
+
+
+ def va_predict(emotion_model, resnet, faces, emotions):
+     transform = transforms.Compose(
+         [
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ]
+     )
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     def model_forward(images, emotions):
+         resnet_features = resnet(images)
+         batch_size = resnet_features.size(0)
+         emotions = emotions.view(batch_size, -1)
+         x = torch.cat((resnet_features, emotions), dim=1)
+         output = emotion_model(x)
+         return output
+
+     arousal_list = []
+     valence_list = []
+     stress_list = []
+     from tqdm import tqdm
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     for face, emotion in tqdm(zip(faces, emotions)):
+         if face is not None:
+             face_pil = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
+             face_tensor = transform(face_pil).unsqueeze(0).to(device)
+             emotion = emotion.to(device)
+             output_va = model_forward(face_tensor, emotion)
+             arousal = output_va[0][0].item()
+             norm_arousal = float(output_va[0][0].item()) / 2 + 0.5
+             valence = output_va[0][1].item()
+             norm_valence = float(output_va[0][1].item()) / 2 + 0.5
+             stress = (1 - norm_valence) * norm_arousal
+             arousal_list.append(arousal)
+             valence_list.append(valence)
+             stress_list.append(stress)
+         else:
+             arousal_list.append('frame error')
+             valence_list.append('frame error')
+             stress_list.append('frame error')
+     return valence_list, arousal_list, stress_list
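
For context (not part of this commit): a sketch chaining FER and valence/arousal prediction, assuming `faces` comes from functions/helper.py. The emotion tensors returned by fer_predict are the `emotions` argument expected here; frames without a face carry the string 'frame error' in every output list.

from functions.models import models_dict
from functions.fer import fer_predict
from functions.valence_arousal import va_predict

resnet, emotion_model = models_dict["valence_fer"]
labels, counts, emotion_tensors = fer_predict(faces, 30, models_dict["fer"])
valence, arousal, stress = va_predict(emotion_model, resnet, faces, emotion_tensors)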
functions/video.py ADDED
@@ -0,0 +1,114 @@
+ import numpy as np
+ from scipy.spatial import distance as dist
+ from imutils import face_utils
+ (lStart, lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
+ (rStart, rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]
+
+ def euclidean_distance(point1, point2):
+     return np.linalg.norm(point1 - point2)
+
+ def eyebrow(landmarks, sizes):
+     eyebrow_dist = []
+     for landmark, size in zip(landmarks, sizes):
+         if landmark is not None:
+             right_eyebrow_inner = landmark[21]
+             left_eyebrow_inner = landmark[22]
+             eyebrow_distance = euclidean_distance(right_eyebrow_inner, left_eyebrow_inner)
+             normalized_eyebrow_distance = eyebrow_distance / size[0]
+
+         else:
+             normalized_eyebrow_distance = None
+         eyebrow_dist.append(normalized_eyebrow_distance)
+     return eyebrow_dist
+
+ def eye_aspect_ratio(eye):
+     A = dist.euclidean(eye[1], eye[5])  # Vertical distance 1
+     B = dist.euclidean(eye[2], eye[4])  # Vertical distance 2
+     C = dist.euclidean(eye[0], eye[3])  # Horizontal distance
+     ear = (A + B) / (2.0 * C)  # EAR formula
+     return ear
+
+ def blinks(landmarks, sizes, fps):
+     blink_durations = []
+     blink_counter = 0
+     total_blinks = 0
+     EYE_AR_THRESH = 0.24  # EAR threshold for blink detection
+     EYE_AR_CONSEC_FRAMES = 4  # Consecutive frames for blink detection
+
+     frame_count = 0  # Initialize frame counter
+
+     for landmark, size in zip(landmarks, sizes):
+         if landmark is not None:
+             leftEye = landmark[lStart:lEnd]
+             rightEye = landmark[rStart:rEnd]
+
+             leftEAR = eye_aspect_ratio(leftEye)
+             rightEAR = eye_aspect_ratio(rightEye)
+
+             if leftEAR < EYE_AR_THRESH and rightEAR < EYE_AR_THRESH:
+                 if blink_counter == 0:
+                     blink_start_frame = frame_count  # Start tracking blink in frames
+                 blink_counter += 1
+             else:
+                 if blink_counter >= EYE_AR_CONSEC_FRAMES:
+                     blink_end_frame = frame_count
+                     blink_duration_frames = blink_end_frame - blink_start_frame
+                     blink_duration_seconds = blink_duration_frames / fps  # Convert frames to seconds
+                     blink_durations.append(blink_duration_seconds)  # Store blink duration in seconds
+                     total_blinks += 1
+                 blink_counter = 0
+
+         frame_count += 1  # Increment the frame counter for each loop iteration
+
+     return blink_durations, total_blinks
+
+
+ def detect_smiles(faces, smile_cascade):
+     smiles = []
+     count = 0
+     for face in faces:
+         if face is not None:
+             smile = smile_cascade.detectMultiScale(face, scaleFactor=1.8, minNeighbors=20, minSize=(25, 25))
+             if len(smile) > 0:
+                 smiles.append(True)
+                 count += 1
+             else:
+                 smiles.append(False)
+         else:
+             smiles.append(None)
+     return smiles, count
+
+ def cal_yawn(landmarks):
+     # Corrected lip landmark indices for dlib's 68-point model
+     top_lip_idx = [50, 51, 52, 53, 61, 62, 63]
+     low_lip_idx = [56, 57, 58, 59, 65, 66, 67]
+
+     top_lip = np.array([landmarks[idx] for idx in top_lip_idx])
+     low_lip = np.array([landmarks[idx] for idx in low_lip_idx])
+
+     top_mean = np.mean(top_lip, axis=0)
+     low_mean = np.mean(low_lip, axis=0)
+
+     distance = dist.euclidean(top_mean, low_mean)
+     return distance
+
+ def detect_yawn(landmarks, sizes):
+     yawn = []
+     count = 0
+     normalized_yawn_thresh = 0.25
+     normalized_lip_distances = []
+     for landmark, size in zip(landmarks, sizes):
+         if landmark is not None:
+             lip_dist = cal_yawn(landmark)
+             face_size = dist.euclidean(landmark[8], landmark[27])
+             normalized_lip_dist = lip_dist / face_size
+             normalized_lip_distances.append(normalized_lip_dist)
+             if normalized_lip_dist > normalized_yawn_thresh:
+                 yawn.append(True)
+                 count += 1
+             else:
+                 yawn.append(False)
+         else:
+             normalized_lip_distances.append(None)
+             yawn.append(None)
+     return yawn, normalized_lip_distances, count
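
For context (not part of this commit): a sketch of the per-frame facial-behaviour metrics, assuming `faces`, `landmarks` and `sizes` come from functions/helper.py and the video runs at 30 fps.

from functions.models import models_dict
from functions.video import eyebrow, blinks, detect_smiles, detect_yawn

fps = 30
eyebrow_dist = eyebrow(landmarks, sizes)                        # normalized inner-eyebrow distance per frame
blink_durations, total_blinks = blinks(landmarks, sizes, fps)   # blink lengths in seconds and blink count
smiles, smile_count = detect_smiles(faces, models_dict["smile_cascade"])
yawns, lip_distances, yawn_count = detect_yawn(landmarks, sizes)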
models/22.6_AffectNet_10K_part2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92ef53adb843700faa3c54ae6f3e0f4105e04e099f9190dd66aafc360afdb2bf
+ size 16425358
models/deploy.prototxt ADDED
@@ -0,0 +1,1789 @@
1
+ input: "data"
2
+ input_shape {
3
+ dim: 1
4
+ dim: 3
5
+ dim: 300
6
+ dim: 300
7
+ }
8
+
9
+ layer {
10
+ name: "data_bn"
11
+ type: "BatchNorm"
12
+ bottom: "data"
13
+ top: "data_bn"
14
+ param {
15
+ lr_mult: 0.0
16
+ }
17
+ param {
18
+ lr_mult: 0.0
19
+ }
20
+ param {
21
+ lr_mult: 0.0
22
+ }
23
+ }
24
+ layer {
25
+ name: "data_scale"
26
+ type: "Scale"
27
+ bottom: "data_bn"
28
+ top: "data_bn"
29
+ param {
30
+ lr_mult: 1.0
31
+ decay_mult: 1.0
32
+ }
33
+ param {
34
+ lr_mult: 2.0
35
+ decay_mult: 1.0
36
+ }
37
+ scale_param {
38
+ bias_term: true
39
+ }
40
+ }
41
+ layer {
42
+ name: "conv1_h"
43
+ type: "Convolution"
44
+ bottom: "data_bn"
45
+ top: "conv1_h"
46
+ param {
47
+ lr_mult: 1.0
48
+ decay_mult: 1.0
49
+ }
50
+ param {
51
+ lr_mult: 2.0
52
+ decay_mult: 1.0
53
+ }
54
+ convolution_param {
55
+ num_output: 32
56
+ pad: 3
57
+ kernel_size: 7
58
+ stride: 2
59
+ weight_filler {
60
+ type: "msra"
61
+ variance_norm: FAN_OUT
62
+ }
63
+ bias_filler {
64
+ type: "constant"
65
+ value: 0.0
66
+ }
67
+ }
68
+ }
69
+ layer {
70
+ name: "conv1_bn_h"
71
+ type: "BatchNorm"
72
+ bottom: "conv1_h"
73
+ top: "conv1_h"
74
+ param {
75
+ lr_mult: 0.0
76
+ }
77
+ param {
78
+ lr_mult: 0.0
79
+ }
80
+ param {
81
+ lr_mult: 0.0
82
+ }
83
+ }
84
+ layer {
85
+ name: "conv1_scale_h"
86
+ type: "Scale"
87
+ bottom: "conv1_h"
88
+ top: "conv1_h"
89
+ param {
90
+ lr_mult: 1.0
91
+ decay_mult: 1.0
92
+ }
93
+ param {
94
+ lr_mult: 2.0
95
+ decay_mult: 1.0
96
+ }
97
+ scale_param {
98
+ bias_term: true
99
+ }
100
+ }
101
+ layer {
102
+ name: "conv1_relu"
103
+ type: "ReLU"
104
+ bottom: "conv1_h"
105
+ top: "conv1_h"
106
+ }
107
+ layer {
108
+ name: "conv1_pool"
109
+ type: "Pooling"
110
+ bottom: "conv1_h"
111
+ top: "conv1_pool"
112
+ pooling_param {
113
+ kernel_size: 3
114
+ stride: 2
115
+ }
116
+ }
117
+ layer {
118
+ name: "layer_64_1_conv1_h"
119
+ type: "Convolution"
120
+ bottom: "conv1_pool"
121
+ top: "layer_64_1_conv1_h"
122
+ param {
123
+ lr_mult: 1.0
124
+ decay_mult: 1.0
125
+ }
126
+ convolution_param {
127
+ num_output: 32
128
+ bias_term: false
129
+ pad: 1
130
+ kernel_size: 3
131
+ stride: 1
132
+ weight_filler {
133
+ type: "msra"
134
+ }
135
+ bias_filler {
136
+ type: "constant"
137
+ value: 0.0
138
+ }
139
+ }
140
+ }
141
+ layer {
142
+ name: "layer_64_1_bn2_h"
143
+ type: "BatchNorm"
144
+ bottom: "layer_64_1_conv1_h"
145
+ top: "layer_64_1_conv1_h"
146
+ param {
147
+ lr_mult: 0.0
148
+ }
149
+ param {
150
+ lr_mult: 0.0
151
+ }
152
+ param {
153
+ lr_mult: 0.0
154
+ }
155
+ }
156
+ layer {
157
+ name: "layer_64_1_scale2_h"
158
+ type: "Scale"
159
+ bottom: "layer_64_1_conv1_h"
160
+ top: "layer_64_1_conv1_h"
161
+ param {
162
+ lr_mult: 1.0
163
+ decay_mult: 1.0
164
+ }
165
+ param {
166
+ lr_mult: 2.0
167
+ decay_mult: 1.0
168
+ }
169
+ scale_param {
170
+ bias_term: true
171
+ }
172
+ }
173
+ layer {
174
+ name: "layer_64_1_relu2"
175
+ type: "ReLU"
176
+ bottom: "layer_64_1_conv1_h"
177
+ top: "layer_64_1_conv1_h"
178
+ }
179
+ layer {
180
+ name: "layer_64_1_conv2_h"
181
+ type: "Convolution"
182
+ bottom: "layer_64_1_conv1_h"
183
+ top: "layer_64_1_conv2_h"
184
+ param {
185
+ lr_mult: 1.0
186
+ decay_mult: 1.0
187
+ }
188
+ convolution_param {
189
+ num_output: 32
190
+ bias_term: false
191
+ pad: 1
192
+ kernel_size: 3
193
+ stride: 1
194
+ weight_filler {
195
+ type: "msra"
196
+ }
197
+ bias_filler {
198
+ type: "constant"
199
+ value: 0.0
200
+ }
201
+ }
202
+ }
203
+ layer {
204
+ name: "layer_64_1_sum"
205
+ type: "Eltwise"
206
+ bottom: "layer_64_1_conv2_h"
207
+ bottom: "conv1_pool"
208
+ top: "layer_64_1_sum"
209
+ }
210
+ layer {
211
+ name: "layer_128_1_bn1_h"
212
+ type: "BatchNorm"
213
+ bottom: "layer_64_1_sum"
214
+ top: "layer_128_1_bn1_h"
215
+ param {
216
+ lr_mult: 0.0
217
+ }
218
+ param {
219
+ lr_mult: 0.0
220
+ }
221
+ param {
222
+ lr_mult: 0.0
223
+ }
224
+ }
225
+ layer {
226
+ name: "layer_128_1_scale1_h"
227
+ type: "Scale"
228
+ bottom: "layer_128_1_bn1_h"
229
+ top: "layer_128_1_bn1_h"
230
+ param {
231
+ lr_mult: 1.0
232
+ decay_mult: 1.0
233
+ }
234
+ param {
235
+ lr_mult: 2.0
236
+ decay_mult: 1.0
237
+ }
238
+ scale_param {
239
+ bias_term: true
240
+ }
241
+ }
242
+ layer {
243
+ name: "layer_128_1_relu1"
244
+ type: "ReLU"
245
+ bottom: "layer_128_1_bn1_h"
246
+ top: "layer_128_1_bn1_h"
247
+ }
248
+ layer {
249
+ name: "layer_128_1_conv1_h"
250
+ type: "Convolution"
251
+ bottom: "layer_128_1_bn1_h"
252
+ top: "layer_128_1_conv1_h"
253
+ param {
254
+ lr_mult: 1.0
255
+ decay_mult: 1.0
256
+ }
257
+ convolution_param {
258
+ num_output: 128
259
+ bias_term: false
260
+ pad: 1
261
+ kernel_size: 3
262
+ stride: 2
263
+ weight_filler {
264
+ type: "msra"
265
+ }
266
+ bias_filler {
267
+ type: "constant"
268
+ value: 0.0
269
+ }
270
+ }
271
+ }
272
+ layer {
273
+ name: "layer_128_1_bn2"
274
+ type: "BatchNorm"
275
+ bottom: "layer_128_1_conv1_h"
276
+ top: "layer_128_1_conv1_h"
277
+ param {
278
+ lr_mult: 0.0
279
+ }
280
+ param {
281
+ lr_mult: 0.0
282
+ }
283
+ param {
284
+ lr_mult: 0.0
285
+ }
286
+ }
287
+ layer {
288
+ name: "layer_128_1_scale2"
289
+ type: "Scale"
290
+ bottom: "layer_128_1_conv1_h"
291
+ top: "layer_128_1_conv1_h"
292
+ param {
293
+ lr_mult: 1.0
294
+ decay_mult: 1.0
295
+ }
296
+ param {
297
+ lr_mult: 2.0
298
+ decay_mult: 1.0
299
+ }
300
+ scale_param {
301
+ bias_term: true
302
+ }
303
+ }
304
+ layer {
305
+ name: "layer_128_1_relu2"
306
+ type: "ReLU"
307
+ bottom: "layer_128_1_conv1_h"
308
+ top: "layer_128_1_conv1_h"
309
+ }
310
+ layer {
311
+ name: "layer_128_1_conv2"
312
+ type: "Convolution"
313
+ bottom: "layer_128_1_conv1_h"
314
+ top: "layer_128_1_conv2"
315
+ param {
316
+ lr_mult: 1.0
317
+ decay_mult: 1.0
318
+ }
319
+ convolution_param {
320
+ num_output: 128
321
+ bias_term: false
322
+ pad: 1
323
+ kernel_size: 3
324
+ stride: 1
325
+ weight_filler {
326
+ type: "msra"
327
+ }
328
+ bias_filler {
329
+ type: "constant"
330
+ value: 0.0
331
+ }
332
+ }
333
+ }
334
+ layer {
335
+ name: "layer_128_1_conv_expand_h"
336
+ type: "Convolution"
337
+ bottom: "layer_128_1_bn1_h"
338
+ top: "layer_128_1_conv_expand_h"
339
+ param {
340
+ lr_mult: 1.0
341
+ decay_mult: 1.0
342
+ }
343
+ convolution_param {
344
+ num_output: 128
345
+ bias_term: false
346
+ pad: 0
347
+ kernel_size: 1
348
+ stride: 2
349
+ weight_filler {
350
+ type: "msra"
351
+ }
352
+ bias_filler {
353
+ type: "constant"
354
+ value: 0.0
355
+ }
356
+ }
357
+ }
358
+ layer {
359
+ name: "layer_128_1_sum"
360
+ type: "Eltwise"
361
+ bottom: "layer_128_1_conv2"
362
+ bottom: "layer_128_1_conv_expand_h"
363
+ top: "layer_128_1_sum"
364
+ }
365
+ layer {
366
+ name: "layer_256_1_bn1"
367
+ type: "BatchNorm"
368
+ bottom: "layer_128_1_sum"
369
+ top: "layer_256_1_bn1"
370
+ param {
371
+ lr_mult: 0.0
372
+ }
373
+ param {
374
+ lr_mult: 0.0
375
+ }
376
+ param {
377
+ lr_mult: 0.0
378
+ }
379
+ }
380
+ layer {
381
+ name: "layer_256_1_scale1"
382
+ type: "Scale"
383
+ bottom: "layer_256_1_bn1"
384
+ top: "layer_256_1_bn1"
385
+ param {
386
+ lr_mult: 1.0
387
+ decay_mult: 1.0
388
+ }
389
+ param {
390
+ lr_mult: 2.0
391
+ decay_mult: 1.0
392
+ }
393
+ scale_param {
394
+ bias_term: true
395
+ }
396
+ }
397
+ layer {
398
+ name: "layer_256_1_relu1"
399
+ type: "ReLU"
400
+ bottom: "layer_256_1_bn1"
401
+ top: "layer_256_1_bn1"
402
+ }
403
+ layer {
404
+ name: "layer_256_1_conv1"
405
+ type: "Convolution"
406
+ bottom: "layer_256_1_bn1"
407
+ top: "layer_256_1_conv1"
408
+ param {
409
+ lr_mult: 1.0
410
+ decay_mult: 1.0
411
+ }
412
+ convolution_param {
413
+ num_output: 256
414
+ bias_term: false
415
+ pad: 1
416
+ kernel_size: 3
417
+ stride: 2
418
+ weight_filler {
419
+ type: "msra"
420
+ }
421
+ bias_filler {
422
+ type: "constant"
423
+ value: 0.0
424
+ }
425
+ }
426
+ }
427
+ layer {
428
+ name: "layer_256_1_bn2"
429
+ type: "BatchNorm"
430
+ bottom: "layer_256_1_conv1"
431
+ top: "layer_256_1_conv1"
432
+ param {
433
+ lr_mult: 0.0
434
+ }
435
+ param {
436
+ lr_mult: 0.0
437
+ }
438
+ param {
439
+ lr_mult: 0.0
440
+ }
441
+ }
442
+ layer {
443
+ name: "layer_256_1_scale2"
444
+ type: "Scale"
445
+ bottom: "layer_256_1_conv1"
446
+ top: "layer_256_1_conv1"
447
+ param {
448
+ lr_mult: 1.0
449
+ decay_mult: 1.0
450
+ }
451
+ param {
452
+ lr_mult: 2.0
453
+ decay_mult: 1.0
454
+ }
455
+ scale_param {
456
+ bias_term: true
457
+ }
458
+ }
459
+ layer {
460
+ name: "layer_256_1_relu2"
461
+ type: "ReLU"
462
+ bottom: "layer_256_1_conv1"
463
+ top: "layer_256_1_conv1"
464
+ }
465
+ layer {
466
+ name: "layer_256_1_conv2"
467
+ type: "Convolution"
468
+ bottom: "layer_256_1_conv1"
469
+ top: "layer_256_1_conv2"
470
+ param {
471
+ lr_mult: 1.0
472
+ decay_mult: 1.0
473
+ }
474
+ convolution_param {
475
+ num_output: 256
476
+ bias_term: false
477
+ pad: 1
478
+ kernel_size: 3
479
+ stride: 1
480
+ weight_filler {
481
+ type: "msra"
482
+ }
483
+ bias_filler {
484
+ type: "constant"
485
+ value: 0.0
486
+ }
487
+ }
488
+ }
489
+ layer {
490
+ name: "layer_256_1_conv_expand"
491
+ type: "Convolution"
492
+ bottom: "layer_256_1_bn1"
493
+ top: "layer_256_1_conv_expand"
494
+ param {
495
+ lr_mult: 1.0
496
+ decay_mult: 1.0
497
+ }
498
+ convolution_param {
499
+ num_output: 256
500
+ bias_term: false
501
+ pad: 0
502
+ kernel_size: 1
503
+ stride: 2
504
+ weight_filler {
505
+ type: "msra"
506
+ }
507
+ bias_filler {
508
+ type: "constant"
509
+ value: 0.0
510
+ }
511
+ }
512
+ }
513
+ layer {
514
+ name: "layer_256_1_sum"
515
+ type: "Eltwise"
516
+ bottom: "layer_256_1_conv2"
517
+ bottom: "layer_256_1_conv_expand"
518
+ top: "layer_256_1_sum"
519
+ }
520
+ layer {
521
+ name: "layer_512_1_bn1"
522
+ type: "BatchNorm"
523
+ bottom: "layer_256_1_sum"
524
+ top: "layer_512_1_bn1"
525
+ param {
526
+ lr_mult: 0.0
527
+ }
528
+ param {
529
+ lr_mult: 0.0
530
+ }
531
+ param {
532
+ lr_mult: 0.0
533
+ }
534
+ }
535
+ layer {
536
+ name: "layer_512_1_scale1"
537
+ type: "Scale"
538
+ bottom: "layer_512_1_bn1"
539
+ top: "layer_512_1_bn1"
540
+ param {
541
+ lr_mult: 1.0
542
+ decay_mult: 1.0
543
+ }
544
+ param {
545
+ lr_mult: 2.0
546
+ decay_mult: 1.0
547
+ }
548
+ scale_param {
549
+ bias_term: true
550
+ }
551
+ }
552
+ layer {
553
+ name: "layer_512_1_relu1"
554
+ type: "ReLU"
555
+ bottom: "layer_512_1_bn1"
556
+ top: "layer_512_1_bn1"
557
+ }
558
+ layer {
559
+ name: "layer_512_1_conv1_h"
560
+ type: "Convolution"
561
+ bottom: "layer_512_1_bn1"
562
+ top: "layer_512_1_conv1_h"
563
+ param {
564
+ lr_mult: 1.0
565
+ decay_mult: 1.0
566
+ }
567
+ convolution_param {
568
+ num_output: 128
569
+ bias_term: false
570
+ pad: 1
571
+ kernel_size: 3
572
+ stride: 1 # 2
573
+ weight_filler {
574
+ type: "msra"
575
+ }
576
+ bias_filler {
577
+ type: "constant"
578
+ value: 0.0
579
+ }
580
+ }
581
+ }
582
+ layer {
583
+ name: "layer_512_1_bn2_h"
584
+ type: "BatchNorm"
585
+ bottom: "layer_512_1_conv1_h"
586
+ top: "layer_512_1_conv1_h"
587
+ param {
588
+ lr_mult: 0.0
589
+ }
590
+ param {
591
+ lr_mult: 0.0
592
+ }
593
+ param {
594
+ lr_mult: 0.0
595
+ }
596
+ }
597
+ layer {
598
+ name: "layer_512_1_scale2_h"
599
+ type: "Scale"
600
+ bottom: "layer_512_1_conv1_h"
601
+ top: "layer_512_1_conv1_h"
602
+ param {
603
+ lr_mult: 1.0
604
+ decay_mult: 1.0
605
+ }
606
+ param {
607
+ lr_mult: 2.0
608
+ decay_mult: 1.0
609
+ }
610
+ scale_param {
611
+ bias_term: true
612
+ }
613
+ }
614
+ layer {
615
+ name: "layer_512_1_relu2"
616
+ type: "ReLU"
617
+ bottom: "layer_512_1_conv1_h"
618
+ top: "layer_512_1_conv1_h"
619
+ }
620
+ layer {
621
+ name: "layer_512_1_conv2_h"
622
+ type: "Convolution"
623
+ bottom: "layer_512_1_conv1_h"
624
+ top: "layer_512_1_conv2_h"
625
+ param {
626
+ lr_mult: 1.0
627
+ decay_mult: 1.0
628
+ }
629
+ convolution_param {
630
+ num_output: 256
631
+ bias_term: false
632
+ pad: 2 # 1
633
+ kernel_size: 3
634
+ stride: 1
635
+ dilation: 2
636
+ weight_filler {
637
+ type: "msra"
638
+ }
639
+ bias_filler {
640
+ type: "constant"
641
+ value: 0.0
642
+ }
643
+ }
644
+ }
645
+ layer {
646
+ name: "layer_512_1_conv_expand_h"
647
+ type: "Convolution"
648
+ bottom: "layer_512_1_bn1"
649
+ top: "layer_512_1_conv_expand_h"
650
+ param {
651
+ lr_mult: 1.0
652
+ decay_mult: 1.0
653
+ }
654
+ convolution_param {
655
+ num_output: 256
656
+ bias_term: false
657
+ pad: 0
658
+ kernel_size: 1
659
+ stride: 1 # 2
660
+ weight_filler {
661
+ type: "msra"
662
+ }
663
+ bias_filler {
664
+ type: "constant"
665
+ value: 0.0
666
+ }
667
+ }
668
+ }
669
+ layer {
670
+ name: "layer_512_1_sum"
671
+ type: "Eltwise"
672
+ bottom: "layer_512_1_conv2_h"
673
+ bottom: "layer_512_1_conv_expand_h"
674
+ top: "layer_512_1_sum"
675
+ }
676
+ layer {
677
+ name: "last_bn_h"
678
+ type: "BatchNorm"
679
+ bottom: "layer_512_1_sum"
680
+ top: "layer_512_1_sum"
681
+ param {
682
+ lr_mult: 0.0
683
+ }
684
+ param {
685
+ lr_mult: 0.0
686
+ }
687
+ param {
688
+ lr_mult: 0.0
689
+ }
690
+ }
691
+ layer {
692
+ name: "last_scale_h"
693
+ type: "Scale"
694
+ bottom: "layer_512_1_sum"
695
+ top: "layer_512_1_sum"
696
+ param {
697
+ lr_mult: 1.0
698
+ decay_mult: 1.0
699
+ }
700
+ param {
701
+ lr_mult: 2.0
702
+ decay_mult: 1.0
703
+ }
704
+ scale_param {
705
+ bias_term: true
706
+ }
707
+ }
708
+ layer {
709
+ name: "last_relu"
710
+ type: "ReLU"
711
+ bottom: "layer_512_1_sum"
712
+ top: "fc7"
713
+ }
714
+
715
+ layer {
716
+ name: "conv6_1_h"
717
+ type: "Convolution"
718
+ bottom: "fc7"
719
+ top: "conv6_1_h"
720
+ param {
721
+ lr_mult: 1
722
+ decay_mult: 1
723
+ }
724
+ param {
725
+ lr_mult: 2
726
+ decay_mult: 0
727
+ }
728
+ convolution_param {
729
+ num_output: 128
730
+ pad: 0
731
+ kernel_size: 1
732
+ stride: 1
733
+ weight_filler {
734
+ type: "xavier"
735
+ }
736
+ bias_filler {
737
+ type: "constant"
738
+ value: 0
739
+ }
740
+ }
741
+ }
742
+ layer {
743
+ name: "conv6_1_relu"
744
+ type: "ReLU"
745
+ bottom: "conv6_1_h"
746
+ top: "conv6_1_h"
747
+ }
748
+ layer {
749
+ name: "conv6_2_h"
750
+ type: "Convolution"
751
+ bottom: "conv6_1_h"
752
+ top: "conv6_2_h"
753
+ param {
754
+ lr_mult: 1
755
+ decay_mult: 1
756
+ }
757
+ param {
758
+ lr_mult: 2
759
+ decay_mult: 0
760
+ }
761
+ convolution_param {
762
+ num_output: 256
763
+ pad: 1
764
+ kernel_size: 3
765
+ stride: 2
766
+ weight_filler {
767
+ type: "xavier"
768
+ }
769
+ bias_filler {
770
+ type: "constant"
771
+ value: 0
772
+ }
773
+ }
774
+ }
775
+ layer {
776
+ name: "conv6_2_relu"
777
+ type: "ReLU"
778
+ bottom: "conv6_2_h"
779
+ top: "conv6_2_h"
780
+ }
781
+ layer {
782
+ name: "conv7_1_h"
783
+ type: "Convolution"
784
+ bottom: "conv6_2_h"
785
+ top: "conv7_1_h"
786
+ param {
787
+ lr_mult: 1
788
+ decay_mult: 1
789
+ }
790
+ param {
791
+ lr_mult: 2
792
+ decay_mult: 0
793
+ }
794
+ convolution_param {
795
+ num_output: 64
796
+ pad: 0
797
+ kernel_size: 1
798
+ stride: 1
799
+ weight_filler {
800
+ type: "xavier"
801
+ }
802
+ bias_filler {
803
+ type: "constant"
804
+ value: 0
805
+ }
806
+ }
807
+ }
808
+ layer {
809
+ name: "conv7_1_relu"
810
+ type: "ReLU"
811
+ bottom: "conv7_1_h"
812
+ top: "conv7_1_h"
813
+ }
814
+ layer {
815
+ name: "conv7_2_h"
816
+ type: "Convolution"
817
+ bottom: "conv7_1_h"
818
+ top: "conv7_2_h"
819
+ param {
820
+ lr_mult: 1
821
+ decay_mult: 1
822
+ }
823
+ param {
824
+ lr_mult: 2
825
+ decay_mult: 0
826
+ }
827
+ convolution_param {
828
+ num_output: 128
829
+ pad: 1
830
+ kernel_size: 3
831
+ stride: 2
832
+ weight_filler {
833
+ type: "xavier"
834
+ }
835
+ bias_filler {
836
+ type: "constant"
837
+ value: 0
838
+ }
839
+ }
840
+ }
841
+ layer {
842
+ name: "conv7_2_relu"
843
+ type: "ReLU"
844
+ bottom: "conv7_2_h"
845
+ top: "conv7_2_h"
846
+ }
847
+ layer {
848
+ name: "conv8_1_h"
849
+ type: "Convolution"
850
+ bottom: "conv7_2_h"
851
+ top: "conv8_1_h"
852
+ param {
853
+ lr_mult: 1
854
+ decay_mult: 1
855
+ }
856
+ param {
857
+ lr_mult: 2
858
+ decay_mult: 0
859
+ }
860
+ convolution_param {
861
+ num_output: 64
862
+ pad: 0
863
+ kernel_size: 1
864
+ stride: 1
865
+ weight_filler {
866
+ type: "xavier"
867
+ }
868
+ bias_filler {
869
+ type: "constant"
870
+ value: 0
871
+ }
872
+ }
873
+ }
874
+ layer {
875
+ name: "conv8_1_relu"
876
+ type: "ReLU"
877
+ bottom: "conv8_1_h"
878
+ top: "conv8_1_h"
879
+ }
880
+ layer {
881
+ name: "conv8_2_h"
882
+ type: "Convolution"
883
+ bottom: "conv8_1_h"
884
+ top: "conv8_2_h"
885
+ param {
886
+ lr_mult: 1
887
+ decay_mult: 1
888
+ }
889
+ param {
890
+ lr_mult: 2
891
+ decay_mult: 0
892
+ }
893
+ convolution_param {
894
+ num_output: 128
895
+ pad: 1
896
+ kernel_size: 3
897
+ stride: 1
898
+ weight_filler {
899
+ type: "xavier"
900
+ }
901
+ bias_filler {
902
+ type: "constant"
903
+ value: 0
904
+ }
905
+ }
906
+ }
907
+ layer {
908
+ name: "conv8_2_relu"
909
+ type: "ReLU"
910
+ bottom: "conv8_2_h"
911
+ top: "conv8_2_h"
912
+ }
913
+ layer {
914
+ name: "conv9_1_h"
915
+ type: "Convolution"
916
+ bottom: "conv8_2_h"
917
+ top: "conv9_1_h"
918
+ param {
919
+ lr_mult: 1
920
+ decay_mult: 1
921
+ }
922
+ param {
923
+ lr_mult: 2
924
+ decay_mult: 0
925
+ }
926
+ convolution_param {
927
+ num_output: 64
928
+ pad: 0
929
+ kernel_size: 1
930
+ stride: 1
931
+ weight_filler {
932
+ type: "xavier"
933
+ }
934
+ bias_filler {
935
+ type: "constant"
936
+ value: 0
937
+ }
938
+ }
939
+ }
940
+ layer {
941
+ name: "conv9_1_relu"
942
+ type: "ReLU"
943
+ bottom: "conv9_1_h"
944
+ top: "conv9_1_h"
945
+ }
946
+ layer {
947
+ name: "conv9_2_h"
948
+ type: "Convolution"
949
+ bottom: "conv9_1_h"
950
+ top: "conv9_2_h"
951
+ param {
952
+ lr_mult: 1
953
+ decay_mult: 1
954
+ }
955
+ param {
956
+ lr_mult: 2
957
+ decay_mult: 0
958
+ }
959
+ convolution_param {
960
+ num_output: 128
961
+ pad: 1
962
+ kernel_size: 3
963
+ stride: 1
964
+ weight_filler {
965
+ type: "xavier"
966
+ }
967
+ bias_filler {
968
+ type: "constant"
969
+ value: 0
970
+ }
971
+ }
972
+ }
973
+ layer {
974
+ name: "conv9_2_relu"
975
+ type: "ReLU"
976
+ bottom: "conv9_2_h"
977
+ top: "conv9_2_h"
978
+ }
979
+ layer {
980
+ name: "conv4_3_norm"
981
+ type: "Normalize"
982
+ bottom: "layer_256_1_bn1"
983
+ top: "conv4_3_norm"
984
+ norm_param {
985
+ across_spatial: false
986
+ scale_filler {
987
+ type: "constant"
988
+ value: 20
989
+ }
990
+ channel_shared: false
991
+ }
992
+ }
993
+ layer {
994
+ name: "conv4_3_norm_mbox_loc"
995
+ type: "Convolution"
996
+ bottom: "conv4_3_norm"
997
+ top: "conv4_3_norm_mbox_loc"
998
+ param {
999
+ lr_mult: 1
1000
+ decay_mult: 1
1001
+ }
1002
+ param {
1003
+ lr_mult: 2
1004
+ decay_mult: 0
1005
+ }
1006
+ convolution_param {
1007
+ num_output: 16
1008
+ pad: 1
1009
+ kernel_size: 3
1010
+ stride: 1
1011
+ weight_filler {
1012
+ type: "xavier"
1013
+ }
1014
+ bias_filler {
1015
+ type: "constant"
1016
+ value: 0
1017
+ }
1018
+ }
1019
+ }
1020
+ layer {
1021
+ name: "conv4_3_norm_mbox_loc_perm"
1022
+ type: "Permute"
1023
+ bottom: "conv4_3_norm_mbox_loc"
1024
+ top: "conv4_3_norm_mbox_loc_perm"
1025
+ permute_param {
1026
+ order: 0
1027
+ order: 2
1028
+ order: 3
1029
+ order: 1
1030
+ }
1031
+ }
1032
+ layer {
1033
+ name: "conv4_3_norm_mbox_loc_flat"
1034
+ type: "Flatten"
1035
+ bottom: "conv4_3_norm_mbox_loc_perm"
1036
+ top: "conv4_3_norm_mbox_loc_flat"
1037
+ flatten_param {
1038
+ axis: 1
1039
+ }
1040
+ }
1041
+ layer {
1042
+ name: "conv4_3_norm_mbox_conf"
1043
+ type: "Convolution"
1044
+ bottom: "conv4_3_norm"
1045
+ top: "conv4_3_norm_mbox_conf"
1046
+ param {
1047
+ lr_mult: 1
1048
+ decay_mult: 1
1049
+ }
1050
+ param {
1051
+ lr_mult: 2
1052
+ decay_mult: 0
1053
+ }
1054
+ convolution_param {
1055
+ num_output: 8 # 84
1056
+ pad: 1
1057
+ kernel_size: 3
1058
+ stride: 1
1059
+ weight_filler {
1060
+ type: "xavier"
1061
+ }
1062
+ bias_filler {
1063
+ type: "constant"
1064
+ value: 0
1065
+ }
1066
+ }
1067
+ }
1068
+ layer {
1069
+ name: "conv4_3_norm_mbox_conf_perm"
1070
+ type: "Permute"
1071
+ bottom: "conv4_3_norm_mbox_conf"
1072
+ top: "conv4_3_norm_mbox_conf_perm"
1073
+ permute_param {
1074
+ order: 0
1075
+ order: 2
1076
+ order: 3
1077
+ order: 1
1078
+ }
1079
+ }
1080
+ layer {
1081
+ name: "conv4_3_norm_mbox_conf_flat"
1082
+ type: "Flatten"
1083
+ bottom: "conv4_3_norm_mbox_conf_perm"
1084
+ top: "conv4_3_norm_mbox_conf_flat"
1085
+ flatten_param {
1086
+ axis: 1
1087
+ }
1088
+ }
1089
+ layer {
1090
+ name: "conv4_3_norm_mbox_priorbox"
1091
+ type: "PriorBox"
1092
+ bottom: "conv4_3_norm"
1093
+ bottom: "data"
1094
+ top: "conv4_3_norm_mbox_priorbox"
1095
+ prior_box_param {
1096
+ min_size: 30.0
1097
+ max_size: 60.0
1098
+ aspect_ratio: 2
1099
+ flip: true
1100
+ clip: false
1101
+ variance: 0.1
1102
+ variance: 0.1
1103
+ variance: 0.2
1104
+ variance: 0.2
1105
+ step: 8
1106
+ offset: 0.5
1107
+ }
1108
+ }
1109
+ layer {
1110
+ name: "fc7_mbox_loc"
1111
+ type: "Convolution"
1112
+ bottom: "fc7"
1113
+ top: "fc7_mbox_loc"
1114
+ param {
1115
+ lr_mult: 1
1116
+ decay_mult: 1
1117
+ }
1118
+ param {
1119
+ lr_mult: 2
1120
+ decay_mult: 0
1121
+ }
1122
+ convolution_param {
1123
+ num_output: 24
1124
+ pad: 1
1125
+ kernel_size: 3
1126
+ stride: 1
1127
+ weight_filler {
1128
+ type: "xavier"
1129
+ }
1130
+ bias_filler {
1131
+ type: "constant"
1132
+ value: 0
1133
+ }
1134
+ }
1135
+ }
1136
+ layer {
1137
+ name: "fc7_mbox_loc_perm"
1138
+ type: "Permute"
1139
+ bottom: "fc7_mbox_loc"
1140
+ top: "fc7_mbox_loc_perm"
1141
+ permute_param {
1142
+ order: 0
1143
+ order: 2
1144
+ order: 3
1145
+ order: 1
1146
+ }
1147
+ }
1148
+ layer {
1149
+ name: "fc7_mbox_loc_flat"
1150
+ type: "Flatten"
1151
+ bottom: "fc7_mbox_loc_perm"
1152
+ top: "fc7_mbox_loc_flat"
1153
+ flatten_param {
1154
+ axis: 1
1155
+ }
1156
+ }
1157
+ layer {
1158
+ name: "fc7_mbox_conf"
1159
+ type: "Convolution"
1160
+ bottom: "fc7"
1161
+ top: "fc7_mbox_conf"
1162
+ param {
1163
+ lr_mult: 1
1164
+ decay_mult: 1
1165
+ }
1166
+ param {
1167
+ lr_mult: 2
1168
+ decay_mult: 0
1169
+ }
1170
+ convolution_param {
1171
+ num_output: 12 # 126
1172
+ pad: 1
1173
+ kernel_size: 3
1174
+ stride: 1
1175
+ weight_filler {
1176
+ type: "xavier"
1177
+ }
1178
+ bias_filler {
1179
+ type: "constant"
1180
+ value: 0
1181
+ }
1182
+ }
1183
+ }
1184
+ layer {
1185
+ name: "fc7_mbox_conf_perm"
1186
+ type: "Permute"
1187
+ bottom: "fc7_mbox_conf"
1188
+ top: "fc7_mbox_conf_perm"
1189
+ permute_param {
1190
+ order: 0
1191
+ order: 2
1192
+ order: 3
1193
+ order: 1
1194
+ }
1195
+ }
1196
+ layer {
1197
+ name: "fc7_mbox_conf_flat"
1198
+ type: "Flatten"
1199
+ bottom: "fc7_mbox_conf_perm"
1200
+ top: "fc7_mbox_conf_flat"
1201
+ flatten_param {
1202
+ axis: 1
1203
+ }
1204
+ }
1205
+ layer {
1206
+ name: "fc7_mbox_priorbox"
1207
+ type: "PriorBox"
1208
+ bottom: "fc7"
1209
+ bottom: "data"
1210
+ top: "fc7_mbox_priorbox"
1211
+ prior_box_param {
1212
+ min_size: 60.0
1213
+ max_size: 111.0
1214
+ aspect_ratio: 2
1215
+ aspect_ratio: 3
1216
+ flip: true
1217
+ clip: false
1218
+ variance: 0.1
1219
+ variance: 0.1
1220
+ variance: 0.2
1221
+ variance: 0.2
1222
+ step: 16
1223
+ offset: 0.5
1224
+ }
1225
+ }
1226
+ layer {
1227
+ name: "conv6_2_mbox_loc"
1228
+ type: "Convolution"
1229
+ bottom: "conv6_2_h"
1230
+ top: "conv6_2_mbox_loc"
1231
+ param {
1232
+ lr_mult: 1
1233
+ decay_mult: 1
1234
+ }
1235
+ param {
1236
+ lr_mult: 2
1237
+ decay_mult: 0
1238
+ }
1239
+ convolution_param {
1240
+ num_output: 24
1241
+ pad: 1
1242
+ kernel_size: 3
1243
+ stride: 1
1244
+ weight_filler {
1245
+ type: "xavier"
1246
+ }
1247
+ bias_filler {
1248
+ type: "constant"
1249
+ value: 0
1250
+ }
1251
+ }
1252
+ }
1253
+ layer {
1254
+ name: "conv6_2_mbox_loc_perm"
1255
+ type: "Permute"
1256
+ bottom: "conv6_2_mbox_loc"
1257
+ top: "conv6_2_mbox_loc_perm"
1258
+ permute_param {
1259
+ order: 0
1260
+ order: 2
1261
+ order: 3
1262
+ order: 1
1263
+ }
1264
+ }
1265
+ layer {
1266
+ name: "conv6_2_mbox_loc_flat"
1267
+ type: "Flatten"
1268
+ bottom: "conv6_2_mbox_loc_perm"
1269
+ top: "conv6_2_mbox_loc_flat"
1270
+ flatten_param {
1271
+ axis: 1
1272
+ }
1273
+ }
1274
+ layer {
1275
+ name: "conv6_2_mbox_conf"
1276
+ type: "Convolution"
1277
+ bottom: "conv6_2_h"
1278
+ top: "conv6_2_mbox_conf"
1279
+ param {
1280
+ lr_mult: 1
1281
+ decay_mult: 1
1282
+ }
1283
+ param {
1284
+ lr_mult: 2
1285
+ decay_mult: 0
1286
+ }
1287
+ convolution_param {
1288
+ num_output: 12 # 126
1289
+ pad: 1
1290
+ kernel_size: 3
1291
+ stride: 1
1292
+ weight_filler {
1293
+ type: "xavier"
1294
+ }
1295
+ bias_filler {
1296
+ type: "constant"
1297
+ value: 0
1298
+ }
1299
+ }
1300
+ }
1301
+ layer {
1302
+ name: "conv6_2_mbox_conf_perm"
1303
+ type: "Permute"
1304
+ bottom: "conv6_2_mbox_conf"
1305
+ top: "conv6_2_mbox_conf_perm"
1306
+ permute_param {
1307
+ order: 0
1308
+ order: 2
1309
+ order: 3
1310
+ order: 1
1311
+ }
1312
+ }
1313
+ layer {
1314
+ name: "conv6_2_mbox_conf_flat"
1315
+ type: "Flatten"
1316
+ bottom: "conv6_2_mbox_conf_perm"
1317
+ top: "conv6_2_mbox_conf_flat"
1318
+ flatten_param {
1319
+ axis: 1
1320
+ }
1321
+ }
1322
+ layer {
1323
+ name: "conv6_2_mbox_priorbox"
1324
+ type: "PriorBox"
1325
+ bottom: "conv6_2_h"
1326
+ bottom: "data"
1327
+ top: "conv6_2_mbox_priorbox"
1328
+ prior_box_param {
1329
+ min_size: 111.0
1330
+ max_size: 162.0
1331
+ aspect_ratio: 2
1332
+ aspect_ratio: 3
1333
+ flip: true
1334
+ clip: false
1335
+ variance: 0.1
1336
+ variance: 0.1
1337
+ variance: 0.2
1338
+ variance: 0.2
1339
+ step: 32
1340
+ offset: 0.5
1341
+ }
1342
+ }
1343
+ layer {
1344
+ name: "conv7_2_mbox_loc"
1345
+ type: "Convolution"
1346
+ bottom: "conv7_2_h"
1347
+ top: "conv7_2_mbox_loc"
1348
+ param {
1349
+ lr_mult: 1
1350
+ decay_mult: 1
1351
+ }
1352
+ param {
1353
+ lr_mult: 2
1354
+ decay_mult: 0
1355
+ }
1356
+ convolution_param {
1357
+ num_output: 24
1358
+ pad: 1
1359
+ kernel_size: 3
1360
+ stride: 1
1361
+ weight_filler {
1362
+ type: "xavier"
1363
+ }
1364
+ bias_filler {
1365
+ type: "constant"
1366
+ value: 0
1367
+ }
1368
+ }
1369
+ }
1370
+ layer {
1371
+ name: "conv7_2_mbox_loc_perm"
1372
+ type: "Permute"
1373
+ bottom: "conv7_2_mbox_loc"
1374
+ top: "conv7_2_mbox_loc_perm"
1375
+ permute_param {
1376
+ order: 0
1377
+ order: 2
1378
+ order: 3
1379
+ order: 1
1380
+ }
1381
+ }
1382
+ layer {
1383
+ name: "conv7_2_mbox_loc_flat"
1384
+ type: "Flatten"
1385
+ bottom: "conv7_2_mbox_loc_perm"
1386
+ top: "conv7_2_mbox_loc_flat"
1387
+ flatten_param {
1388
+ axis: 1
1389
+ }
1390
+ }
1391
+ layer {
1392
+ name: "conv7_2_mbox_conf"
1393
+ type: "Convolution"
1394
+ bottom: "conv7_2_h"
1395
+ top: "conv7_2_mbox_conf"
1396
+ param {
1397
+ lr_mult: 1
1398
+ decay_mult: 1
1399
+ }
1400
+ param {
1401
+ lr_mult: 2
1402
+ decay_mult: 0
1403
+ }
1404
+ convolution_param {
1405
+ num_output: 12 # 126
1406
+ pad: 1
1407
+ kernel_size: 3
1408
+ stride: 1
1409
+ weight_filler {
1410
+ type: "xavier"
1411
+ }
1412
+ bias_filler {
1413
+ type: "constant"
1414
+ value: 0
1415
+ }
1416
+ }
1417
+ }
1418
+ layer {
1419
+ name: "conv7_2_mbox_conf_perm"
1420
+ type: "Permute"
1421
+ bottom: "conv7_2_mbox_conf"
1422
+ top: "conv7_2_mbox_conf_perm"
1423
+ permute_param {
1424
+ order: 0
1425
+ order: 2
1426
+ order: 3
1427
+ order: 1
1428
+ }
1429
+ }
1430
+ layer {
1431
+ name: "conv7_2_mbox_conf_flat"
1432
+ type: "Flatten"
1433
+ bottom: "conv7_2_mbox_conf_perm"
1434
+ top: "conv7_2_mbox_conf_flat"
1435
+ flatten_param {
1436
+ axis: 1
1437
+ }
1438
+ }
1439
+ layer {
1440
+ name: "conv7_2_mbox_priorbox"
1441
+ type: "PriorBox"
1442
+ bottom: "conv7_2_h"
1443
+ bottom: "data"
1444
+ top: "conv7_2_mbox_priorbox"
1445
+ prior_box_param {
1446
+ min_size: 162.0
1447
+ max_size: 213.0
1448
+ aspect_ratio: 2
1449
+ aspect_ratio: 3
1450
+ flip: true
1451
+ clip: false
1452
+ variance: 0.1
1453
+ variance: 0.1
1454
+ variance: 0.2
1455
+ variance: 0.2
1456
+ step: 64
1457
+ offset: 0.5
1458
+ }
1459
+ }
1460
+ layer {
1461
+ name: "conv8_2_mbox_loc"
1462
+ type: "Convolution"
1463
+ bottom: "conv8_2_h"
1464
+ top: "conv8_2_mbox_loc"
1465
+ param {
1466
+ lr_mult: 1
1467
+ decay_mult: 1
1468
+ }
1469
+ param {
1470
+ lr_mult: 2
1471
+ decay_mult: 0
1472
+ }
1473
+ convolution_param {
1474
+ num_output: 16
1475
+ pad: 1
1476
+ kernel_size: 3
1477
+ stride: 1
1478
+ weight_filler {
1479
+ type: "xavier"
1480
+ }
1481
+ bias_filler {
1482
+ type: "constant"
1483
+ value: 0
1484
+ }
1485
+ }
1486
+ }
1487
+ layer {
1488
+ name: "conv8_2_mbox_loc_perm"
1489
+ type: "Permute"
1490
+ bottom: "conv8_2_mbox_loc"
1491
+ top: "conv8_2_mbox_loc_perm"
1492
+ permute_param {
1493
+ order: 0
1494
+ order: 2
1495
+ order: 3
1496
+ order: 1
1497
+ }
1498
+ }
1499
+ layer {
1500
+ name: "conv8_2_mbox_loc_flat"
1501
+ type: "Flatten"
1502
+ bottom: "conv8_2_mbox_loc_perm"
1503
+ top: "conv8_2_mbox_loc_flat"
1504
+ flatten_param {
1505
+ axis: 1
1506
+ }
1507
+ }
1508
+ layer {
1509
+ name: "conv8_2_mbox_conf"
1510
+ type: "Convolution"
1511
+ bottom: "conv8_2_h"
1512
+ top: "conv8_2_mbox_conf"
1513
+ param {
1514
+ lr_mult: 1
1515
+ decay_mult: 1
1516
+ }
1517
+ param {
1518
+ lr_mult: 2
1519
+ decay_mult: 0
1520
+ }
1521
+ convolution_param {
1522
+ num_output: 8 # 84
1523
+ pad: 1
1524
+ kernel_size: 3
1525
+ stride: 1
1526
+ weight_filler {
1527
+ type: "xavier"
1528
+ }
1529
+ bias_filler {
1530
+ type: "constant"
1531
+ value: 0
1532
+ }
1533
+ }
1534
+ }
1535
+ layer {
1536
+ name: "conv8_2_mbox_conf_perm"
1537
+ type: "Permute"
1538
+ bottom: "conv8_2_mbox_conf"
1539
+ top: "conv8_2_mbox_conf_perm"
1540
+ permute_param {
1541
+ order: 0
1542
+ order: 2
1543
+ order: 3
1544
+ order: 1
1545
+ }
1546
+ }
1547
+ layer {
1548
+ name: "conv8_2_mbox_conf_flat"
1549
+ type: "Flatten"
1550
+ bottom: "conv8_2_mbox_conf_perm"
1551
+ top: "conv8_2_mbox_conf_flat"
1552
+ flatten_param {
1553
+ axis: 1
1554
+ }
1555
+ }
1556
+ layer {
1557
+ name: "conv8_2_mbox_priorbox"
1558
+ type: "PriorBox"
1559
+ bottom: "conv8_2_h"
1560
+ bottom: "data"
1561
+ top: "conv8_2_mbox_priorbox"
1562
+ prior_box_param {
1563
+ min_size: 213.0
1564
+ max_size: 264.0
1565
+ aspect_ratio: 2
1566
+ flip: true
1567
+ clip: false
1568
+ variance: 0.1
1569
+ variance: 0.1
1570
+ variance: 0.2
1571
+ variance: 0.2
1572
+ step: 100
1573
+ offset: 0.5
1574
+ }
1575
+ }
1576
+ layer {
1577
+ name: "conv9_2_mbox_loc"
1578
+ type: "Convolution"
1579
+ bottom: "conv9_2_h"
1580
+ top: "conv9_2_mbox_loc"
1581
+ param {
1582
+ lr_mult: 1
1583
+ decay_mult: 1
1584
+ }
1585
+ param {
1586
+ lr_mult: 2
1587
+ decay_mult: 0
1588
+ }
1589
+ convolution_param {
1590
+ num_output: 16
1591
+ pad: 1
1592
+ kernel_size: 3
1593
+ stride: 1
1594
+ weight_filler {
1595
+ type: "xavier"
1596
+ }
1597
+ bias_filler {
1598
+ type: "constant"
1599
+ value: 0
1600
+ }
1601
+ }
1602
+ }
1603
+ layer {
1604
+ name: "conv9_2_mbox_loc_perm"
1605
+ type: "Permute"
1606
+ bottom: "conv9_2_mbox_loc"
1607
+ top: "conv9_2_mbox_loc_perm"
1608
+ permute_param {
1609
+ order: 0
1610
+ order: 2
1611
+ order: 3
1612
+ order: 1
1613
+ }
1614
+ }
1615
+ layer {
1616
+ name: "conv9_2_mbox_loc_flat"
1617
+ type: "Flatten"
1618
+ bottom: "conv9_2_mbox_loc_perm"
1619
+ top: "conv9_2_mbox_loc_flat"
1620
+ flatten_param {
1621
+ axis: 1
1622
+ }
1623
+ }
1624
+ layer {
1625
+ name: "conv9_2_mbox_conf"
1626
+ type: "Convolution"
1627
+ bottom: "conv9_2_h"
1628
+ top: "conv9_2_mbox_conf"
1629
+ param {
1630
+ lr_mult: 1
1631
+ decay_mult: 1
1632
+ }
1633
+ param {
1634
+ lr_mult: 2
1635
+ decay_mult: 0
1636
+ }
1637
+ convolution_param {
1638
+ num_output: 8 # 84
1639
+ pad: 1
1640
+ kernel_size: 3
1641
+ stride: 1
1642
+ weight_filler {
1643
+ type: "xavier"
1644
+ }
1645
+ bias_filler {
1646
+ type: "constant"
1647
+ value: 0
1648
+ }
1649
+ }
1650
+ }
1651
+ layer {
1652
+ name: "conv9_2_mbox_conf_perm"
1653
+ type: "Permute"
1654
+ bottom: "conv9_2_mbox_conf"
1655
+ top: "conv9_2_mbox_conf_perm"
1656
+ permute_param {
1657
+ order: 0
1658
+ order: 2
1659
+ order: 3
1660
+ order: 1
1661
+ }
1662
+ }
1663
+ layer {
1664
+ name: "conv9_2_mbox_conf_flat"
1665
+ type: "Flatten"
1666
+ bottom: "conv9_2_mbox_conf_perm"
1667
+ top: "conv9_2_mbox_conf_flat"
1668
+ flatten_param {
1669
+ axis: 1
1670
+ }
1671
+ }
1672
+ layer {
1673
+ name: "conv9_2_mbox_priorbox"
1674
+ type: "PriorBox"
1675
+ bottom: "conv9_2_h"
1676
+ bottom: "data"
1677
+ top: "conv9_2_mbox_priorbox"
1678
+ prior_box_param {
1679
+ min_size: 264.0
1680
+ max_size: 315.0
1681
+ aspect_ratio: 2
1682
+ flip: true
1683
+ clip: false
1684
+ variance: 0.1
1685
+ variance: 0.1
1686
+ variance: 0.2
1687
+ variance: 0.2
1688
+ step: 300
1689
+ offset: 0.5
1690
+ }
1691
+ }
1692
+ layer {
1693
+ name: "mbox_loc"
1694
+ type: "Concat"
1695
+ bottom: "conv4_3_norm_mbox_loc_flat"
1696
+ bottom: "fc7_mbox_loc_flat"
1697
+ bottom: "conv6_2_mbox_loc_flat"
1698
+ bottom: "conv7_2_mbox_loc_flat"
1699
+ bottom: "conv8_2_mbox_loc_flat"
1700
+ bottom: "conv9_2_mbox_loc_flat"
1701
+ top: "mbox_loc"
1702
+ concat_param {
1703
+ axis: 1
1704
+ }
1705
+ }
1706
+ layer {
1707
+ name: "mbox_conf"
1708
+ type: "Concat"
1709
+ bottom: "conv4_3_norm_mbox_conf_flat"
1710
+ bottom: "fc7_mbox_conf_flat"
1711
+ bottom: "conv6_2_mbox_conf_flat"
1712
+ bottom: "conv7_2_mbox_conf_flat"
1713
+ bottom: "conv8_2_mbox_conf_flat"
1714
+ bottom: "conv9_2_mbox_conf_flat"
1715
+ top: "mbox_conf"
1716
+ concat_param {
1717
+ axis: 1
1718
+ }
1719
+ }
1720
+ layer {
1721
+ name: "mbox_priorbox"
1722
+ type: "Concat"
1723
+ bottom: "conv4_3_norm_mbox_priorbox"
1724
+ bottom: "fc7_mbox_priorbox"
1725
+ bottom: "conv6_2_mbox_priorbox"
1726
+ bottom: "conv7_2_mbox_priorbox"
1727
+ bottom: "conv8_2_mbox_priorbox"
1728
+ bottom: "conv9_2_mbox_priorbox"
1729
+ top: "mbox_priorbox"
1730
+ concat_param {
1731
+ axis: 2
1732
+ }
1733
+ }
1734
+
1735
+ layer {
1736
+ name: "mbox_conf_reshape"
1737
+ type: "Reshape"
1738
+ bottom: "mbox_conf"
1739
+ top: "mbox_conf_reshape"
1740
+ reshape_param {
1741
+ shape {
1742
+ dim: 0
1743
+ dim: -1
1744
+ dim: 2
1745
+ }
1746
+ }
1747
+ }
1748
+ layer {
1749
+ name: "mbox_conf_softmax"
1750
+ type: "Softmax"
1751
+ bottom: "mbox_conf_reshape"
1752
+ top: "mbox_conf_softmax"
1753
+ softmax_param {
1754
+ axis: 2
1755
+ }
1756
+ }
1757
+ layer {
1758
+ name: "mbox_conf_flatten"
1759
+ type: "Flatten"
1760
+ bottom: "mbox_conf_softmax"
1761
+ top: "mbox_conf_flatten"
1762
+ flatten_param {
1763
+ axis: 1
1764
+ }
1765
+ }
1766
+
1767
+ layer {
1768
+ name: "detection_out"
1769
+ type: "DetectionOutput"
1770
+ bottom: "mbox_loc"
1771
+ bottom: "mbox_conf_flatten"
1772
+ bottom: "mbox_priorbox"
1773
+ top: "detection_out"
1774
+ include {
1775
+ phase: TEST
1776
+ }
1777
+ detection_output_param {
1778
+ num_classes: 2
1779
+ share_location: true
1780
+ background_label_id: 0
1781
+ nms_param {
1782
+ nms_threshold: 0.45
1783
+ top_k: 400
1784
+ }
1785
+ code_type: CENTER_SIZE
1786
+ keep_top_k: 200
1787
+ confidence_threshold: 0.01
1788
+ }
1789
+ }
models/emotion_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19c707063376d618b3e56df7537b6e103c7426e759c63ce5a4c33df414ce3612
+ size 299634
models/res10_300x300_ssd_iter_140000.caffemodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a56a11a57a4a295956b0660b4a3d76bbdca2206c4961cea8efe7d95c7cb2f2d
+ size 10666211
models/resnet_features.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58abd9efad0ad01490f42835b4dd082b346e0bfa3b6b9a6041295de905cf688f
+ size 44786742
models/shape_predictor_68_face_landmarks.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
+ size 99693937