Rahulk2197 committed
Commit ee94b36 · verified · 1 Parent(s): 78e78e3

Upload 18 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ models/res10_300x300_ssd_iter_140000.caffemodel filter=lfs diff=lfs merge=lfs -text
+ models/shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
functions/__pycache__/audio.cpython-312.pyc ADDED
Binary file (5.15 kB)

functions/__pycache__/fer.cpython-312.pyc ADDED
Binary file (7.79 kB)

functions/__pycache__/helper.cpython-312.pyc ADDED
Binary file (3.09 kB)

functions/__pycache__/models.cpython-312.pyc ADDED
Binary file (2.76 kB)

functions/__pycache__/valence_arousal.cpython-312.pyc ADDED
Binary file (4.86 kB)

functions/__pycache__/video.cpython-312.pyc ADDED
Binary file (4.89 kB)
functions/audio.py ADDED
@@ -0,0 +1,97 @@
+ import librosa
+ import numpy as np
+ import torch
+ from collections import Counter
+ import nltk
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe):
+     y, sr = librosa.load(audio_path, sr=16000)
+     inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
+     inputs = inputs.to(device, dtype=torch_dtype)
+     with torch.no_grad():
+         generated_ids = asrmodel.generate(inputs)
+     transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     # Sound intensity (RMS)
+     rms = librosa.feature.rms(y=y)
+     sound_intensity = np.mean(rms)
+
+     # Fundamental frequency (F0)
+     f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
+     fundamental_frequency = np.nanmean(f0)
+
+     # Spectral energy (based on STFT)
+     S = np.abs(librosa.stft(y))
+     spectral_energy = np.mean(np.sum(S ** 2, axis=0))
+
+     # Spectral centroid
+     spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+     avg_spectral_centroid = np.mean(spectral_centroid)
+
+     # Zero-crossing rate
+     zcr = librosa.feature.zero_crossing_rate(y)
+     zero_crossing_rate = np.mean(zcr)
+
+     # Pause detection (librosa.effects.split returns the *non-silent* spans)
+     silence_threshold = 40  # top_db must be positive; energy 40 dB below the peak counts as silence
+     non_silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
+     voiced_duration = 0
+     for start, end in non_silent_intervals:
+         voiced_duration += (end - start) / sr  # voiced time in seconds
+
+     total_duration = librosa.get_duration(y=y, sr=sr)
+     pause_rate = ((total_duration - voiced_duration) / total_duration) * 60  # pause seconds per minute of audio
+
+     # Transcript processing
+     words = nltk.word_tokenize(transcript)
+     num_words = len(words)
+     unique_words = len(set(words))
+     word_frequencies = Counter(words)
+
+     duration_minutes = total_duration / 60
+     avg_words_per_minute = num_words / duration_minutes
+     avg_unique_words_per_minute = unique_words / duration_minutes
+
+     # Count of unique words
+     unique_word_count = unique_words
+
+     # Filler word detection
+     filler_words = [
+         'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
+         'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
+         'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
+         'totally', 'honestly', 'seriously', 'alright'
+     ]
+     filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
+     filler_words_per_minute = filler_word_count / duration_minutes
+
+     # POS tagging
+     pos_tags = nltk.pos_tag(words)
+     nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
+     adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
+     verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
+
+     # Sentiment analysis
+     sentiment = sentipipe(transcript)
+
+     print("Nouns: ", nouns)
+     print("Adjectives: ", adjectives)
+     print("Verbs: ", verbs)
+
+     return {
+         "transcript": transcript,
+         "sentiment": sentiment,
+         "sound_intensity": float(sound_intensity),  # numpy scalars cast to plain Python types
+         "fundamental_frequency": float(fundamental_frequency),
+         "spectral_energy": float(spectral_energy),
+         "spectral_centroid": float(avg_spectral_centroid),
+         "zero_crossing_rate": float(zero_crossing_rate),
+         "avg_words_per_minute": float(avg_words_per_minute),
+         "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
+         "unique_word_count": int(unique_word_count),
+         "filler_words_per_minute": float(filler_words_per_minute),
+         "noun_count": len(nouns),
+         "adjective_count": len(adjectives),
+         "verb_count": len(verbs),
+         "pause_rate": float(pause_rate),
+     }
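
For context (not part of this commit): a minimal sketch of how extract_audio_features is wired to the Whisper model, processor and sentiment pipeline that functions/models.py builds; "sample.wav" is a placeholder path.

from functions.models import models_dict
from functions.audio import extract_audio_features

# Any audio file librosa can decode works; it is resampled to 16 kHz inside the function.
features = extract_audio_features(
    "sample.wav",
    models_dict["asrmodel"],    # Whisper-small seq2seq model
    models_dict["asrproc"],     # matching AutoProcessor
    models_dict["sentipipe"],   # cardiffnlp sentiment pipeline
)
print(features["transcript"])
print(features["avg_words_per_minute"], features["pause_rate"])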
functions/fer.py ADDED
@@ -0,0 +1,137 @@
+ import cv2
+ import torch
+ import torchvision.transforms as transforms
+ from PIL import Image
+ import numpy as np
+ import timm
+ from tqdm import tqdm
+ import torch.nn as nn
+ import os
+ import matplotlib.pyplot as plt
+ import matplotlib
+ matplotlib.use('Agg')
+
+ import torch.nn.functional as F
+ import pandas as pd
+
+ class Model:
+     def __init__(self, fps, fer_model):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.transform = transforms.Compose([transforms.Resize((224, 224)),
+                                              transforms.ToTensor(),
+                                              transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
+                                             )
+         self.fermodel = timm.create_model("tf_efficientnet_b0_ns", pretrained=False)
+         self.fermodel.classifier = torch.nn.Identity()
+         self.fermodel.classifier = nn.Sequential(
+             nn.Linear(in_features=1280, out_features=7)
+         )
+         self.fermodel = torch.load(
+             fer_model,
+             map_location=self.device)
+         self.fermodel.to(self.device)
+
+         self.class_labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprised"]
+         self.emotion_reorder = {
+             0: 6,
+             1: 5,
+             2: 4,
+             3: 1,
+             4: 0,
+             5: 2,
+             6: 3,
+         }
+         self.label_dict = {
+             0: "angry",
+             1: "disgust",
+             2: "fear",
+             3: "happy",
+             4: "neutral",
+             5: "sad",
+             6: "surprised",
+         }
+         self.class_wise_frame_count = None
+         self.emotion_count = [0] * 7
+         self.frame_count = 0
+         self.fps = fps
+         self.df = None
+         self.faces_ = 0
+     def predict(self, frames):
+         emotion_list = []
+         emt = []
+         for frame in tqdm(frames):
+             if frame is not None:
+                 frame = np.copy(frame)
+                 face_pil = Image.fromarray(
+                     cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                 )
+                 face_tensor = self.transform(face_pil).unsqueeze(0).to(self.device)
+                 with torch.no_grad():
+                     output = self.fermodel(face_tensor)
+                     _, predicted = torch.max(output, 1)
+                 emotion = self.emotion_reorder[predicted.item()]
+                 if isinstance(emotion, np.ndarray):
+                     emotion = (
+                         emotion.astype(float).item()
+                         if emotion.size == 1
+                         else emotion.tolist()
+                     )
+                 emotion = torch.tensor(
+                     [emotion], dtype=torch.float32
+                 )  # Ensures it's a tensor
+                 emotion = emotion.to(self.device)
+                 emt.append(emotion)
+                 self.emotion_count[predicted.item()] += 1
+                 label = f"{self.label_dict[predicted.item()]}"
+                 emotion_list.append(label)
+             else:
+                 emt.append('frame error')
+                 emotion_list.append('frame error')
+         return emotion_list, emt
+
+     def get_data(self, emotion_list, emt):
+         self.class_wise_frame_count = dict(zip(self.class_labels, self.emotion_count))
+         return emotion_list, self.class_wise_frame_count, emt
+
+ def fer_predict(video_frames, fps, model):
+     emotion_list, emt = model.predict(video_frames)
+     return model.get_data(emotion_list, emt)
+
+ def filter(list1, list2):
+     filtered_list1 = [x for i, x in enumerate(list1) if list2[i] != 'fnf']
+     filtered_list2 = [x for x in list2 if x != 'fnf']
+     return [filtered_list1, filtered_list2]
+
+ def plot_graph(x, y_vals, labels, path, calib_vals=None):
+     """
+     Plots multiple subplots (one for each variable) in one figure.
+
+     Parameters:
+     - x: List of timestamps or frame numbers.
+     - y_vals: List of y-values for valence, arousal, and stress (or other metrics).
+     - labels: List of variable names corresponding to y_vals (e.g., ['valence', 'arousal', 'stress']).
+     - path: Path to save the combined plot.
+     - calib_vals: List of calibration values for each variable (optional).
+     """
+     plt.figure(figsize=(12, 8))  # Create a figure of appropriate size
+
+     # Iterate over y-values, labels, and calibration values to create subplots
+     for i, (y, label) in enumerate(zip(y_vals, labels)):
+         y = [value if isinstance(value, (int, float)) else np.nan for value in y]
+
+         # Create a subplot (one row per variable, current subplot index)
+         plt.subplot(len(y_vals), 1, i + 1)
+         plt.plot(range(len(x)), y, linestyle='-')
+
+         # Plot calibration line if provided
+         if calib_vals and calib_vals[i] is not None:
+             plt.axhline(y=calib_vals[i], color='r', linestyle='--', label=f'{label} calibration = {calib_vals[i]}')
+
+         plt.xlabel('Frame')
+         plt.ylabel(label)
+         plt.title(f'{label} By Frames')
+         plt.legend()
+
+     plt.tight_layout()  # Adjust layout to prevent overlap
+     plt.savefig(path)
+     plt.clf()  # Clear the figure after saving
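
For context (not part of this commit): a sketch of per-frame emotion prediction, assuming `faces` is a list of BGR face crops (or None for frames where detection failed), such as the output of functions/helper.py.

from functions.models import models_dict
from functions.fer import fer_predict

# labels: per-frame emotion names; counts: frames per class; tensors: remapped indices fed to the VA model
labels, class_wise_counts, emotion_tensors = fer_predict(faces, 30, models_dict["fer"])
print(class_wise_counts)   # e.g. {'angry': 3, 'happy': 42, ...}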
functions/helper.py ADDED
@@ -0,0 +1,58 @@
+ import cv2
+ import numpy as np
+ import dlib
+ from tqdm import tqdm
+
+
+ def extract_face(image, net, predictor):
+     (h, w) = image.shape[:2]
+     blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
+     net.setInput(blob)
+     detections = net.forward()
+     for i in range(0, detections.shape[2]):
+         confidence = detections[0, 0, i, 2]
+
+         # Filter out weak detections
+         if confidence > 0.5:
+             box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
+             (startX, startY, endX, endY) = box.astype("int")
+
+             # Convert bounding box to dlib rectangle format
+             dlib_rect = dlib.rectangle(int(startX), int(startY), int(endX), int(endY))
+             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+             landmarks = predictor(gray, dlib_rect)
+             landmarks_np = np.array([[p.x, p.y] for p in landmarks.parts()])
+             x, y, w, h = cv2.boundingRect(landmarks_np)
+             x -= 25
+             y -= 25
+             w += 50
+             h += 50
+
+             x = max(0, x)
+             y = max(0, y)
+             w = min(w, image.shape[1] - x)
+             h = min(h, image.shape[0] - y)
+             face_crop = image[y:y + h, x:x + w]
+             # Crop and resize the face
+             try:
+                 face_crop = cv2.resize(face_crop, (224, 224))
+             except:
+                 face_crop = cv2.resize(image, (224, 224))
+             return face_crop, landmarks_np, (w, h)
+     return None, None, None
+
+ def extract_faces_from_frames(frames, net, predictor):
+     faces_list = []
+     landmarks_list = []
+     sizes_list = []
+
+     for image in tqdm(frames):
+         face_crop, landmarks_np, size = extract_face(image, net, predictor)
+
+         # Append the results to the respective lists
+         faces_list.append(face_crop)
+         landmarks_list.append(landmarks_np)
+         sizes_list.append(size)
+
+     return faces_list, landmarks_list, sizes_list
+
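
For context (not part of this commit): a sketch of the face-extraction step, assuming frames are BGR arrays read with OpenCV and the detector/predictor pair comes from functions/models.py; "interview.mp4" is a placeholder.

import cv2
from functions.models import models_dict
from functions.helper import extract_faces_from_frames

cap = cv2.VideoCapture("interview.mp4")   # placeholder video path
frames = []
while True:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(frame)
cap.release()

dnn_net, predictor = models_dict["face"]
faces, landmarks, sizes = extract_faces_from_frames(frames, dnn_net, predictor)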
functions/models.py ADDED
@@ -0,0 +1,56 @@
+ import nltk
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import os
+ from functions.fer import Model
+ import cv2
+ import dlib
+ from functions.valence_arousal import load_models
+ # Download necessary NLTK packages
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+
+
+ # Device setup
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ models_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models')
+ fer_model_path = os.path.join(models_folder, '22.6_AffectNet_10K_part2.pt')
+ val_ar_feat_path = os.path.join(models_folder, 'resnet_features.pt')
+ valence_arousal_model = os.path.join(models_folder, 'emotion_model.pt')
+
+
+ # Load Whisper model and processor
+ model_id = "openai/whisper-small"
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model.to(device)
+ processor = AutoProcessor.from_pretrained(model_id)
+ sentipipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
+
+
+
+
+ fer_model = Model(fps=30, fer_model=fer_model_path)
+ resnet, emotion_model = load_models(valence_arousal_model, val_ar_feat_path)
+
+ smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_smile.xml')
+
+
+ dnn_net = cv2.dnn.readNetFromCaffe("models/deploy.prototxt", "models/res10_300x300_ssd_iter_140000.caffemodel")
+
+ predictor = dlib.shape_predictor("models/shape_predictor_68_face_landmarks.dat")
+
+
+
+
+ models_dict = {
+     'asrmodel': model,
+     'asrproc': processor,
+     'sentipipe': sentipipe,
+     'fer': fer_model,
+     "valence_fer": (resnet, emotion_model),
+     'smile_cascade': smile_cascade,
+     'face': (dnn_net, predictor)
+ }
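
Note that everything in this module runs at import time: the first `import functions.models` downloads the NLTK data, loads Whisper and the sentiment pipeline, and deserializes the face, FER and valence-arousal weights, with the "models/..." paths resolved relative to the working directory. A sketch (not part of this commit) of how a caller unpacks the registry:

from functions.models import models_dict

asr_model = models_dict["asrmodel"]
asr_processor = models_dict["asrproc"]
sentiment_pipe = models_dict["sentipipe"]
fer = models_dict["fer"]                               # functions.fer.Model instance
resnet_backbone, va_head = models_dict["valence_fer"]
smile_cascade = models_dict["smile_cascade"]
face_net, landmark_predictor = models_dict["face"]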
functions/valence_arousal.py ADDED
@@ -0,0 +1,93 @@
+ from torchvision import models
+ import torch.nn as nn
+ import torch
+ import torch.nn.functional as F
+ import torchvision.transforms as transforms
+ from PIL import Image
+ import cv2
+
+ def create_emotion_model(num_ftrs, num_emotions):
+     return nn.Sequential(
+         nn.Linear(num_ftrs + num_emotions, 128),
+         nn.ReLU(),
+         nn.Linear(128, 64),
+         nn.ReLU(),
+         nn.Linear(64, 2),
+     )
+ def load_models(val_model_path, val_featmodel_path):
+     transform = transforms.Compose(
+         [
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ]
+     )
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     resnet = models.resnet18(pretrained=False)
+     num_ftrs = resnet.fc.in_features
+     resnet.fc = nn.Identity()
+     resnet.load_state_dict(
+         torch.load(
+             val_featmodel_path,
+             map_location=device
+         )
+     )
+     resnet = resnet.to(device)
+
+     # num_ftrs = resnet.fc.in_features
+     num_emotions = 1
+     emotion_model = create_emotion_model(num_ftrs, num_emotions).to(device)
+     emotion_model.load_state_dict(
+         torch.load(
+             val_model_path,
+             map_location=device
+         )
+     )
+     return resnet, emotion_model
+
+
+
+ def va_predict(emotion_model, resnet, faces, emotions):
+     transform = transforms.Compose(
+         [
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ]
+     )
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     def model_forward(images, emotions):
+         resnet_features = resnet(images)
+         batch_size = resnet_features.size(0)
+         emotions = emotions.view(batch_size, -1)
+         x = torch.cat((resnet_features, emotions), dim=1)
+         output = emotion_model(x)
+         return output
+
+     arousal_list = []
+     valence_list = []
+     stress_list = []
+     from tqdm import tqdm
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     for face, emotion in tqdm(zip(faces, emotions)):
+         if face is not None:
+             face_pil = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
+             face_tensor = transform(face_pil).unsqueeze(0).to(device)
+             emotion = emotion.to(device)
+             output_va = model_forward(face_tensor, emotion)
+             arousal = output_va[0][0].item()
+             norm_arousal = float(output_va[0][0].item()) / 2 + 0.5
+             valence = output_va[0][1].item()
+             norm_valence = float(output_va[0][1].item()) / 2 + 0.5
+             stress = (1 - norm_valence) * norm_arousal
+             arousal_list.append(arousal)
+             valence_list.append(valence)
+             stress_list.append(stress)
+         else:
+             arousal_list.append('frame error')
+             valence_list.append('frame error')
+             stress_list.append('frame error')
+     return valence_list, arousal_list, stress_list
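
For context (not part of this commit): a sketch chaining FER and valence/arousal prediction, assuming `faces` comes from functions/helper.py. The emotion tensors returned by fer_predict are the `emotions` argument expected here; frames without a face carry the string 'frame error' in every output list.

from functions.models import models_dict
from functions.fer import fer_predict
from functions.valence_arousal import va_predict

resnet, emotion_model = models_dict["valence_fer"]
labels, counts, emotion_tensors = fer_predict(faces, 30, models_dict["fer"])
valence, arousal, stress = va_predict(emotion_model, resnet, faces, emotion_tensors)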
functions/video.py ADDED
@@ -0,0 +1,114 @@
+ import numpy as np
+ from scipy.spatial import distance as dist
+ from imutils import face_utils
+ (lStart, lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
+ (rStart, rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]
+
+ def euclidean_distance(point1, point2):
+     return np.linalg.norm(point1 - point2)
+
+ def eyebrow(landmarks, sizes):
+     eyebrow_dist = []
+     for landmark, size in zip(landmarks, sizes):
+         if landmark is not None:
+             right_eyebrow_inner = landmark[21]
+             left_eyebrow_inner = landmark[22]
+             eyebrow_distance = euclidean_distance(right_eyebrow_inner, left_eyebrow_inner)
+             normalized_eyebrow_distance = eyebrow_distance / size[0]
+
+         else:
+             normalized_eyebrow_distance = None
+         eyebrow_dist.append(normalized_eyebrow_distance)
+     return eyebrow_dist
+
+ def eye_aspect_ratio(eye):
+     A = dist.euclidean(eye[1], eye[5])  # Vertical distance 1
+     B = dist.euclidean(eye[2], eye[4])  # Vertical distance 2
+     C = dist.euclidean(eye[0], eye[3])  # Horizontal distance
+     ear = (A + B) / (2.0 * C)  # EAR formula
+     return ear
+
+ def blinks(landmarks, sizes, fps):
+     blink_durations = []
+     blink_counter = 0
+     total_blinks = 0
+     EYE_AR_THRESH = 0.24  # EAR threshold for blink detection
+     EYE_AR_CONSEC_FRAMES = 4  # Consecutive frames for blink detection
+
+     frame_count = 0  # Initialize frame counter
+
+     for landmark, size in zip(landmarks, sizes):
+         if landmark is not None:
+             leftEye = landmark[lStart:lEnd]
+             rightEye = landmark[rStart:rEnd]
+
+             leftEAR = eye_aspect_ratio(leftEye)
+             rightEAR = eye_aspect_ratio(rightEye)
+
+             if leftEAR < EYE_AR_THRESH and rightEAR < EYE_AR_THRESH:
+                 if blink_counter == 0:
+                     blink_start_frame = frame_count  # Start tracking blink in frames
+                 blink_counter += 1
+             else:
+                 if blink_counter >= EYE_AR_CONSEC_FRAMES:
+                     blink_end_frame = frame_count
+                     blink_duration_frames = blink_end_frame - blink_start_frame
+                     blink_duration_seconds = blink_duration_frames / fps  # Convert frames to seconds
+                     blink_durations.append(blink_duration_seconds)  # Store blink duration in seconds
+                     total_blinks += 1
+                 blink_counter = 0
+
+         frame_count += 1  # Increment the frame counter for each loop iteration
+
+     return blink_durations, total_blinks
+
+
+ def detect_smiles(faces, smile_cascade):
+     smiles = []
+     count = 0
+     for face in faces:
+         if face is not None:
+             smile = smile_cascade.detectMultiScale(face, scaleFactor=1.8, minNeighbors=20, minSize=(25, 25))
+             if len(smile) > 0:
+                 smiles.append(True)
+                 count += 1
+             else:
+                 smiles.append(False)
+         else:
+             smiles.append(None)
+     return smiles, count
+
+ def cal_yawn(landmarks):
+     # Corrected lip landmark indices for dlib's 68-point model
+     top_lip_idx = [50, 51, 52, 53, 61, 62, 63]
+     low_lip_idx = [56, 57, 58, 59, 65, 66, 67]
+
+     top_lip = np.array([landmarks[idx] for idx in top_lip_idx])
+     low_lip = np.array([landmarks[idx] for idx in low_lip_idx])
+
+     top_mean = np.mean(top_lip, axis=0)
+     low_mean = np.mean(low_lip, axis=0)
+
+     distance = dist.euclidean(top_mean, low_mean)
+     return distance
+
+ def detect_yawn(landmarks, sizes):
+     yawn = []
+     count = 0
+     normalized_yawn_thresh = 0.25
+     normalized_lip_distances = []
+     for landmark, size in zip(landmarks, sizes):
+         if landmark is not None:
+             lip_dist = cal_yawn(landmark)
+             face_size = dist.euclidean(landmark[8], landmark[27])
+             normalized_lip_dist = lip_dist / face_size
+             normalized_lip_distances.append(normalized_lip_dist)
+             if normalized_lip_dist > normalized_yawn_thresh:
+                 yawn.append(True)
+                 count += 1
+             else:
+                 yawn.append(False)
+         else:
+             normalized_lip_distances.append(None)
+             yawn.append(None)
+     return yawn, normalized_lip_distances, count
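
For context (not part of this commit): a sketch of the per-frame facial-behaviour metrics, assuming `faces`, `landmarks` and `sizes` come from functions/helper.py and the video runs at 30 fps.

from functions.models import models_dict
from functions.video import eyebrow, blinks, detect_smiles, detect_yawn

fps = 30
eyebrow_dist = eyebrow(landmarks, sizes)                        # normalized inner-eyebrow distance per frame
blink_durations, total_blinks = blinks(landmarks, sizes, fps)   # blink lengths in seconds and blink count
smiles, smile_count = detect_smiles(faces, models_dict["smile_cascade"])
yawns, lip_distances, yawn_count = detect_yawn(landmarks, sizes)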
models/22.6_AffectNet_10K_part2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92ef53adb843700faa3c54ae6f3e0f4105e04e099f9190dd66aafc360afdb2bf
+ size 16425358
models/deploy.prototxt ADDED
@@ -0,0 +1,1789 @@
1
+ input: "data"
2
+ input_shape {
3
+ dim: 1
4
+ dim: 3
5
+ dim: 300
6
+ dim: 300
7
+ }
8
+
9
+ layer {
10
+ name: "data_bn"
11
+ type: "BatchNorm"
12
+ bottom: "data"
13
+ top: "data_bn"
14
+ param {
15
+ lr_mult: 0.0
16
+ }
17
+ param {
18
+ lr_mult: 0.0
19
+ }
20
+ param {
21
+ lr_mult: 0.0
22
+ }
23
+ }
24
+ layer {
25
+ name: "data_scale"
26
+ type: "Scale"
27
+ bottom: "data_bn"
28
+ top: "data_bn"
29
+ param {
30
+ lr_mult: 1.0
31
+ decay_mult: 1.0
32
+ }
33
+ param {
34
+ lr_mult: 2.0
35
+ decay_mult: 1.0
36
+ }
37
+ scale_param {
38
+ bias_term: true
39
+ }
40
+ }
41
+ layer {
42
+ name: "conv1_h"
43
+ type: "Convolution"
44
+ bottom: "data_bn"
45
+ top: "conv1_h"
46
+ param {
47
+ lr_mult: 1.0
48
+ decay_mult: 1.0
49
+ }
50
+ param {
51
+ lr_mult: 2.0
52
+ decay_mult: 1.0
53
+ }
54
+ convolution_param {
55
+ num_output: 32
56
+ pad: 3
57
+ kernel_size: 7
58
+ stride: 2
59
+ weight_filler {
60
+ type: "msra"
61
+ variance_norm: FAN_OUT
62
+ }
63
+ bias_filler {
64
+ type: "constant"
65
+ value: 0.0
66
+ }
67
+ }
68
+ }
69
+ layer {
70
+ name: "conv1_bn_h"
71
+ type: "BatchNorm"
72
+ bottom: "conv1_h"
73
+ top: "conv1_h"
74
+ param {
75
+ lr_mult: 0.0
76
+ }
77
+ param {
78
+ lr_mult: 0.0
79
+ }
80
+ param {
81
+ lr_mult: 0.0
82
+ }
83
+ }
84
+ layer {
85
+ name: "conv1_scale_h"
86
+ type: "Scale"
87
+ bottom: "conv1_h"
88
+ top: "conv1_h"
89
+ param {
90
+ lr_mult: 1.0
91
+ decay_mult: 1.0
92
+ }
93
+ param {
94
+ lr_mult: 2.0
95
+ decay_mult: 1.0
96
+ }
97
+ scale_param {
98
+ bias_term: true
99
+ }
100
+ }
101
+ layer {
102
+ name: "conv1_relu"
103
+ type: "ReLU"
104
+ bottom: "conv1_h"
105
+ top: "conv1_h"
106
+ }
107
+ layer {
108
+ name: "conv1_pool"
109
+ type: "Pooling"
110
+ bottom: "conv1_h"
111
+ top: "conv1_pool"
112
+ pooling_param {
113
+ kernel_size: 3
114
+ stride: 2
115
+ }
116
+ }
117
+ layer {
118
+ name: "layer_64_1_conv1_h"
119
+ type: "Convolution"
120
+ bottom: "conv1_pool"
121
+ top: "layer_64_1_conv1_h"
122
+ param {
123
+ lr_mult: 1.0
124
+ decay_mult: 1.0
125
+ }
126
+ convolution_param {
127
+ num_output: 32
128
+ bias_term: false
129
+ pad: 1
130
+ kernel_size: 3
131
+ stride: 1
132
+ weight_filler {
133
+ type: "msra"
134
+ }
135
+ bias_filler {
136
+ type: "constant"
137
+ value: 0.0
138
+ }
139
+ }
140
+ }
141
+ layer {
142
+ name: "layer_64_1_bn2_h"
143
+ type: "BatchNorm"
144
+ bottom: "layer_64_1_conv1_h"
145
+ top: "layer_64_1_conv1_h"
146
+ param {
147
+ lr_mult: 0.0
148
+ }
149
+ param {
150
+ lr_mult: 0.0
151
+ }
152
+ param {
153
+ lr_mult: 0.0
154
+ }
155
+ }
156
+ layer {
157
+ name: "layer_64_1_scale2_h"
158
+ type: "Scale"
159
+ bottom: "layer_64_1_conv1_h"
160
+ top: "layer_64_1_conv1_h"
161
+ param {
162
+ lr_mult: 1.0
163
+ decay_mult: 1.0
164
+ }
165
+ param {
166
+ lr_mult: 2.0
167
+ decay_mult: 1.0
168
+ }
169
+ scale_param {
170
+ bias_term: true
171
+ }
172
+ }
173
+ layer {
174
+ name: "layer_64_1_relu2"
175
+ type: "ReLU"
176
+ bottom: "layer_64_1_conv1_h"
177
+ top: "layer_64_1_conv1_h"
178
+ }
179
+ layer {
180
+ name: "layer_64_1_conv2_h"
181
+ type: "Convolution"
182
+ bottom: "layer_64_1_conv1_h"
183
+ top: "layer_64_1_conv2_h"
184
+ param {
185
+ lr_mult: 1.0
186
+ decay_mult: 1.0
187
+ }
188
+ convolution_param {
189
+ num_output: 32
190
+ bias_term: false
191
+ pad: 1
192
+ kernel_size: 3
193
+ stride: 1
194
+ weight_filler {
195
+ type: "msra"
196
+ }
197
+ bias_filler {
198
+ type: "constant"
199
+ value: 0.0
200
+ }
201
+ }
202
+ }
203
+ layer {
204
+ name: "layer_64_1_sum"
205
+ type: "Eltwise"
206
+ bottom: "layer_64_1_conv2_h"
207
+ bottom: "conv1_pool"
208
+ top: "layer_64_1_sum"
209
+ }
210
+ layer {
211
+ name: "layer_128_1_bn1_h"
212
+ type: "BatchNorm"
213
+ bottom: "layer_64_1_sum"
214
+ top: "layer_128_1_bn1_h"
215
+ param {
216
+ lr_mult: 0.0
217
+ }
218
+ param {
219
+ lr_mult: 0.0
220
+ }
221
+ param {
222
+ lr_mult: 0.0
223
+ }
224
+ }
225
+ layer {
226
+ name: "layer_128_1_scale1_h"
227
+ type: "Scale"
228
+ bottom: "layer_128_1_bn1_h"
229
+ top: "layer_128_1_bn1_h"
230
+ param {
231
+ lr_mult: 1.0
232
+ decay_mult: 1.0
233
+ }
234
+ param {
235
+ lr_mult: 2.0
236
+ decay_mult: 1.0
237
+ }
238
+ scale_param {
239
+ bias_term: true
240
+ }
241
+ }
242
+ layer {
243
+ name: "layer_128_1_relu1"
244
+ type: "ReLU"
245
+ bottom: "layer_128_1_bn1_h"
246
+ top: "layer_128_1_bn1_h"
247
+ }
248
+ layer {
249
+ name: "layer_128_1_conv1_h"
250
+ type: "Convolution"
251
+ bottom: "layer_128_1_bn1_h"
252
+ top: "layer_128_1_conv1_h"
253
+ param {
254
+ lr_mult: 1.0
255
+ decay_mult: 1.0
256
+ }
257
+ convolution_param {
258
+ num_output: 128
259
+ bias_term: false
260
+ pad: 1
261
+ kernel_size: 3
262
+ stride: 2
263
+ weight_filler {
264
+ type: "msra"
265
+ }
266
+ bias_filler {
267
+ type: "constant"
268
+ value: 0.0
269
+ }
270
+ }
271
+ }
272
+ layer {
273
+ name: "layer_128_1_bn2"
274
+ type: "BatchNorm"
275
+ bottom: "layer_128_1_conv1_h"
276
+ top: "layer_128_1_conv1_h"
277
+ param {
278
+ lr_mult: 0.0
279
+ }
280
+ param {
281
+ lr_mult: 0.0
282
+ }
283
+ param {
284
+ lr_mult: 0.0
285
+ }
286
+ }
287
+ layer {
288
+ name: "layer_128_1_scale2"
289
+ type: "Scale"
290
+ bottom: "layer_128_1_conv1_h"
291
+ top: "layer_128_1_conv1_h"
292
+ param {
293
+ lr_mult: 1.0
294
+ decay_mult: 1.0
295
+ }
296
+ param {
297
+ lr_mult: 2.0
298
+ decay_mult: 1.0
299
+ }
300
+ scale_param {
301
+ bias_term: true
302
+ }
303
+ }
304
+ layer {
305
+ name: "layer_128_1_relu2"
306
+ type: "ReLU"
307
+ bottom: "layer_128_1_conv1_h"
308
+ top: "layer_128_1_conv1_h"
309
+ }
310
+ layer {
311
+ name: "layer_128_1_conv2"
312
+ type: "Convolution"
313
+ bottom: "layer_128_1_conv1_h"
314
+ top: "layer_128_1_conv2"
315
+ param {
316
+ lr_mult: 1.0
317
+ decay_mult: 1.0
318
+ }
319
+ convolution_param {
320
+ num_output: 128
321
+ bias_term: false
322
+ pad: 1
323
+ kernel_size: 3
324
+ stride: 1
325
+ weight_filler {
326
+ type: "msra"
327
+ }
328
+ bias_filler {
329
+ type: "constant"
330
+ value: 0.0
331
+ }
332
+ }
333
+ }
334
+ layer {
335
+ name: "layer_128_1_conv_expand_h"
336
+ type: "Convolution"
337
+ bottom: "layer_128_1_bn1_h"
338
+ top: "layer_128_1_conv_expand_h"
339
+ param {
340
+ lr_mult: 1.0
341
+ decay_mult: 1.0
342
+ }
343
+ convolution_param {
344
+ num_output: 128
345
+ bias_term: false
346
+ pad: 0
347
+ kernel_size: 1
348
+ stride: 2
349
+ weight_filler {
350
+ type: "msra"
351
+ }
352
+ bias_filler {
353
+ type: "constant"
354
+ value: 0.0
355
+ }
356
+ }
357
+ }
358
+ layer {
359
+ name: "layer_128_1_sum"
360
+ type: "Eltwise"
361
+ bottom: "layer_128_1_conv2"
362
+ bottom: "layer_128_1_conv_expand_h"
363
+ top: "layer_128_1_sum"
364
+ }
365
+ layer {
366
+ name: "layer_256_1_bn1"
367
+ type: "BatchNorm"
368
+ bottom: "layer_128_1_sum"
369
+ top: "layer_256_1_bn1"
370
+ param {
371
+ lr_mult: 0.0
372
+ }
373
+ param {
374
+ lr_mult: 0.0
375
+ }
376
+ param {
377
+ lr_mult: 0.0
378
+ }
379
+ }
380
+ layer {
381
+ name: "layer_256_1_scale1"
382
+ type: "Scale"
383
+ bottom: "layer_256_1_bn1"
384
+ top: "layer_256_1_bn1"
385
+ param {
386
+ lr_mult: 1.0
387
+ decay_mult: 1.0
388
+ }
389
+ param {
390
+ lr_mult: 2.0
391
+ decay_mult: 1.0
392
+ }
393
+ scale_param {
394
+ bias_term: true
395
+ }
396
+ }
397
+ layer {
398
+ name: "layer_256_1_relu1"
399
+ type: "ReLU"
400
+ bottom: "layer_256_1_bn1"
401
+ top: "layer_256_1_bn1"
402
+ }
403
+ layer {
404
+ name: "layer_256_1_conv1"
405
+ type: "Convolution"
406
+ bottom: "layer_256_1_bn1"
407
+ top: "layer_256_1_conv1"
408
+ param {
409
+ lr_mult: 1.0
410
+ decay_mult: 1.0
411
+ }
412
+ convolution_param {
413
+ num_output: 256
414
+ bias_term: false
415
+ pad: 1
416
+ kernel_size: 3
417
+ stride: 2
418
+ weight_filler {
419
+ type: "msra"
420
+ }
421
+ bias_filler {
422
+ type: "constant"
423
+ value: 0.0
424
+ }
425
+ }
426
+ }
427
+ layer {
428
+ name: "layer_256_1_bn2"
429
+ type: "BatchNorm"
430
+ bottom: "layer_256_1_conv1"
431
+ top: "layer_256_1_conv1"
432
+ param {
433
+ lr_mult: 0.0
434
+ }
435
+ param {
436
+ lr_mult: 0.0
437
+ }
438
+ param {
439
+ lr_mult: 0.0
440
+ }
441
+ }
442
+ layer {
443
+ name: "layer_256_1_scale2"
444
+ type: "Scale"
445
+ bottom: "layer_256_1_conv1"
446
+ top: "layer_256_1_conv1"
447
+ param {
448
+ lr_mult: 1.0
449
+ decay_mult: 1.0
450
+ }
451
+ param {
452
+ lr_mult: 2.0
453
+ decay_mult: 1.0
454
+ }
455
+ scale_param {
456
+ bias_term: true
457
+ }
458
+ }
459
+ layer {
460
+ name: "layer_256_1_relu2"
461
+ type: "ReLU"
462
+ bottom: "layer_256_1_conv1"
463
+ top: "layer_256_1_conv1"
464
+ }
465
+ layer {
466
+ name: "layer_256_1_conv2"
467
+ type: "Convolution"
468
+ bottom: "layer_256_1_conv1"
469
+ top: "layer_256_1_conv2"
470
+ param {
471
+ lr_mult: 1.0
472
+ decay_mult: 1.0
473
+ }
474
+ convolution_param {
475
+ num_output: 256
476
+ bias_term: false
477
+ pad: 1
478
+ kernel_size: 3
479
+ stride: 1
480
+ weight_filler {
481
+ type: "msra"
482
+ }
483
+ bias_filler {
484
+ type: "constant"
485
+ value: 0.0
486
+ }
487
+ }
488
+ }
489
+ layer {
490
+ name: "layer_256_1_conv_expand"
491
+ type: "Convolution"
492
+ bottom: "layer_256_1_bn1"
493
+ top: "layer_256_1_conv_expand"
494
+ param {
495
+ lr_mult: 1.0
496
+ decay_mult: 1.0
497
+ }
498
+ convolution_param {
499
+ num_output: 256
500
+ bias_term: false
501
+ pad: 0
502
+ kernel_size: 1
503
+ stride: 2
504
+ weight_filler {
505
+ type: "msra"
506
+ }
507
+ bias_filler {
508
+ type: "constant"
509
+ value: 0.0
510
+ }
511
+ }
512
+ }
513
+ layer {
514
+ name: "layer_256_1_sum"
515
+ type: "Eltwise"
516
+ bottom: "layer_256_1_conv2"
517
+ bottom: "layer_256_1_conv_expand"
518
+ top: "layer_256_1_sum"
519
+ }
520
+ layer {
521
+ name: "layer_512_1_bn1"
522
+ type: "BatchNorm"
523
+ bottom: "layer_256_1_sum"
524
+ top: "layer_512_1_bn1"
525
+ param {
526
+ lr_mult: 0.0
527
+ }
528
+ param {
529
+ lr_mult: 0.0
530
+ }
531
+ param {
532
+ lr_mult: 0.0
533
+ }
534
+ }
535
+ layer {
536
+ name: "layer_512_1_scale1"
537
+ type: "Scale"
538
+ bottom: "layer_512_1_bn1"
539
+ top: "layer_512_1_bn1"
540
+ param {
541
+ lr_mult: 1.0
542
+ decay_mult: 1.0
543
+ }
544
+ param {
545
+ lr_mult: 2.0
546
+ decay_mult: 1.0
547
+ }
548
+ scale_param {
549
+ bias_term: true
550
+ }
551
+ }
552
+ layer {
553
+ name: "layer_512_1_relu1"
554
+ type: "ReLU"
555
+ bottom: "layer_512_1_bn1"
556
+ top: "layer_512_1_bn1"
557
+ }
558
+ layer {
559
+ name: "layer_512_1_conv1_h"
560
+ type: "Convolution"
561
+ bottom: "layer_512_1_bn1"
562
+ top: "layer_512_1_conv1_h"
563
+ param {
564
+ lr_mult: 1.0
565
+ decay_mult: 1.0
566
+ }
567
+ convolution_param {
568
+ num_output: 128
569
+ bias_term: false
570
+ pad: 1
571
+ kernel_size: 3
572
+ stride: 1 # 2
573
+ weight_filler {
574
+ type: "msra"
575
+ }
576
+ bias_filler {
577
+ type: "constant"
578
+ value: 0.0
579
+ }
580
+ }
581
+ }
582
+ layer {
583
+ name: "layer_512_1_bn2_h"
584
+ type: "BatchNorm"
585
+ bottom: "layer_512_1_conv1_h"
586
+ top: "layer_512_1_conv1_h"
587
+ param {
588
+ lr_mult: 0.0
589
+ }
590
+ param {
591
+ lr_mult: 0.0
592
+ }
593
+ param {
594
+ lr_mult: 0.0
595
+ }
596
+ }
597
+ layer {
598
+ name: "layer_512_1_scale2_h"
599
+ type: "Scale"
600
+ bottom: "layer_512_1_conv1_h"
601
+ top: "layer_512_1_conv1_h"
602
+ param {
603
+ lr_mult: 1.0
604
+ decay_mult: 1.0
605
+ }
606
+ param {
607
+ lr_mult: 2.0
608
+ decay_mult: 1.0
609
+ }
610
+ scale_param {
611
+ bias_term: true
612
+ }
613
+ }
614
+ layer {
615
+ name: "layer_512_1_relu2"
616
+ type: "ReLU"
617
+ bottom: "layer_512_1_conv1_h"
618
+ top: "layer_512_1_conv1_h"
619
+ }
620
+ layer {
621
+ name: "layer_512_1_conv2_h"
622
+ type: "Convolution"
623
+ bottom: "layer_512_1_conv1_h"
624
+ top: "layer_512_1_conv2_h"
625
+ param {
626
+ lr_mult: 1.0
627
+ decay_mult: 1.0
628
+ }
629
+ convolution_param {
630
+ num_output: 256
631
+ bias_term: false
632
+ pad: 2 # 1
633
+ kernel_size: 3
634
+ stride: 1
635
+ dilation: 2
636
+ weight_filler {
637
+ type: "msra"
638
+ }
639
+ bias_filler {
640
+ type: "constant"
641
+ value: 0.0
642
+ }
643
+ }
644
+ }
645
+ layer {
646
+ name: "layer_512_1_conv_expand_h"
647
+ type: "Convolution"
648
+ bottom: "layer_512_1_bn1"
649
+ top: "layer_512_1_conv_expand_h"
650
+ param {
651
+ lr_mult: 1.0
652
+ decay_mult: 1.0
653
+ }
654
+ convolution_param {
655
+ num_output: 256
656
+ bias_term: false
657
+ pad: 0
658
+ kernel_size: 1
659
+ stride: 1 # 2
660
+ weight_filler {
661
+ type: "msra"
662
+ }
663
+ bias_filler {
664
+ type: "constant"
665
+ value: 0.0
666
+ }
667
+ }
668
+ }
669
+ layer {
670
+ name: "layer_512_1_sum"
671
+ type: "Eltwise"
672
+ bottom: "layer_512_1_conv2_h"
673
+ bottom: "layer_512_1_conv_expand_h"
674
+ top: "layer_512_1_sum"
675
+ }
676
+ layer {
677
+ name: "last_bn_h"
678
+ type: "BatchNorm"
679
+ bottom: "layer_512_1_sum"
680
+ top: "layer_512_1_sum"
681
+ param {
682
+ lr_mult: 0.0
683
+ }
684
+ param {
685
+ lr_mult: 0.0
686
+ }
687
+ param {
688
+ lr_mult: 0.0
689
+ }
690
+ }
691
+ layer {
692
+ name: "last_scale_h"
693
+ type: "Scale"
694
+ bottom: "layer_512_1_sum"
695
+ top: "layer_512_1_sum"
696
+ param {
697
+ lr_mult: 1.0
698
+ decay_mult: 1.0
699
+ }
700
+ param {
701
+ lr_mult: 2.0
702
+ decay_mult: 1.0
703
+ }
704
+ scale_param {
705
+ bias_term: true
706
+ }
707
+ }
708
+ layer {
709
+ name: "last_relu"
710
+ type: "ReLU"
711
+ bottom: "layer_512_1_sum"
712
+ top: "fc7"
713
+ }
714
+
715
+ layer {
716
+ name: "conv6_1_h"
717
+ type: "Convolution"
718
+ bottom: "fc7"
719
+ top: "conv6_1_h"
720
+ param {
721
+ lr_mult: 1
722
+ decay_mult: 1
723
+ }
724
+ param {
725
+ lr_mult: 2
726
+ decay_mult: 0
727
+ }
728
+ convolution_param {
729
+ num_output: 128
730
+ pad: 0
731
+ kernel_size: 1
732
+ stride: 1
733
+ weight_filler {
734
+ type: "xavier"
735
+ }
736
+ bias_filler {
737
+ type: "constant"
738
+ value: 0
739
+ }
740
+ }
741
+ }
742
+ layer {
743
+ name: "conv6_1_relu"
744
+ type: "ReLU"
745
+ bottom: "conv6_1_h"
746
+ top: "conv6_1_h"
747
+ }
748
+ layer {
749
+ name: "conv6_2_h"
750
+ type: "Convolution"
751
+ bottom: "conv6_1_h"
752
+ top: "conv6_2_h"
753
+ param {
754
+ lr_mult: 1
755
+ decay_mult: 1
756
+ }
757
+ param {
758
+ lr_mult: 2
759
+ decay_mult: 0
760
+ }
761
+ convolution_param {
762
+ num_output: 256
763
+ pad: 1
764
+ kernel_size: 3
765
+ stride: 2
766
+ weight_filler {
767
+ type: "xavier"
768
+ }
769
+ bias_filler {
770
+ type: "constant"
771
+ value: 0
772
+ }
773
+ }
774
+ }
775
+ layer {
776
+ name: "conv6_2_relu"
777
+ type: "ReLU"
778
+ bottom: "conv6_2_h"
779
+ top: "conv6_2_h"
780
+ }
781
+ layer {
782
+ name: "conv7_1_h"
783
+ type: "Convolution"
784
+ bottom: "conv6_2_h"
785
+ top: "conv7_1_h"
786
+ param {
787
+ lr_mult: 1
788
+ decay_mult: 1
789
+ }
790
+ param {
791
+ lr_mult: 2
792
+ decay_mult: 0
793
+ }
794
+ convolution_param {
795
+ num_output: 64
796
+ pad: 0
797
+ kernel_size: 1
798
+ stride: 1
799
+ weight_filler {
800
+ type: "xavier"
801
+ }
802
+ bias_filler {
803
+ type: "constant"
804
+ value: 0
805
+ }
806
+ }
807
+ }
808
+ layer {
809
+ name: "conv7_1_relu"
810
+ type: "ReLU"
811
+ bottom: "conv7_1_h"
812
+ top: "conv7_1_h"
813
+ }
814
+ layer {
815
+ name: "conv7_2_h"
816
+ type: "Convolution"
817
+ bottom: "conv7_1_h"
818
+ top: "conv7_2_h"
819
+ param {
820
+ lr_mult: 1
821
+ decay_mult: 1
822
+ }
823
+ param {
824
+ lr_mult: 2
825
+ decay_mult: 0
826
+ }
827
+ convolution_param {
828
+ num_output: 128
829
+ pad: 1
830
+ kernel_size: 3
831
+ stride: 2
832
+ weight_filler {
833
+ type: "xavier"
834
+ }
835
+ bias_filler {
836
+ type: "constant"
837
+ value: 0
838
+ }
839
+ }
840
+ }
841
+ layer {
842
+ name: "conv7_2_relu"
843
+ type: "ReLU"
844
+ bottom: "conv7_2_h"
845
+ top: "conv7_2_h"
846
+ }
847
+ layer {
848
+ name: "conv8_1_h"
849
+ type: "Convolution"
850
+ bottom: "conv7_2_h"
851
+ top: "conv8_1_h"
852
+ param {
853
+ lr_mult: 1
854
+ decay_mult: 1
855
+ }
856
+ param {
857
+ lr_mult: 2
858
+ decay_mult: 0
859
+ }
860
+ convolution_param {
861
+ num_output: 64
862
+ pad: 0
863
+ kernel_size: 1
864
+ stride: 1
865
+ weight_filler {
866
+ type: "xavier"
867
+ }
868
+ bias_filler {
869
+ type: "constant"
870
+ value: 0
871
+ }
872
+ }
873
+ }
874
+ layer {
875
+ name: "conv8_1_relu"
876
+ type: "ReLU"
877
+ bottom: "conv8_1_h"
878
+ top: "conv8_1_h"
879
+ }
880
+ layer {
881
+ name: "conv8_2_h"
882
+ type: "Convolution"
883
+ bottom: "conv8_1_h"
884
+ top: "conv8_2_h"
885
+ param {
886
+ lr_mult: 1
887
+ decay_mult: 1
888
+ }
889
+ param {
890
+ lr_mult: 2
891
+ decay_mult: 0
892
+ }
893
+ convolution_param {
894
+ num_output: 128
895
+ pad: 1
896
+ kernel_size: 3
897
+ stride: 1
898
+ weight_filler {
899
+ type: "xavier"
900
+ }
901
+ bias_filler {
902
+ type: "constant"
903
+ value: 0
904
+ }
905
+ }
906
+ }
907
+ layer {
908
+ name: "conv8_2_relu"
909
+ type: "ReLU"
910
+ bottom: "conv8_2_h"
911
+ top: "conv8_2_h"
912
+ }
913
+ layer {
914
+ name: "conv9_1_h"
915
+ type: "Convolution"
916
+ bottom: "conv8_2_h"
917
+ top: "conv9_1_h"
918
+ param {
919
+ lr_mult: 1
920
+ decay_mult: 1
921
+ }
922
+ param {
923
+ lr_mult: 2
924
+ decay_mult: 0
925
+ }
926
+ convolution_param {
927
+ num_output: 64
928
+ pad: 0
929
+ kernel_size: 1
930
+ stride: 1
931
+ weight_filler {
932
+ type: "xavier"
933
+ }
934
+ bias_filler {
935
+ type: "constant"
936
+ value: 0
937
+ }
938
+ }
939
+ }
940
+ layer {
941
+ name: "conv9_1_relu"
942
+ type: "ReLU"
943
+ bottom: "conv9_1_h"
944
+ top: "conv9_1_h"
945
+ }
946
+ layer {
947
+ name: "conv9_2_h"
948
+ type: "Convolution"
949
+ bottom: "conv9_1_h"
950
+ top: "conv9_2_h"
951
+ param {
952
+ lr_mult: 1
953
+ decay_mult: 1
954
+ }
955
+ param {
956
+ lr_mult: 2
957
+ decay_mult: 0
958
+ }
959
+ convolution_param {
960
+ num_output: 128
961
+ pad: 1
962
+ kernel_size: 3
963
+ stride: 1
964
+ weight_filler {
965
+ type: "xavier"
966
+ }
967
+ bias_filler {
968
+ type: "constant"
969
+ value: 0
970
+ }
971
+ }
972
+ }
973
+ layer {
974
+ name: "conv9_2_relu"
975
+ type: "ReLU"
976
+ bottom: "conv9_2_h"
977
+ top: "conv9_2_h"
978
+ }
979
+ layer {
980
+ name: "conv4_3_norm"
981
+ type: "Normalize"
982
+ bottom: "layer_256_1_bn1"
983
+ top: "conv4_3_norm"
984
+ norm_param {
985
+ across_spatial: false
986
+ scale_filler {
987
+ type: "constant"
988
+ value: 20
989
+ }
990
+ channel_shared: false
991
+ }
992
+ }
993
+ layer {
994
+ name: "conv4_3_norm_mbox_loc"
995
+ type: "Convolution"
996
+ bottom: "conv4_3_norm"
997
+ top: "conv4_3_norm_mbox_loc"
998
+ param {
999
+ lr_mult: 1
1000
+ decay_mult: 1
1001
+ }
1002
+ param {
1003
+ lr_mult: 2
1004
+ decay_mult: 0
1005
+ }
1006
+ convolution_param {
1007
+ num_output: 16
1008
+ pad: 1
1009
+ kernel_size: 3
1010
+ stride: 1
1011
+ weight_filler {
1012
+ type: "xavier"
1013
+ }
1014
+ bias_filler {
1015
+ type: "constant"
1016
+ value: 0
1017
+ }
1018
+ }
1019
+ }
1020
+ layer {
1021
+ name: "conv4_3_norm_mbox_loc_perm"
1022
+ type: "Permute"
1023
+ bottom: "conv4_3_norm_mbox_loc"
1024
+ top: "conv4_3_norm_mbox_loc_perm"
1025
+ permute_param {
1026
+ order: 0
1027
+ order: 2
1028
+ order: 3
1029
+ order: 1
1030
+ }
1031
+ }
1032
+ layer {
1033
+ name: "conv4_3_norm_mbox_loc_flat"
1034
+ type: "Flatten"
1035
+ bottom: "conv4_3_norm_mbox_loc_perm"
1036
+ top: "conv4_3_norm_mbox_loc_flat"
1037
+ flatten_param {
1038
+ axis: 1
1039
+ }
1040
+ }
1041
+ layer {
1042
+ name: "conv4_3_norm_mbox_conf"
1043
+ type: "Convolution"
1044
+ bottom: "conv4_3_norm"
1045
+ top: "conv4_3_norm_mbox_conf"
1046
+ param {
1047
+ lr_mult: 1
1048
+ decay_mult: 1
1049
+ }
1050
+ param {
1051
+ lr_mult: 2
1052
+ decay_mult: 0
1053
+ }
1054
+ convolution_param {
1055
+ num_output: 8 # 84
1056
+ pad: 1
1057
+ kernel_size: 3
1058
+ stride: 1
1059
+ weight_filler {
1060
+ type: "xavier"
1061
+ }
1062
+ bias_filler {
1063
+ type: "constant"
1064
+ value: 0
1065
+ }
1066
+ }
1067
+ }
1068
+ layer {
1069
+ name: "conv4_3_norm_mbox_conf_perm"
1070
+ type: "Permute"
1071
+ bottom: "conv4_3_norm_mbox_conf"
1072
+ top: "conv4_3_norm_mbox_conf_perm"
1073
+ permute_param {
1074
+ order: 0
1075
+ order: 2
1076
+ order: 3
1077
+ order: 1
1078
+ }
1079
+ }
1080
+ layer {
1081
+ name: "conv4_3_norm_mbox_conf_flat"
1082
+ type: "Flatten"
1083
+ bottom: "conv4_3_norm_mbox_conf_perm"
1084
+ top: "conv4_3_norm_mbox_conf_flat"
1085
+ flatten_param {
1086
+ axis: 1
1087
+ }
1088
+ }
1089
+ layer {
1090
+ name: "conv4_3_norm_mbox_priorbox"
1091
+ type: "PriorBox"
1092
+ bottom: "conv4_3_norm"
1093
+ bottom: "data"
1094
+ top: "conv4_3_norm_mbox_priorbox"
1095
+ prior_box_param {
1096
+ min_size: 30.0
1097
+ max_size: 60.0
1098
+ aspect_ratio: 2
1099
+ flip: true
1100
+ clip: false
1101
+ variance: 0.1
1102
+ variance: 0.1
1103
+ variance: 0.2
1104
+ variance: 0.2
1105
+ step: 8
1106
+ offset: 0.5
1107
+ }
1108
+ }
1109
+ layer {
1110
+ name: "fc7_mbox_loc"
1111
+ type: "Convolution"
1112
+ bottom: "fc7"
1113
+ top: "fc7_mbox_loc"
1114
+ param {
1115
+ lr_mult: 1
1116
+ decay_mult: 1
1117
+ }
1118
+ param {
1119
+ lr_mult: 2
1120
+ decay_mult: 0
1121
+ }
1122
+ convolution_param {
1123
+ num_output: 24
1124
+ pad: 1
1125
+ kernel_size: 3
1126
+ stride: 1
1127
+ weight_filler {
1128
+ type: "xavier"
1129
+ }
1130
+ bias_filler {
1131
+ type: "constant"
1132
+ value: 0
1133
+ }
1134
+ }
1135
+ }
1136
+ layer {
1137
+ name: "fc7_mbox_loc_perm"
1138
+ type: "Permute"
1139
+ bottom: "fc7_mbox_loc"
1140
+ top: "fc7_mbox_loc_perm"
1141
+ permute_param {
1142
+ order: 0
1143
+ order: 2
1144
+ order: 3
1145
+ order: 1
1146
+ }
1147
+ }
1148
+ layer {
1149
+ name: "fc7_mbox_loc_flat"
1150
+ type: "Flatten"
1151
+ bottom: "fc7_mbox_loc_perm"
1152
+ top: "fc7_mbox_loc_flat"
1153
+ flatten_param {
1154
+ axis: 1
1155
+ }
1156
+ }
1157
+ layer {
1158
+ name: "fc7_mbox_conf"
1159
+ type: "Convolution"
1160
+ bottom: "fc7"
1161
+ top: "fc7_mbox_conf"
1162
+ param {
1163
+ lr_mult: 1
1164
+ decay_mult: 1
1165
+ }
1166
+ param {
1167
+ lr_mult: 2
1168
+ decay_mult: 0
1169
+ }
1170
+ convolution_param {
1171
+ num_output: 12 # 126
1172
+ pad: 1
1173
+ kernel_size: 3
1174
+ stride: 1
1175
+ weight_filler {
1176
+ type: "xavier"
1177
+ }
1178
+ bias_filler {
1179
+ type: "constant"
1180
+ value: 0
1181
+ }
1182
+ }
1183
+ }
1184
+ layer {
1185
+ name: "fc7_mbox_conf_perm"
1186
+ type: "Permute"
1187
+ bottom: "fc7_mbox_conf"
1188
+ top: "fc7_mbox_conf_perm"
1189
+ permute_param {
1190
+ order: 0
1191
+ order: 2
1192
+ order: 3
1193
+ order: 1
1194
+ }
1195
+ }
1196
+ layer {
1197
+ name: "fc7_mbox_conf_flat"
1198
+ type: "Flatten"
1199
+ bottom: "fc7_mbox_conf_perm"
1200
+ top: "fc7_mbox_conf_flat"
1201
+ flatten_param {
1202
+ axis: 1
1203
+ }
1204
+ }
1205
+ layer {
1206
+ name: "fc7_mbox_priorbox"
1207
+ type: "PriorBox"
1208
+ bottom: "fc7"
1209
+ bottom: "data"
1210
+ top: "fc7_mbox_priorbox"
1211
+ prior_box_param {
1212
+ min_size: 60.0
1213
+ max_size: 111.0
1214
+ aspect_ratio: 2
1215
+ aspect_ratio: 3
1216
+ flip: true
1217
+ clip: false
1218
+ variance: 0.1
1219
+ variance: 0.1
1220
+ variance: 0.2
1221
+ variance: 0.2
1222
+ step: 16
1223
+ offset: 0.5
1224
+ }
1225
+ }
1226
+ layer {
1227
+ name: "conv6_2_mbox_loc"
1228
+ type: "Convolution"
1229
+ bottom: "conv6_2_h"
1230
+ top: "conv6_2_mbox_loc"
1231
+ param {
1232
+ lr_mult: 1
1233
+ decay_mult: 1
1234
+ }
1235
+ param {
1236
+ lr_mult: 2
1237
+ decay_mult: 0
1238
+ }
1239
+ convolution_param {
1240
+ num_output: 24
1241
+ pad: 1
1242
+ kernel_size: 3
1243
+ stride: 1
1244
+ weight_filler {
1245
+ type: "xavier"
1246
+ }
1247
+ bias_filler {
1248
+ type: "constant"
1249
+ value: 0
1250
+ }
1251
+ }
1252
+ }
1253
+ layer {
1254
+ name: "conv6_2_mbox_loc_perm"
1255
+ type: "Permute"
1256
+ bottom: "conv6_2_mbox_loc"
1257
+ top: "conv6_2_mbox_loc_perm"
1258
+ permute_param {
1259
+ order: 0
1260
+ order: 2
1261
+ order: 3
1262
+ order: 1
1263
+ }
1264
+ }
1265
+ layer {
1266
+ name: "conv6_2_mbox_loc_flat"
1267
+ type: "Flatten"
1268
+ bottom: "conv6_2_mbox_loc_perm"
1269
+ top: "conv6_2_mbox_loc_flat"
1270
+ flatten_param {
1271
+ axis: 1
1272
+ }
1273
+ }
1274
+ layer {
1275
+ name: "conv6_2_mbox_conf"
1276
+ type: "Convolution"
1277
+ bottom: "conv6_2_h"
1278
+ top: "conv6_2_mbox_conf"
1279
+ param {
1280
+ lr_mult: 1
1281
+ decay_mult: 1
1282
+ }
1283
+ param {
1284
+ lr_mult: 2
1285
+ decay_mult: 0
1286
+ }
1287
+ convolution_param {
1288
+ num_output: 12 # 126
1289
+ pad: 1
1290
+ kernel_size: 3
1291
+ stride: 1
1292
+ weight_filler {
1293
+ type: "xavier"
1294
+ }
1295
+ bias_filler {
1296
+ type: "constant"
1297
+ value: 0
1298
+ }
1299
+ }
1300
+ }
1301
+ layer {
1302
+ name: "conv6_2_mbox_conf_perm"
1303
+ type: "Permute"
1304
+ bottom: "conv6_2_mbox_conf"
1305
+ top: "conv6_2_mbox_conf_perm"
1306
+ permute_param {
1307
+ order: 0
1308
+ order: 2
1309
+ order: 3
1310
+ order: 1
1311
+ }
1312
+ }
1313
+ layer {
1314
+ name: "conv6_2_mbox_conf_flat"
1315
+ type: "Flatten"
1316
+ bottom: "conv6_2_mbox_conf_perm"
1317
+ top: "conv6_2_mbox_conf_flat"
1318
+ flatten_param {
1319
+ axis: 1
1320
+ }
1321
+ }
1322
+ layer {
1323
+ name: "conv6_2_mbox_priorbox"
1324
+ type: "PriorBox"
1325
+ bottom: "conv6_2_h"
1326
+ bottom: "data"
1327
+ top: "conv6_2_mbox_priorbox"
1328
+ prior_box_param {
1329
+ min_size: 111.0
1330
+ max_size: 162.0
1331
+ aspect_ratio: 2
1332
+ aspect_ratio: 3
1333
+ flip: true
1334
+ clip: false
1335
+ variance: 0.1
1336
+ variance: 0.1
1337
+ variance: 0.2
1338
+ variance: 0.2
1339
+ step: 32
1340
+ offset: 0.5
1341
+ }
1342
+ }
1343
+ layer {
1344
+ name: "conv7_2_mbox_loc"
1345
+ type: "Convolution"
1346
+ bottom: "conv7_2_h"
1347
+ top: "conv7_2_mbox_loc"
1348
+ param {
1349
+ lr_mult: 1
1350
+ decay_mult: 1
1351
+ }
1352
+ param {
1353
+ lr_mult: 2
1354
+ decay_mult: 0
1355
+ }
1356
+ convolution_param {
1357
+ num_output: 24
1358
+ pad: 1
1359
+ kernel_size: 3
1360
+ stride: 1
1361
+ weight_filler {
1362
+ type: "xavier"
1363
+ }
1364
+ bias_filler {
1365
+ type: "constant"
1366
+ value: 0
1367
+ }
1368
+ }
1369
+ }
1370
+ layer {
1371
+ name: "conv7_2_mbox_loc_perm"
1372
+ type: "Permute"
1373
+ bottom: "conv7_2_mbox_loc"
1374
+ top: "conv7_2_mbox_loc_perm"
1375
+ permute_param {
1376
+ order: 0
1377
+ order: 2
1378
+ order: 3
1379
+ order: 1
1380
+ }
1381
+ }
1382
+ layer {
1383
+ name: "conv7_2_mbox_loc_flat"
1384
+ type: "Flatten"
1385
+ bottom: "conv7_2_mbox_loc_perm"
1386
+ top: "conv7_2_mbox_loc_flat"
1387
+ flatten_param {
1388
+ axis: 1
1389
+ }
1390
+ }
1391
+ layer {
1392
+ name: "conv7_2_mbox_conf"
1393
+ type: "Convolution"
1394
+ bottom: "conv7_2_h"
1395
+ top: "conv7_2_mbox_conf"
1396
+ param {
1397
+ lr_mult: 1
1398
+ decay_mult: 1
1399
+ }
1400
+ param {
1401
+ lr_mult: 2
1402
+ decay_mult: 0
1403
+ }
1404
+ convolution_param {
1405
+ num_output: 12 # 126
1406
+ pad: 1
1407
+ kernel_size: 3
1408
+ stride: 1
1409
+ weight_filler {
1410
+ type: "xavier"
1411
+ }
1412
+ bias_filler {
1413
+ type: "constant"
1414
+ value: 0
1415
+ }
1416
+ }
1417
+ }
1418
+ layer {
1419
+ name: "conv7_2_mbox_conf_perm"
1420
+ type: "Permute"
1421
+ bottom: "conv7_2_mbox_conf"
1422
+ top: "conv7_2_mbox_conf_perm"
1423
+ permute_param {
1424
+ order: 0
1425
+ order: 2
1426
+ order: 3
1427
+ order: 1
1428
+ }
1429
+ }
1430
+ layer {
1431
+ name: "conv7_2_mbox_conf_flat"
1432
+ type: "Flatten"
1433
+ bottom: "conv7_2_mbox_conf_perm"
1434
+ top: "conv7_2_mbox_conf_flat"
1435
+ flatten_param {
1436
+ axis: 1
1437
+ }
1438
+ }
1439
+ layer {
1440
+ name: "conv7_2_mbox_priorbox"
1441
+ type: "PriorBox"
1442
+ bottom: "conv7_2_h"
1443
+ bottom: "data"
1444
+ top: "conv7_2_mbox_priorbox"
1445
+ prior_box_param {
1446
+ min_size: 162.0
1447
+ max_size: 213.0
1448
+ aspect_ratio: 2
1449
+ aspect_ratio: 3
1450
+ flip: true
1451
+ clip: false
1452
+ variance: 0.1
1453
+ variance: 0.1
1454
+ variance: 0.2
1455
+ variance: 0.2
1456
+ step: 64
1457
+ offset: 0.5
1458
+ }
1459
+ }
1460
+ layer {
1461
+ name: "conv8_2_mbox_loc"
1462
+ type: "Convolution"
1463
+ bottom: "conv8_2_h"
1464
+ top: "conv8_2_mbox_loc"
1465
+ param {
1466
+ lr_mult: 1
1467
+ decay_mult: 1
1468
+ }
1469
+ param {
1470
+ lr_mult: 2
1471
+ decay_mult: 0
1472
+ }
1473
+ convolution_param {
1474
+ num_output: 16
1475
+ pad: 1
1476
+ kernel_size: 3
1477
+ stride: 1
1478
+ weight_filler {
1479
+ type: "xavier"
1480
+ }
1481
+ bias_filler {
1482
+ type: "constant"
1483
+ value: 0
1484
+ }
1485
+ }
1486
+ }
1487
+ layer {
1488
+ name: "conv8_2_mbox_loc_perm"
1489
+ type: "Permute"
1490
+ bottom: "conv8_2_mbox_loc"
1491
+ top: "conv8_2_mbox_loc_perm"
1492
+ permute_param {
1493
+ order: 0
1494
+ order: 2
1495
+ order: 3
1496
+ order: 1
1497
+ }
1498
+ }
1499
+ layer {
1500
+ name: "conv8_2_mbox_loc_flat"
1501
+ type: "Flatten"
1502
+ bottom: "conv8_2_mbox_loc_perm"
1503
+ top: "conv8_2_mbox_loc_flat"
1504
+ flatten_param {
1505
+ axis: 1
1506
+ }
1507
+ }
1508
+ layer {
1509
+ name: "conv8_2_mbox_conf"
1510
+ type: "Convolution"
1511
+ bottom: "conv8_2_h"
1512
+ top: "conv8_2_mbox_conf"
1513
+ param {
1514
+ lr_mult: 1
1515
+ decay_mult: 1
1516
+ }
1517
+ param {
1518
+ lr_mult: 2
1519
+ decay_mult: 0
1520
+ }
1521
+ convolution_param {
1522
+ num_output: 8 # 84
1523
+ pad: 1
1524
+ kernel_size: 3
1525
+ stride: 1
1526
+ weight_filler {
1527
+ type: "xavier"
1528
+ }
1529
+ bias_filler {
1530
+ type: "constant"
1531
+ value: 0
1532
+ }
1533
+ }
1534
+ }
1535
+ layer {
1536
+ name: "conv8_2_mbox_conf_perm"
1537
+ type: "Permute"
1538
+ bottom: "conv8_2_mbox_conf"
1539
+ top: "conv8_2_mbox_conf_perm"
1540
+ permute_param {
1541
+ order: 0
1542
+ order: 2
1543
+ order: 3
1544
+ order: 1
1545
+ }
1546
+ }
1547
+ layer {
1548
+ name: "conv8_2_mbox_conf_flat"
1549
+ type: "Flatten"
1550
+ bottom: "conv8_2_mbox_conf_perm"
1551
+ top: "conv8_2_mbox_conf_flat"
1552
+ flatten_param {
1553
+ axis: 1
1554
+ }
1555
+ }
1556
+ layer {
1557
+ name: "conv8_2_mbox_priorbox"
1558
+ type: "PriorBox"
1559
+ bottom: "conv8_2_h"
1560
+ bottom: "data"
1561
+ top: "conv8_2_mbox_priorbox"
1562
+ prior_box_param {
1563
+ min_size: 213.0
1564
+ max_size: 264.0
1565
+ aspect_ratio: 2
1566
+ flip: true
1567
+ clip: false
1568
+ variance: 0.1
1569
+ variance: 0.1
1570
+ variance: 0.2
1571
+ variance: 0.2
1572
+ step: 100
1573
+ offset: 0.5
1574
+ }
1575
+ }
1576
+ layer {
1577
+ name: "conv9_2_mbox_loc"
1578
+ type: "Convolution"
1579
+ bottom: "conv9_2_h"
1580
+ top: "conv9_2_mbox_loc"
1581
+ param {
1582
+ lr_mult: 1
1583
+ decay_mult: 1
1584
+ }
1585
+ param {
1586
+ lr_mult: 2
1587
+ decay_mult: 0
1588
+ }
1589
+ convolution_param {
1590
+ num_output: 16
1591
+ pad: 1
1592
+ kernel_size: 3
1593
+ stride: 1
1594
+ weight_filler {
1595
+ type: "xavier"
1596
+ }
1597
+ bias_filler {
1598
+ type: "constant"
1599
+ value: 0
1600
+ }
1601
+ }
1602
+ }
1603
+ layer {
1604
+ name: "conv9_2_mbox_loc_perm"
1605
+ type: "Permute"
1606
+ bottom: "conv9_2_mbox_loc"
1607
+ top: "conv9_2_mbox_loc_perm"
1608
+ permute_param {
1609
+ order: 0
1610
+ order: 2
1611
+ order: 3
1612
+ order: 1
1613
+ }
1614
+ }
1615
+ layer {
1616
+ name: "conv9_2_mbox_loc_flat"
1617
+ type: "Flatten"
1618
+ bottom: "conv9_2_mbox_loc_perm"
1619
+ top: "conv9_2_mbox_loc_flat"
1620
+ flatten_param {
1621
+ axis: 1
1622
+ }
1623
+ }
1624
+ layer {
1625
+ name: "conv9_2_mbox_conf"
1626
+ type: "Convolution"
1627
+ bottom: "conv9_2_h"
1628
+ top: "conv9_2_mbox_conf"
1629
+ param {
1630
+ lr_mult: 1
1631
+ decay_mult: 1
1632
+ }
1633
+ param {
1634
+ lr_mult: 2
1635
+ decay_mult: 0
1636
+ }
1637
+ convolution_param {
1638
+ num_output: 8 # 84
1639
+ pad: 1
1640
+ kernel_size: 3
1641
+ stride: 1
1642
+ weight_filler {
1643
+ type: "xavier"
1644
+ }
1645
+ bias_filler {
1646
+ type: "constant"
1647
+ value: 0
1648
+ }
1649
+ }
1650
+ }
1651
+ layer {
1652
+ name: "conv9_2_mbox_conf_perm"
1653
+ type: "Permute"
1654
+ bottom: "conv9_2_mbox_conf"
1655
+ top: "conv9_2_mbox_conf_perm"
1656
+ permute_param {
1657
+ order: 0
1658
+ order: 2
1659
+ order: 3
1660
+ order: 1
1661
+ }
1662
+ }
1663
+ layer {
1664
+ name: "conv9_2_mbox_conf_flat"
1665
+ type: "Flatten"
1666
+ bottom: "conv9_2_mbox_conf_perm"
1667
+ top: "conv9_2_mbox_conf_flat"
1668
+ flatten_param {
1669
+ axis: 1
1670
+ }
1671
+ }
1672
+ layer {
1673
+ name: "conv9_2_mbox_priorbox"
1674
+ type: "PriorBox"
1675
+ bottom: "conv9_2_h"
1676
+ bottom: "data"
1677
+ top: "conv9_2_mbox_priorbox"
1678
+ prior_box_param {
1679
+ min_size: 264.0
1680
+ max_size: 315.0
1681
+ aspect_ratio: 2
1682
+ flip: true
1683
+ clip: false
1684
+ variance: 0.1
1685
+ variance: 0.1
1686
+ variance: 0.2
1687
+ variance: 0.2
1688
+ step: 300
1689
+ offset: 0.5
1690
+ }
1691
+ }
1692
+ layer {
1693
+ name: "mbox_loc"
1694
+ type: "Concat"
1695
+ bottom: "conv4_3_norm_mbox_loc_flat"
1696
+ bottom: "fc7_mbox_loc_flat"
1697
+ bottom: "conv6_2_mbox_loc_flat"
1698
+ bottom: "conv7_2_mbox_loc_flat"
1699
+ bottom: "conv8_2_mbox_loc_flat"
1700
+ bottom: "conv9_2_mbox_loc_flat"
1701
+ top: "mbox_loc"
1702
+ concat_param {
1703
+ axis: 1
1704
+ }
1705
+ }
1706
+ layer {
1707
+ name: "mbox_conf"
1708
+ type: "Concat"
1709
+ bottom: "conv4_3_norm_mbox_conf_flat"
1710
+ bottom: "fc7_mbox_conf_flat"
1711
+ bottom: "conv6_2_mbox_conf_flat"
1712
+ bottom: "conv7_2_mbox_conf_flat"
1713
+ bottom: "conv8_2_mbox_conf_flat"
1714
+ bottom: "conv9_2_mbox_conf_flat"
1715
+ top: "mbox_conf"
1716
+ concat_param {
1717
+ axis: 1
1718
+ }
1719
+ }
1720
+ layer {
1721
+ name: "mbox_priorbox"
1722
+ type: "Concat"
1723
+ bottom: "conv4_3_norm_mbox_priorbox"
1724
+ bottom: "fc7_mbox_priorbox"
1725
+ bottom: "conv6_2_mbox_priorbox"
1726
+ bottom: "conv7_2_mbox_priorbox"
1727
+ bottom: "conv8_2_mbox_priorbox"
1728
+ bottom: "conv9_2_mbox_priorbox"
1729
+ top: "mbox_priorbox"
1730
+ concat_param {
1731
+ axis: 2
1732
+ }
1733
+ }
1734
+
1735
+ layer {
1736
+ name: "mbox_conf_reshape"
1737
+ type: "Reshape"
1738
+ bottom: "mbox_conf"
1739
+ top: "mbox_conf_reshape"
1740
+ reshape_param {
1741
+ shape {
1742
+ dim: 0
1743
+ dim: -1
1744
+ dim: 2
1745
+ }
1746
+ }
1747
+ }
1748
+ layer {
1749
+ name: "mbox_conf_softmax"
1750
+ type: "Softmax"
1751
+ bottom: "mbox_conf_reshape"
1752
+ top: "mbox_conf_softmax"
1753
+ softmax_param {
1754
+ axis: 2
1755
+ }
1756
+ }
1757
+ layer {
1758
+ name: "mbox_conf_flatten"
1759
+ type: "Flatten"
1760
+ bottom: "mbox_conf_softmax"
1761
+ top: "mbox_conf_flatten"
1762
+ flatten_param {
1763
+ axis: 1
1764
+ }
1765
+ }
1766
+
1767
+ layer {
1768
+ name: "detection_out"
1769
+ type: "DetectionOutput"
1770
+ bottom: "mbox_loc"
1771
+ bottom: "mbox_conf_flatten"
1772
+ bottom: "mbox_priorbox"
1773
+ top: "detection_out"
1774
+ include {
1775
+ phase: TEST
1776
+ }
1777
+ detection_output_param {
1778
+ num_classes: 2
1779
+ share_location: true
1780
+ background_label_id: 0
1781
+ nms_param {
1782
+ nms_threshold: 0.45
1783
+ top_k: 400
1784
+ }
1785
+ code_type: CENTER_SIZE
1786
+ keep_top_k: 200
1787
+ confidence_threshold: 0.01
1788
+ }
1789
+ }
models/emotion_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19c707063376d618b3e56df7537b6e103c7426e759c63ce5a4c33df414ce3612
+ size 299634
models/res10_300x300_ssd_iter_140000.caffemodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a56a11a57a4a295956b0660b4a3d76bbdca2206c4961cea8efe7d95c7cb2f2d
+ size 10666211
models/resnet_features.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58abd9efad0ad01490f42835b4dd082b346e0bfa3b6b9a6041295de905cf688f
+ size 44786742
models/shape_predictor_68_face_landmarks.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
+ size 99693937