Upload 15 files
- app.py +1 -1
- functions/__pycache__/audio.cpython-312.pyc +0 -0
- functions/__pycache__/fer.cpython-312.pyc +0 -0
- functions/__pycache__/helper.cpython-312.pyc +0 -0
- functions/__pycache__/video.cpython-312.pyc +0 -0
- functions/audio.py +145 -99
- functions/fer.py +10 -4
- functions/helper.py +125 -2
- functions/video.py +131 -71
- main.py +27 -18
- requirements.txt +0 -0
app.py
CHANGED
@@ -70,5 +70,5 @@ if uploaded_file is not None:
 
     # Clean up temporary files
     os.remove(temp_file_path)
-    shutil.rmtree(output_folder)
+    # shutil.rmtree(output_folder)
    os.remove(zip_file_path)
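A minimal sketch, not part of the commit: rather than commenting out shutil.rmtree, the cleanup could be gated behind a flag so the intent stays explicit. temp_file_path, output_folder and zip_file_path are the names app.py already uses; KEEP_OUTPUTS is a hypothetical flag added here only for illustration.

import os
import shutil

KEEP_OUTPUTS = True   # hypothetical flag; set False to restore the pre-commit cleanup

os.remove(temp_file_path)
if not KEEP_OUTPUTS:
    shutil.rmtree(output_folder)
os.remove(zip_file_path)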
functions/__pycache__/audio.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/audio.cpython-312.pyc and b/functions/__pycache__/audio.cpython-312.pyc differ
functions/__pycache__/fer.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/fer.cpython-312.pyc and b/functions/__pycache__/fer.cpython-312.pyc differ
functions/__pycache__/helper.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/helper.cpython-312.pyc and b/functions/__pycache__/helper.cpython-312.pyc differ
functions/__pycache__/video.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/video.cpython-312.pyc and b/functions/__pycache__/video.cpython-312.pyc differ
functions/audio.py
CHANGED
@@ -1,99 +1,145 @@
-import librosa
-import numpy as np
-import torch
-from collections import Counter
-import nltk
-[… old lines 6-99 were removed; their content is not preserved in this view …]
+import librosa
+import numpy as np
+import torch
+from collections import Counter
+import nltk
+import string
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+def get_pitch_list(y,sr):
+    hop_length = int(sr / 30) # hop_length determines how far apart the frames are
+
+    # Extract the pitch (F0) using librosa's piptrack method
+    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
+
+    # Get the pitch frequencies from the pitch array
+    pitch_frequencies = []
+
+    for t in range(pitches.shape[1]):
+        index = magnitudes[:, t].argmax() # Get the index of the maximum magnitude
+        pitch = pitches[index, t]
+
+        pitch_frequencies.append(pitch)
+
+    # Convert pitch_frequencies to a NumPy array
+    pitch_frequencies = np.array(pitch_frequencies)
+    print("shape : ",pitch_frequencies.shape)
+    return pitch_frequencies
+
+
+def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
+    y, sr = librosa.load(audio_path, sr=16000)
+    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
+    inputs = inputs.to(device, dtype=torch_dtype)
+    with torch.no_grad():
+        generated_ids = asrmodel.generate(inputs)
+    transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+    # Sound intensity (RMS)
+    rms = librosa.feature.rms(y=y)
+    sound_intensity = np.mean(rms)
+
+    # Pitch list
+    pitches=get_pitch_list(y,sr)
+
+    # Fundamental frequency (F0)
+    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
+    fundamental_frequency = np.nanmean(f0)
+
+    # Spectral energy (based on STFT)
+    S = np.abs(librosa.stft(y))
+    spectral_energy = np.mean(np.sum(S ** 2, axis=0))
+
+    # Spectral centroid
+    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+    avg_spectral_centroid = np.mean(spectral_centroid)
+
+    # Zero-crossing rate
+    zcr = librosa.feature.zero_crossing_rate(y)
+    zero_crossing_rate = np.mean(zcr)
+
+    # Pause detection
+    silence_threshold = -40
+    silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
+    pause_duration = 0
+    for start, end in silent_intervals:
+        pause_duration += (end - start) / sr
+
+    total_duration = librosa.get_duration(y=y, sr=sr)
+    pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute
+
+    # Transcript processing
+    words = nltk.word_tokenize(transcript)
+    words = [word.lower() for word in words if word not in string.punctuation]
+    num_words = len(words)
+    unique_words = len(set(words))
+    word_frequencies = Counter(words)
+
+    # Duration in minutes
+    duration_minutes = total_duration / 60
+    avg_words_per_minute = num_words / duration_minutes
+    avg_unique_words_per_minute = unique_words / duration_minutes
+
+    # Filler word detection
+    filler_words = [
+        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
+        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
+        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
+        'totally', 'honestly', 'seriously', 'alright'
+    ]
+    filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
+    filler_words_per_minute = filler_word_count / duration_minutes
+
+    # POS tagging
+    pos_tags = nltk.pos_tag(words)
+    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
+    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
+    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
+
+    # Sentiment analysis
+    sentiment = sentipipe(transcript)
+    sentiment_mapping = {
+        "LABEL_0": "Negative",
+        "LABEL_1": "Neutral",
+        "LABEL_2": "Positive"
+    }
+    sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']]
+
+    # Generate Word Cloud and Save it as an Image
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
+
+    # Save the Word Cloud to the provided path
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')
+    plt.savefig(wordcloud_path, format='png')
+    plt.close()
+
+    print("Nouns: ", nouns)
+    print("Adjectives: ", adjectives)
+    print("Verbs: ", verbs)
+    print("Sentiment: ", sentiment)
+
+    return {
+        "transcript": transcript,
+        "sentiment": sentiment,
+        "sound_intensity": float(sound_intensity),
+        "fundamental_frequency": float(fundamental_frequency),
+        "spectral_energy": float(spectral_energy),
+        "spectral_centroid": float(avg_spectral_centroid),
+        "zero_crossing_rate": float(zero_crossing_rate),
+        "avg_words_per_minute": float(avg_words_per_minute),
+        "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
+        "unique_word_count": int(unique_words),
+        "filler_words_per_minute": float(filler_words_per_minute),
+        "noun_count": len(nouns),
+        "adjective_count": len(adjectives),
+        "verb_count": len(verbs),
+        "pause_rate": float(pause_rate)
+    },pitches
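For orientation, a minimal usage sketch (not part of the commit) of the new six-argument extract_audio_features. The Whisper and sentiment checkpoints named below are assumptions; the Space actually loads its models through functions/models.py, and sample.wav / wordcloud.png are placeholder paths.

import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from functions.audio import extract_audio_features

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Assumed checkpoints: any Whisper ASR model plus a sentiment model that emits LABEL_0/1/2
asrproc = WhisperProcessor.from_pretrained("openai/whisper-base")
asrmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base", torch_dtype=dtype).to(device)
sentipipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

features, pitches = extract_audio_features(
    audio_path="sample.wav",        # placeholder audio file
    asrmodel=asrmodel,
    asrproc=asrproc,
    sentipipe=sentipipe,
    duration=42.0,                  # seconds; main.py passes the clip duration here
    wordcloud_path="wordcloud.png", # where the word-cloud PNG is written
)
print(features["transcript"], features["pause_rate"], len(pitches))

The function returns the feature dictionary plus the raw pitch track; main.py casts the pitches to plain floats before plotting them alongside valence, arousal and stress.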
functions/fer.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')
+from io import BytesIO
 
 import torch.nn.functional as F
 import pandas as pd
@@ -113,13 +114,14 @@ def plot_graph(x, y_vals, labels, path, calib_vals=None):
     - path: Path to save the combined plot.
     - calib_vals: List of calibration values for each variable (optional).
     """
+    buf = BytesIO()
     plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
 
     # Iterate over y-values, labels, and calibration values to create subplots
     for i, (y, label) in enumerate(zip(y_vals, labels)):
         y = [value if isinstance(value, (int, float)) else np.nan for value in y]
-
-
+        # Create a subplot (n rows, 1 column, and the current subplot index)
+
         plt.subplot(len(y_vals), 1, i+1)
         plt.plot(range(len(x)), y, linestyle='-')
 
@@ -133,5 +135,9 @@ def plot_graph(x, y_vals, labels, path, calib_vals=None):
         plt.legend()
 
     plt.tight_layout() # Adjust layout to prevent overlap
-    plt.savefig(
-    plt.clf() # Clear the figure after saving
+    plt.savefig(buf, format='png')
+    plt.clf() # Clear the figure after saving
+    buf.seek(0)
+    return buf
+
+
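A small sketch, not part of the commit, showing how the buffer returned by the reworked plot_graph can be consumed. The series below are synthetic placeholders; the path argument is still accepted by the signature even though the figure is now written to an in-memory buffer.

from functions.fer import plot_graph

frames = list(range(10))                               # placeholder frame indices
y_vals = [[0.1 * i for i in frames], [1.0 - 0.05 * i for i in frames]]
labels = ["Valence", "Arousal"]

buf = plot_graph(frames, y_vals, labels, "unused.png") # returns a BytesIO holding the PNG
with open("vas.png", "wb") as f:                       # persist it, or hand it to make_pdf
    f.write(buf.getvalue())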
functions/helper.py
CHANGED
@@ -2,8 +2,13 @@ import cv2
 import numpy as np
 import dlib
 from tqdm import tqdm
-
-
+from reportlab.lib.pagesizes import A4
+from reportlab.lib import colors
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib.units import inch
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer,Image
+from io import BytesIO
+import matplotlib.pyplot as plt
 def extract_face(image, net, predictor):
     (h, w) = image.shape[:2]
     blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
@@ -56,3 +61,121 @@ def extract_faces_from_frames(frames, net, predictor):
 
     return faces_list, landmarks_list, sizes_list
 
+def make_pdf(file_path,data,buf,buf2):
+    doc = SimpleDocTemplate(file_path, pagesize=A4)
+
+    # Define styles
+    styles = getSampleStyleSheet()
+    content = []
+
+    # Adding title
+    content.append(Paragraph("Facial Emotion Recognition Report", styles['Title']))
+    content.append(Spacer(1, 12))
+
+    # Section 1: Facial Emotion Recognition
+    content.append(Paragraph("Facial Emotion Recognition", styles['Heading2']))
+    table_data = [["Emotion", "Frame Count"]]
+    for emotion, count in data["facial_emotion_recognition"]["class_wise_frame_count"].items():
+        table_data.append([emotion.capitalize(), str(count)])
+
+    table = Table(table_data, hAlign='LEFT')
+    table.setStyle(TableStyle([
+        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
+        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+        ('GRID', (0, 0), (-1, -1), 1, colors.black),
+    ]))
+    content.append(table)
+    content.append(Spacer(1, 12))
+
+    # Section 2: Audio Analysis
+    content.append(Paragraph("Audio Analysis", styles['Heading2']))
+    content.append(Paragraph(f"Transcript: {data['audio']['transcript']}", styles['BodyText']))
+
+    sentiment = data['audio']['sentiment'][0]
+    content.append(Paragraph(f"Sentiment: {sentiment['label']} (Score: {sentiment['score']})", styles['BodyText']))
+
+    audio_features = [
+        f"Video Duration:{data['duration']}",
+        f"Sound Intensity: {data['audio']['sound_intensity']}",
+        f"Fundamental Frequency: {data['audio']['fundamental_frequency']}",
+        f"Spectral Energy: {data['audio']['spectral_energy']}",
+        f"Spectral Centroid: {data['audio']['spectral_centroid']}",
+        f"Zero Crossing Rate: {data['audio']['zero_crossing_rate']}",
+        f"Average Words per Minute: {data['audio']['avg_words_per_minute'] if data['duration']>60 else -1}",
+        f"Average Unique Words per Minute: {data['audio']['avg_unique_words_per_minute'] if data['duration']>60 else -1}",
+        f"Unique Word Count: {data['audio']['unique_word_count']}",
+        f"Filler Words per Minute: {data['audio']['filler_words_per_minute']}",
+        f"Noun Count: {data['audio']['noun_count']}",
+        f"Adjective Count: {data['audio']['adjective_count']}",
+        f"Verb Count: {data['audio']['verb_count']}",
+        f"Pause Rate: {data['audio']['pause_rate']}"
+    ]
+
+    for feature in audio_features:
+        content.append(Paragraph(feature, styles['BodyText']))
+    content.append(Spacer(1, 12))
+
+    plot_image = Image(buf)
+    plot_image.drawHeight = 600 # Adjust height
+    plot_image.drawWidth = 600 # Adjust width
+    content.append(plot_image)
+    plot_image = Image(buf2)
+    plot_image.drawHeight = 600 # Adjust height
+    plot_image.drawWidth = 600 # Adjust width
+    content.append(plot_image)
+    # Build the PDF
+    doc.build(content)
+
+
+
+def plot_facial_expression_graphs(smile_data, ear_data, yawn_data, thresholds, path):
+    """
+    Plots multiple subplots (smile, EAR, and yawn ratios) in one figure.
+
+    Parameters:
+    - smile_data: List of smile ratios.
+    - ear_data: List of eye aspect ratios (EAR).
+    - yawn_data: List of yawn ratios.
+    - thresholds: List containing thresholds for smile, EAR, and yawn.
+    - path: Path to save the combined plot.
+
+    Returns:
+    - buf: BytesIO buffer containing the saved plot.
+    """
+    buf = BytesIO()
+    plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
+
+    # Plot smile data
+    plt.subplot(3, 1, 1)
+    plt.plot(smile_data, label='Smile Ratio (Width/Face Width)')
+    plt.axhline(y=thresholds[0], color='black', linestyle='--', label='Threshold')
+    plt.title('Smile Ratio Over Time')
+    plt.ylabel('Ratio')
+    plt.legend()
+
+    # Plot EAR data
+    plt.subplot(3, 1, 2)
+    plt.plot(ear_data, label='Eye Aspect Ratio (EAR)', color='orange')
+    plt.axhline(y=thresholds[1], color='black', linestyle='--', label='Threshold')
+    plt.title('Eye Aspect Ratio (EAR) Over Time')
+    plt.ylabel('Ratio')
+    plt.legend()
+
+    # Plot yawn data
+    plt.subplot(3, 1, 3)
+    plt.plot(yawn_data, label='Yawn Ratio (Mouth Height/Face Height)', color='red')
+    plt.axhline(y=thresholds[2], color='black', linestyle='--', label='Threshold')
+    plt.title('Yawn Ratio Over Time')
+    plt.xlabel('Frames')
+    plt.ylabel('Ratio')
+    plt.legend()
+
+    plt.tight_layout() # Adjust layout to prevent overlap
+    plt.savefig(buf, format='png') # Save to buffer
+    plt.clf() # Clear the figure after saving
+    buf.seek(0) # Rewind the buffer to the beginning
+    return buf
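A minimal sketch, not part of the commit, of calling the new plot_facial_expression_graphs with synthetic ratio series. The EAR and yawn thresholds mirror the values main.py passes; the smile threshold there is computed dynamically, so 0.23 is a stand-in. The path argument is accepted although the figure comes back as an in-memory buffer.

from functions.helper import plot_facial_expression_graphs

smile_ratios = [0.18, 0.22, 0.27, 0.21]   # smile width / face width per frame
ear_ratios   = [0.30, 0.28, 0.20, 0.31]   # eye aspect ratio per frame
yawn_ratios  = [0.10, 0.12, 0.26, 0.11]   # mouth height / face height per frame

buf = plot_facial_expression_graphs(
    smile_ratios, ear_ratios, yawn_ratios,
    thresholds=[0.23, 0.225, 0.22],       # smile (stand-in), EAR, yawn thresholds
    path="unused.png",
)
with open("expressions.png", "wb") as f:  # the buffer can also go straight into make_pdf
    f.write(buf.getvalue())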
functions/video.py
CHANGED
@@ -28,87 +28,147 @@ def eye_aspect_ratio(eye):
     ear = (A + B) / (2.0 * C) # EAR formula
     return ear
 
-def
-    blink_counter = 0
-    total_blinks = 0
-    EYE_AR_THRESH = 0.24 # EAR threshold for blink detection
-    EYE_AR_CONSEC_FRAMES = 4 # Consecutive frames for blink detection
-
-    frame_count = 0 # Initialize frame counter
-
-    for landmark, size in zip(landmarks, sizes):
-        if landmark is not None:
-            leftEye = landmark[lStart:lEnd]
-            rightEye = landmark[rStart:rEnd]
-
-            leftEAR = eye_aspect_ratio(leftEye)
-            rightEAR = eye_aspect_ratio(rightEye)
-[… several removed lines are not preserved in this view …]
-    return blink_durations, total_blinks
-
-def detect_smiles(faces,smile_cascade):
-    smiles=[]
-    count=0
-    for face in faces:
-        if face is not None:
-            smile = smile_cascade.detectMultiScale(face, scaleFactor=1.8, minNeighbors=20, minSize=(25, 25))
-            if len(smile) > 0:
-                smiles.append(True)
-            else:
-                smiles.append(False)
-        else:
-            smiles.append(None)
-[… several removed lines are not preserved in this view …]
-    low_lip_idx = [56, 57, 58, 59, 65, 66, 67]
-    top_lip = np.array([landmarks[idx] for idx in top_lip_idx])
-    low_lip = np.array([landmarks[idx] for idx in low_lip_idx])
-[… the remaining removed lines (old lines 88-114), ending in a trailing return, are not preserved …]
+def euclidean_distance(p1, p2):
+    return np.linalg.norm(p1 - p2)
+
+# Function to detect smiles based on mouth aspect ratio
+def detect_smiles(landmarks_list, face_sizes, fps=30, consecutive_frames=2):
+    smile_ratios = [] # Store the smile ratios for plotting
+    smiles = []
+    smile_durations = [] # To store the duration of each smile
+    total_smiles = 0
+    smile_in_progress = False
+    smile_start_frame = None
+    avg_dynamic_threshold=[]
+    for frame_idx, (landmarks, face_size) in enumerate(zip(landmarks_list, face_sizes)):
+        if landmarks is not None:
+            # Use NumPy array indices for the relevant mouth landmarks
+            left_corner = np.array(landmarks[48])
+            right_corner = np.array(landmarks[54])
+            top_lip = np.array(landmarks[51])
+            bottom_lip = np.array(landmarks[57])
+
+            mouth_width = euclidean_distance(left_corner, right_corner)
+            mouth_height = euclidean_distance(top_lip, bottom_lip)
+
+            face_width, face_height = face_size # face_size is (width, height)
+
+            if face_width > 0 and face_height > 0:
+                normalized_mouth_width = mouth_width / face_width
+                normalized_mouth_height = mouth_height / face_height
+            else:
+                normalized_mouth_width = 0
+                normalized_mouth_height = 0
+
+            smile_ratios.append(normalized_mouth_width)
+            dynamic_threshold = 0.2 + (0.05 * face_width / 100)
+            avg_dynamic_threshold.append(dynamic_threshold)
+            # print(dynamic_threshold)
+            # Check if the smile meets the threshold
+            if (normalized_mouth_width > dynamic_threshold) and (normalized_mouth_height > 0.06):
+                smiles.append(True)
+                if not smile_in_progress:
+                    smile_in_progress = True
+                    smile_start_frame = frame_idx # Record the start of the smile
+            else:
+                smiles.append(False)
+                if smile_in_progress and (frame_idx - smile_start_frame >= consecutive_frames):
+                    smile_in_progress = False
+                    smile_end_frame = frame_idx
+                    smile_duration = (smile_end_frame - smile_start_frame) / fps # Calculate smile duration
+                    smile_durations.append(smile_duration)
+                    total_smiles += 1 # Increment total smile count
+        else:
+            smiles.append(None)
+    try:
+        avg_thr=sum(avg_dynamic_threshold)/len(avg_dynamic_threshold)
+    except:
+        avg_thr=0
+    return smiles, smile_ratios, total_smiles, smile_durations,avg_thr
+
+
+# Function to detect blinks based on the eye aspect ratio (EAR)
+import numpy as np
+
+# Function to detect blinks based on the eye aspect ratio (EAR)
+def detect_blinks(landmarks_list, face_sizes, ear_threshold=0.24, consecutive_frames=2):
+    ear_ratios = [] # Store EAR for plotting
+    blinks = []
+
+    # Variables to monitor consecutive low EAR values
+    blink_count = 0
+    consec_low_ear = 0
+
+    for landmarks, face in zip(landmarks_list, face_sizes):
+        if landmarks is not None:
+            left_eye = landmarks[36:42] # Points 36-41 (inclusive) for the left eye
+            right_eye = landmarks[42:48]
+
+            def eye_aspect_ratio(eye):
+                A = euclidean_distance(eye[1], eye[5])
+                B = euclidean_distance(eye[2], eye[4])
+                C = euclidean_distance(eye[0], eye[3])
+                ear = (A + B) / (2.0 * C)
+                return ear
+
+            left_ear = eye_aspect_ratio(left_eye)
+            right_ear = eye_aspect_ratio(right_eye)
+            avg_ear = (left_ear + right_ear) / 2.0
+
+            ear_ratios.append(avg_ear)
+
+            if avg_ear < ear_threshold:
+                consec_low_ear += 1
+            else:
+                # If low EAR is detected for enough consecutive frames, count as a blink
+                if consec_low_ear >= consecutive_frames:
+                    blink_count += 1
+                consec_low_ear = 0 # Reset the consecutive low EAR counter
+        else:
+            blinks.append(None)
+
+    return blink_count, ear_ratios
+
+# Function to detect yawns based on the vertical distance between top and bottom lips
+# Function to detect yawns based on the vertical distance between top and bottom lips
+def detect_yawns(landmarks_list, face_sizes, fps=30, consecutive_frames=3):
+    yawn_ratios = [] # Store the yawn ratios for plotting
+    yawns = []
+    yawn_durations = [] # To store the duration of each yawn
+    total_yawns = 0
+    yawn_in_progress = False
+    yawn_start_frame = None
+
+    for frame_idx, (landmarks, face_size) in enumerate(zip(landmarks_list, face_sizes)):
+        if landmarks is not None:
+            top_lip = np.array(landmarks[51])
+            bottom_lip = np.array(landmarks[57])
+
+            mouth_height = euclidean_distance(top_lip, bottom_lip)
+            face_width, face_height = face_size # face_size is (width, height)
+
+            if face_height > 0:
+                normalized_mouth_height = mouth_height / face_height
+            else:
+                normalized_mouth_height = 0
+
+            yawn_ratios.append(normalized_mouth_height)
+
+            # Check if the yawn meets the threshold
+            if normalized_mouth_height > 0.24:
+                yawns.append(True)
+                if not yawn_in_progress:
+                    yawn_in_progress = True
+                    yawn_start_frame = frame_idx # Record the start of the yawn
+            else:
+                yawns.append(False)
+                if yawn_in_progress and (frame_idx - yawn_start_frame >= consecutive_frames):
+                    yawn_in_progress = False
+                    yawn_end_frame = frame_idx
+                    yawn_duration = (yawn_end_frame - yawn_start_frame) / fps # Calculate yawn duration
+                    yawn_durations.append(yawn_duration)
+                    total_yawns += 1 # Increment total yawn count
+        else:
+            yawns.append(None)
+
+    return yawns, yawn_ratios, total_yawns, yawn_durations
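A minimal sketch, not part of the commit, driving the new detectors with synthetic 68-point landmark arrays in the (x, y) layout dlib's shape predictor produces; in the Space these come from extract_faces_from_frames, and the random values here are placeholders.

import numpy as np
from functions.video import detect_smiles, detect_blinks, detect_yawns

rng = np.random.default_rng(0)
landmarks_list = [rng.uniform(0, 200, size=(68, 2)) for _ in range(30)]  # one landmark array per frame
face_sizes = [(200, 200)] * 30                                           # (width, height) per frame

smiles, smile_ratios, total_smiles, smile_durations, avg_thr = detect_smiles(landmarks_list, face_sizes, fps=30)
blink_count, ear_ratios = detect_blinks(landmarks_list, face_sizes)      # default ear_threshold=0.24
yawns, yawn_ratios, total_yawns, yawn_durations = detect_yawns(landmarks_list, face_sizes, fps=30)
print(total_smiles, blink_count, total_yawns)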
main.py
CHANGED
@@ -6,12 +6,14 @@ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 import logging
 logging.getLogger('absl').setLevel(logging.ERROR)
 from functions.models import models_dict
-from functions.helper import extract_faces_from_frames
-from functions.video import eyebrow,
+from functions.helper import extract_faces_from_frames,make_pdf
+from functions.video import eyebrow,detect_blinks,detect_yawns,detect_smiles
 from functions.valence_arousal import va_predict
 from functions.fer import fer_predict,plot_graph
+from functions.helper import plot_facial_expression_graphs
 from moviepy.editor import VideoFileClip
 import json
+# from trash import detect_eyes_in_faces
 import pandas as pd
 from typing import Callable
 from functions.audio import extract_audio_features
@@ -57,7 +59,9 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
     os.makedirs(folder_path,exist_ok=True)
     meta_data_path=os.path.join(folder_path,'metadata.json')
     valence_plot=os.path.join(folder_path,"vas.png")
+    word_cloud=os.path.join(folder_path,'wordcloud.jpg')
     df_path=os.path.join(folder_path,'data.csv')
+    pdf_filename = os.path.join(folder_path,"formatted_output_with_plots.pdf")
 
     video_clip=VideoFileClip(video_path)
     video_clip=video_clip.set_fps(fps)
@@ -72,8 +76,8 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
 
 
     # faces=[extract_face(frame) for frame in tqdm(video_frames)]
-    af=extract_audio_features(audio_path,asrmodel,asrproc,sentipipe)
-
+    af,pitches=extract_audio_features(audio_path,asrmodel,asrproc,sentipipe,duration,word_cloud)
+    pitches=[float(pitch) for pitch in pitches]
 
     fer_emotions,class_wise_frame_count,em_tensors=fer_predict(faces,fps,fer_model)
     valence_list,arousal_list,stress_list=va_predict(valence_arousal_model,val_ar_feat_model,faces,list(em_tensors))
@@ -81,30 +85,35 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
 
     eyebrow_dist=eyebrow(landmarks,sizes)
     print('eyebrow done')
-
-
-
-    print('
-
+
+    blink_count, ear_ratios=detect_blinks(landmarks,sizes,fps)
+    ear_ratios=[float(pitch) for pitch in ear_ratios]
+    print('blinks done',blink_count)
+    smiles, smile_ratios, total_smiles, smile_durations,smile_threshold=detect_smiles(landmarks,sizes)
+    smile_ratios=[float(smile) for smile in smile_ratios]
+    print('smiles done',total_smiles)
+    yawns, yawn_ratios, total_yawns, yawn_durations=detect_yawns(landmarks,sizes)
     print('ywan done')
 
+    thresholds=[smile_threshold,0.225,0.22]
+    buffer = plot_facial_expression_graphs(smile_ratios, ear_ratios, yawn_ratios, thresholds, 'path_to_save_plot.pdf')
+
+    # print("detect_eyes : ",detect_eyes_in_faces(faces))
 
-    y_vals = [valence_list, arousal_list, stress_list,eyebrow_dist]
-    labels = ['Valence', 'Arousal', 'Stress',"EyeBrowDistance"]
-    plot_graph(timestamps, y_vals, labels, valence_plot)
+    y_vals = [valence_list, arousal_list, stress_list,eyebrow_dist,pitches]
+    labels = ['Valence', 'Arousal', 'Stress',"EyeBrowDistance","Pitch"]
+    buf=plot_graph(timestamps, y_vals, labels, valence_plot)
     print('graph_plotted')
     meta_data={}
+    meta_data['duration']=duration
     meta_data['facial_emotion_recognition'] = {
         "class_wise_frame_count": class_wise_frame_count,
     }
     meta_data['audio']=af
 
-
-
-
-    }
-    meta_data['smile']=smile_count
-    meta_data['yawn']=yawn_count
+
+    make_pdf(pdf_filename,meta_data,buf,buffer)
+
     with open(meta_data_path, 'w') as json_file:
         json.dump(meta_data, json_file, indent=4)
     df=pd.DataFrame(
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ