Rahulk2197 committed
Commit eb9b1e7 · verified · 1 Parent(s): 92724d0

Upload 15 files

app.py CHANGED
@@ -70,5 +70,5 @@ if uploaded_file is not None:
70
 
71
  # Clean up temporary files
72
  os.remove(temp_file_path)
73
- shutil.rmtree(output_folder)
74
  os.remove(zip_file_path)
 
70
 
71
  # Clean up temporary files
72
  os.remove(temp_file_path)
73
+ # shutil.rmtree(output_folder)
74
  os.remove(zip_file_path)
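
Note: the updated app.py keeps output_folder instead of deleting it after zipping. If the folder should eventually be removed as well, a guarded helper avoids failing on paths that were never created — a minimal sketch, reusing the temp_file_path, output_folder and zip_file_path names from above (the cleanup function itself is hypothetical, not part of this commit):

import os
import shutil

def cleanup(temp_file_path, output_folder, zip_file_path, keep_outputs=True):
    # Remove the uploaded temp file and the served zip; optionally drop the outputs too.
    for path in (temp_file_path, zip_file_path):
        if os.path.isfile(path):
            os.remove(path)
    if not keep_outputs and os.path.isdir(output_folder):
        shutil.rmtree(output_folder)
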
functions/__pycache__/audio.cpython-312.pyc CHANGED
Binary files a/functions/__pycache__/audio.cpython-312.pyc and b/functions/__pycache__/audio.cpython-312.pyc differ
 
functions/__pycache__/fer.cpython-312.pyc CHANGED
Binary files a/functions/__pycache__/fer.cpython-312.pyc and b/functions/__pycache__/fer.cpython-312.pyc differ
 
functions/__pycache__/helper.cpython-312.pyc CHANGED
Binary files a/functions/__pycache__/helper.cpython-312.pyc and b/functions/__pycache__/helper.cpython-312.pyc differ
 
functions/__pycache__/video.cpython-312.pyc CHANGED
Binary files a/functions/__pycache__/video.cpython-312.pyc and b/functions/__pycache__/video.cpython-312.pyc differ
 
functions/audio.py CHANGED
@@ -1,99 +1,145 @@
1
- import librosa
2
- import numpy as np
3
- import torch
4
- from collections import Counter
5
- import nltk
6
- nltk.download('punkt_tab')
7
- nltk.download('averaged_perceptron_tagger_eng')
8
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
10
- def extract_audio_features(audio_path,asrmodel,asrproc,sentipipe):
11
- y, sr = librosa.load(audio_path,sr=16000)
12
- inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
13
- inputs = inputs.to(device, dtype=torch_dtype)
14
- with torch.no_grad():
15
- generated_ids = asrmodel.generate(inputs)
16
- transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
17
- # Sound intensity (RMS)
18
- rms = librosa.feature.rms(y=y)
19
- sound_intensity = np.mean(rms)
20
-
21
- # Fundamental frequency (F0)
22
- f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
23
- fundamental_frequency = np.nanmean(f0)
24
-
25
- # Spectral energy (based on STFT)
26
- S = np.abs(librosa.stft(y))
27
- spectral_energy = np.mean(np.sum(S ** 2, axis=0))
28
-
29
- # Spectral centroid
30
- spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
31
- avg_spectral_centroid = np.mean(spectral_centroid)
32
-
33
- # Zero-crossing rate
34
- zcr = librosa.feature.zero_crossing_rate(y)
35
- zero_crossing_rate = np.mean(zcr)
36
-
37
- # Pause detection
38
- silence_threshold = -40
39
- silent_intervals = librosa.effects.split(y, top_db=silence_threshold) # Split into non-silent intervals
40
- pause_duration = 0
41
- for start, end in silent_intervals:
42
- pause_duration += (end - start) / sr # Add the pause duration in seconds
43
-
44
- total_duration = librosa.get_duration(y=y, sr=sr)
45
- pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute
46
-
47
- # Transcript processing
48
- words = nltk.word_tokenize(transcript)
49
- num_words = len(words)
50
- unique_words = len(set(words))
51
- word_frequencies = Counter(words)
52
-
53
- duration_minutes = total_duration / 60
54
- avg_words_per_minute = num_words / duration_minutes
55
- avg_unique_words_per_minute = unique_words / duration_minutes
56
-
57
- # Count of unique words
58
- unique_word_count = unique_words
59
-
60
- # Filler word detection
61
- filler_words = [
62
- 'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
63
- 'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
64
- 'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
65
- 'totally', 'honestly', 'seriously', 'alright'
66
- ]
67
- filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
68
- filler_words_per_minute = filler_word_count / duration_minutes
69
-
70
- # POS tagging
71
- pos_tags = nltk.pos_tag(words)
72
- nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
73
- adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
74
- verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
75
-
76
- # Sentiment analysis
77
- sentiment = sentipipe(transcript)
78
-
79
- print("Nouns: ", nouns)
80
- print("Adjectives: ", adjectives)
81
- print("Verbs: ", verbs)
82
-
83
- return {
84
- "transcript": transcript, # assuming this is a string
85
- "sentiment":sentiment,
86
- "sound_intensity": float(sound_intensity), # convert numpy float to Python float
87
- "fundamental_frequency": float(fundamental_frequency), # same conversion
88
- "spectral_energy": float(spectral_energy), # convert to Python float
89
- "spectral_centroid": float(avg_spectral_centroid), # convert numpy float
90
- "zero_crossing_rate": float(zero_crossing_rate), # convert to Python float
91
- "avg_words_per_minute": float(avg_words_per_minute), # same conversion
92
- "avg_unique_words_per_minute": float(avg_unique_words_per_minute), # convert float
93
- "unique_word_count": int(unique_word_count), # convert to integer if needed
94
- "filler_words_per_minute": float(filler_words_per_minute), # convert float
95
- "noun_count": len(nouns), # Assuming nouns is a list, so no changes needed
96
- "adjective_count": len(adjectives), # Same here
97
- "verb_count": len(verbs), # Same here
98
- "pause_rate": float(pause_rate), # convert to Python float
99
- }
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ from collections import Counter
5
+ import nltk
6
+ import string
7
+ import matplotlib.pyplot as plt
8
+ from wordcloud import WordCloud
9
+
10
+ nltk.download('punkt')
11
+ nltk.download('averaged_perceptron_tagger')
12
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
14
+
15
+ def get_pitch_list(y,sr):
16
+ hop_length = int(sr / 30) # hop_length determines how far apart the frames are
17
+
18
+ # Extract the pitch (F0) using librosa's piptrack method
19
+ pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
20
+
21
+ # Get the pitch frequencies from the pitch array
22
+ pitch_frequencies = []
23
+
24
+ for t in range(pitches.shape[1]):
25
+ index = magnitudes[:, t].argmax() # Get the index of the maximum magnitude
26
+ pitch = pitches[index, t]
27
+
28
+ pitch_frequencies.append(pitch)
29
+
30
+ # Convert pitch_frequencies to a NumPy array
31
+ pitch_frequencies = np.array(pitch_frequencies)
32
+ print("shape : ",pitch_frequencies.shape)
33
+ return pitch_frequencies
34
+
35
+
36
+ def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
37
+ y, sr = librosa.load(audio_path, sr=16000)
38
+ inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
39
+ inputs = inputs.to(device, dtype=torch_dtype)
40
+ with torch.no_grad():
41
+ generated_ids = asrmodel.generate(inputs)
42
+ transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
43
+
44
+ # Sound intensity (RMS)
45
+ rms = librosa.feature.rms(y=y)
46
+ sound_intensity = np.mean(rms)
47
+
48
+ # Pitch list
49
+ pitches=get_pitch_list(y,sr)
50
+
51
+ # Fundamental frequency (F0)
52
+ f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
53
+ fundamental_frequency = np.nanmean(f0)
54
+
55
+ # Spectral energy (based on STFT)
56
+ S = np.abs(librosa.stft(y))
57
+ spectral_energy = np.mean(np.sum(S ** 2, axis=0))
58
+
59
+ # Spectral centroid
60
+ spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
61
+ avg_spectral_centroid = np.mean(spectral_centroid)
62
+
63
+ # Zero-crossing rate
64
+ zcr = librosa.feature.zero_crossing_rate(y)
65
+ zero_crossing_rate = np.mean(zcr)
66
+
67
+ # Pause detection
68
+ silence_threshold = -40
69
+ silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
70
+ pause_duration = 0
71
+ for start, end in silent_intervals:
72
+ pause_duration += (end - start) / sr
73
+
74
+ total_duration = librosa.get_duration(y=y, sr=sr)
75
+ pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute
76
+
77
+ # Transcript processing
78
+ words = nltk.word_tokenize(transcript)
79
+ words = [word.lower() for word in words if word not in string.punctuation]
80
+ num_words = len(words)
81
+ unique_words = len(set(words))
82
+ word_frequencies = Counter(words)
83
+
84
+ # Duration in minutes
85
+ duration_minutes = total_duration / 60
86
+ avg_words_per_minute = num_words / duration_minutes
87
+ avg_unique_words_per_minute = unique_words / duration_minutes
88
+
89
+ # Filler word detection
90
+ filler_words = [
91
+ 'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
92
+ 'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
93
+ 'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
94
+ 'totally', 'honestly', 'seriously', 'alright'
95
+ ]
96
+ filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
97
+ filler_words_per_minute = filler_word_count / duration_minutes
98
+
99
+ # POS tagging
100
+ pos_tags = nltk.pos_tag(words)
101
+ nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
102
+ adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
103
+ verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
104
+
105
+ # Sentiment analysis
106
+ sentiment = sentipipe(transcript)
107
+ sentiment_mapping = {
108
+ "LABEL_0": "Negative",
109
+ "LABEL_1": "Neutral",
110
+ "LABEL_2": "Positive"
111
+ }
112
+ sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']]
113
+
114
+ # Generate Word Cloud and Save it as an Image
115
+ wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
116
+
117
+ # Save the Word Cloud to the provided path
118
+ plt.figure(figsize=(10, 5))
119
+ plt.imshow(wordcloud, interpolation='bilinear')
120
+ plt.axis('off')
121
+ plt.savefig(wordcloud_path, format='png')
122
+ plt.close()
123
+
124
+ print("Nouns: ", nouns)
125
+ print("Adjectives: ", adjectives)
126
+ print("Verbs: ", verbs)
127
+ print("Sentiment: ", sentiment)
128
+
129
+ return {
130
+ "transcript": transcript,
131
+ "sentiment": sentiment,
132
+ "sound_intensity": float(sound_intensity),
133
+ "fundamental_frequency": float(fundamental_frequency),
134
+ "spectral_energy": float(spectral_energy),
135
+ "spectral_centroid": float(avg_spectral_centroid),
136
+ "zero_crossing_rate": float(zero_crossing_rate),
137
+ "avg_words_per_minute": float(avg_words_per_minute),
138
+ "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
139
+ "unique_word_count": int(unique_words),
140
+ "filler_words_per_minute": float(filler_words_per_minute),
141
+ "noun_count": len(nouns),
142
+ "adjective_count": len(adjectives),
143
+ "verb_count": len(verbs),
144
+ "pause_rate": float(pause_rate)
145
+ },pitches
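
Note: a minimal sketch of how the rewritten extract_audio_features might be called now that it takes duration and wordcloud_path and returns a (features, pitches) tuple. The Whisper and sentiment checkpoints below are placeholders (the real models are presumably wired up elsewhere, e.g. via functions.models), and sample.wav / wordcloud.png are hypothetical paths; the cardiffnlp checkpoint is used here only because it emits the LABEL_0/1/2 labels that the sentiment_mapping above expects:

import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from functions.audio import extract_audio_features

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

asrproc = WhisperProcessor.from_pretrained("openai/whisper-tiny")
asrmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", torch_dtype=dtype).to(device)
sentipipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

features, pitches = extract_audio_features(
    "sample.wav", asrmodel, asrproc, sentipipe,
    duration=30, wordcloud_path="wordcloud.png",
)
print(features["transcript"], features["pause_rate"], len(pitches))  # one pitch value per ~1/30 s hop
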
functions/fer.py CHANGED
@@ -10,6 +10,7 @@ import os
10
  import matplotlib.pyplot as plt
11
  import matplotlib
12
  matplotlib.use('Agg')
 
13
 
14
  import torch.nn.functional as F
15
  import pandas as pd
@@ -113,13 +114,14 @@ def plot_graph(x, y_vals, labels, path, calib_vals=None):
113
  - path: Path to save the combined plot.
114
  - calib_vals: List of calibration values for each variable (optional).
115
  """
 
116
  plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
117
 
118
  # Iterate over y-values, labels, and calibration values to create subplots
119
  for i, (y, label) in enumerate(zip(y_vals, labels)):
120
  y = [value if isinstance(value, (int, float)) else np.nan for value in y]
121
-
122
- # Create a subplot (3 rows, 1 column, and the current subplot index)
123
  plt.subplot(len(y_vals), 1, i+1)
124
  plt.plot(range(len(x)), y, linestyle='-')
125
 
@@ -133,5 +135,9 @@ def plot_graph(x, y_vals, labels, path, calib_vals=None):
133
  plt.legend()
134
 
135
  plt.tight_layout() # Adjust layout to prevent overlap
136
- plt.savefig(path)
137
- plt.clf() # Clear the figure after saving
 
 
 
 
 
10
  import matplotlib.pyplot as plt
11
  import matplotlib
12
  matplotlib.use('Agg')
13
+ from io import BytesIO
14
 
15
  import torch.nn.functional as F
16
  import pandas as pd
 
114
  - path: Path to save the combined plot.
115
  - calib_vals: List of calibration values for each variable (optional).
116
  """
117
+ buf = BytesIO()
118
  plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
119
 
120
  # Iterate over y-values, labels, and calibration values to create subplots
121
  for i, (y, label) in enumerate(zip(y_vals, labels)):
122
  y = [value if isinstance(value, (int, float)) else np.nan for value in y]
123
+ # Create a subplot (n rows, 1 column, and the current subplot index)
124
+
125
  plt.subplot(len(y_vals), 1, i+1)
126
  plt.plot(range(len(x)), y, linestyle='-')
127
 
 
135
  plt.legend()
136
 
137
  plt.tight_layout() # Adjust layout to prevent overlap
138
+ plt.savefig(buf, format='png')
139
+ plt.clf() # Clear the figure after saving
140
+ buf.seek(0)
141
+ return buf
142
+
143
+
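
Note: plot_graph now renders into an in-memory BytesIO buffer and returns it instead of writing to path, so callers can hand the PNG straight to ReportLab. A minimal sketch of that round trip (report.pdf and the plotted values are made up for illustration):

from io import BytesIO
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from reportlab.platypus import SimpleDocTemplate, Image

def figure_to_buffer():
    buf = BytesIO()
    plt.plot([0, 1, 2], [0, 1, 4])
    plt.savefig(buf, format='png')
    plt.clf()
    buf.seek(0)  # rewind so the reader starts at the PNG header
    return buf

doc = SimpleDocTemplate("report.pdf")
doc.build([Image(figure_to_buffer(), width=400, height=300)])
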
functions/helper.py CHANGED
@@ -2,8 +2,13 @@ import cv2
2
  import numpy as np
3
  import dlib
4
  from tqdm import tqdm
5
-
6
-
 
 
 
 
 
7
  def extract_face(image, net, predictor):
8
  (h, w) = image.shape[:2]
9
  blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
@@ -56,3 +61,121 @@ def extract_faces_from_frames(frames, net, predictor):
56
 
57
  return faces_list, landmarks_list, sizes_list
58
 
2
  import numpy as np
3
  import dlib
4
  from tqdm import tqdm
5
+ from reportlab.lib.pagesizes import A4
6
+ from reportlab.lib import colors
7
+ from reportlab.lib.styles import getSampleStyleSheet
8
+ from reportlab.lib.units import inch
9
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer,Image
10
+ from io import BytesIO
11
+ import matplotlib.pyplot as plt
12
  def extract_face(image, net, predictor):
13
  (h, w) = image.shape[:2]
14
  blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
 
61
 
62
  return faces_list, landmarks_list, sizes_list
63
 
64
+ def make_pdf(file_path,data,buf,buf2):
65
+ doc = SimpleDocTemplate(file_path, pagesize=A4)
66
+
67
+ # Define styles
68
+ styles = getSampleStyleSheet()
69
+ content = []
70
+
71
+ # Adding title
72
+ content.append(Paragraph("Facial Emotion Recognition Report", styles['Title']))
73
+ content.append(Spacer(1, 12))
74
+
75
+ # Section 1: Facial Emotion Recognition
76
+ content.append(Paragraph("Facial Emotion Recognition", styles['Heading2']))
77
+ table_data = [["Emotion", "Frame Count"]]
78
+ for emotion, count in data["facial_emotion_recognition"]["class_wise_frame_count"].items():
79
+ table_data.append([emotion.capitalize(), str(count)])
80
+
81
+ table = Table(table_data, hAlign='LEFT')
82
+ table.setStyle(TableStyle([
83
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
84
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
85
+ ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
86
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
87
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
88
+ ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
89
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
90
+ ]))
91
+ content.append(table)
92
+ content.append(Spacer(1, 12))
93
+
94
+ # Section 2: Audio Analysis
95
+ content.append(Paragraph("Audio Analysis", styles['Heading2']))
96
+ content.append(Paragraph(f"Transcript: {data['audio']['transcript']}", styles['BodyText']))
97
+
98
+ sentiment = data['audio']['sentiment'][0]
99
+ content.append(Paragraph(f"Sentiment: {sentiment['label']} (Score: {sentiment['score']})", styles['BodyText']))
100
+
101
+ audio_features = [
102
+ f"Video Duration: {data['duration']}",
103
+ f"Sound Intensity: {data['audio']['sound_intensity']}",
104
+ f"Fundamental Frequency: {data['audio']['fundamental_frequency']}",
105
+ f"Spectral Energy: {data['audio']['spectral_energy']}",
106
+ f"Spectral Centroid: {data['audio']['spectral_centroid']}",
107
+ f"Zero Crossing Rate: {data['audio']['zero_crossing_rate']}",
108
+ f"Average Words per Minute: {data['audio']['avg_words_per_minute'] if data['duration']>60 else -1}",
109
+ f"Average Unique Words per Minute: {data['audio']['avg_unique_words_per_minute'] if data['duration']>60 else -1}",
110
+ f"Unique Word Count: {data['audio']['unique_word_count']}",
111
+ f"Filler Words per Minute: {data['audio']['filler_words_per_minute']}",
112
+ f"Noun Count: {data['audio']['noun_count']}",
113
+ f"Adjective Count: {data['audio']['adjective_count']}",
114
+ f"Verb Count: {data['audio']['verb_count']}",
115
+ f"Pause Rate: {data['audio']['pause_rate']}"
116
+ ]
117
+
118
+ for feature in audio_features:
119
+ content.append(Paragraph(feature, styles['BodyText']))
120
+ content.append(Spacer(1, 12))
121
+
122
+ plot_image = Image(buf)
123
+ plot_image.drawHeight = 600 # Adjust height
124
+ plot_image.drawWidth = 600 # Adjust width
125
+ content.append(plot_image)
126
+ plot_image = Image(buf2)
127
+ plot_image.drawHeight = 600 # Adjust height
128
+ plot_image.drawWidth = 600 # Adjust width
129
+ content.append(plot_image)
130
+ # Build the PDF
131
+ doc.build(content)
132
+
133
+
134
+
135
+ def plot_facial_expression_graphs(smile_data, ear_data, yawn_data, thresholds, path):
136
+ """
137
+ Plots multiple subplots (smile, EAR, and yawn ratios) in one figure.
138
+
139
+ Parameters:
140
+ - smile_data: List of smile ratios.
141
+ - ear_data: List of eye aspect ratios (EAR).
142
+ - yawn_data: List of yawn ratios.
143
+ - thresholds: List containing thresholds for smile, EAR, and yawn.
144
+ - path: Path to save the combined plot.
145
+
146
+ Returns:
147
+ - buf: BytesIO buffer containing the saved plot.
148
+ """
149
+ buf = BytesIO()
150
+ plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
151
+
152
+ # Plot smile data
153
+ plt.subplot(3, 1, 1)
154
+ plt.plot(smile_data, label='Smile Ratio (Width/Face Width)')
155
+ plt.axhline(y=thresholds[0], color='black', linestyle='--', label='Threshold')
156
+ plt.title('Smile Ratio Over Time')
157
+ plt.ylabel('Ratio')
158
+ plt.legend()
159
+
160
+ # Plot EAR data
161
+ plt.subplot(3, 1, 2)
162
+ plt.plot(ear_data, label='Eye Aspect Ratio (EAR)', color='orange')
163
+ plt.axhline(y=thresholds[1], color='black', linestyle='--', label='Threshold')
164
+ plt.title('Eye Aspect Ratio (EAR) Over Time')
165
+ plt.ylabel('Ratio')
166
+ plt.legend()
167
+
168
+ # Plot yawn data
169
+ plt.subplot(3, 1, 3)
170
+ plt.plot(yawn_data, label='Yawn Ratio (Mouth Height/Face Height)', color='red')
171
+ plt.axhline(y=thresholds[2], color='black', linestyle='--', label='Threshold')
172
+ plt.title('Yawn Ratio Over Time')
173
+ plt.xlabel('Frames')
174
+ plt.ylabel('Ratio')
175
+ plt.legend()
176
+
177
+ plt.tight_layout() # Adjust layout to prevent overlap
178
+ plt.savefig(buf, format='png') # Save to buffer
179
+ plt.clf() # Clear the figure after saving
180
+ buf.seek(0) # Rewind the buffer to the beginning
181
+ return buf
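
Note: make_pdf expects the metadata dict assembled in main.py plus two PNG buffers (the valence/arousal plot from plot_graph and the facial-expression plot from plot_facial_expression_graphs). A minimal sketch of a call with stand-in values — every number, label and path below is made up for illustration:

from functions.fer import plot_graph
from functions.helper import make_pdf, plot_facial_expression_graphs

data = {
    "duration": 42.0,
    "facial_emotion_recognition": {"class_wise_frame_count": {"happy": 120, "neutral": 300}},
    "audio": {
        "transcript": "hello world",
        "sentiment": [{"label": "Positive", "score": 0.91}],
        "sound_intensity": 0.02, "fundamental_frequency": 180.0,
        "spectral_energy": 15.3, "spectral_centroid": 1500.0, "zero_crossing_rate": 0.08,
        "avg_words_per_minute": 110.0, "avg_unique_words_per_minute": 70.0,
        "unique_word_count": 50, "filler_words_per_minute": 3.0,
        "noun_count": 12, "adjective_count": 4, "verb_count": 9, "pause_rate": 20.0,
    },
}
buf = plot_graph(list(range(10)), [[0.1 * i for i in range(10)]], ["Valence"], "unused.png")
buf2 = plot_facial_expression_graphs([0.2] * 10, [0.25] * 10, [0.1] * 10, [0.22, 0.225, 0.24], "unused.png")
make_pdf("report.pdf", data, buf, buf2)  # the path arguments to the plot helpers appear unused now
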
functions/video.py CHANGED
@@ -28,87 +28,147 @@ def eye_aspect_ratio(eye):
28
  ear = (A + B) / (2.0 * C) # EAR formula
29
  return ear
30
 
31
- def blinks(landmarks, sizes, fps):
32
- blink_durations = []
33
- blink_counter = 0
34
- total_blinks = 0
35
- EYE_AR_THRESH = 0.24 # EAR threshold for blink detection
36
- EYE_AR_CONSEC_FRAMES = 4 # Consecutive frames for blink detection
37
-
38
- frame_count = 0 # Initialize frame counter
39
-
40
- for landmark, size in zip(landmarks, sizes):
41
- if landmark is not None:
42
- leftEye = landmark[lStart:lEnd]
43
- rightEye = landmark[rStart:rEnd]
44
-
45
- leftEAR = eye_aspect_ratio(leftEye)
46
- rightEAR = eye_aspect_ratio(rightEye)
47
 
48
- if leftEAR < EYE_AR_THRESH and rightEAR < EYE_AR_THRESH:
49
- if blink_counter == 0:
50
- blink_start_frame = frame_count # Start tracking blink in frames
51
- blink_counter += 1
52
  else:
53
- if blink_counter >= EYE_AR_CONSEC_FRAMES:
54
- blink_end_frame = frame_count
55
- blink_duration_frames = blink_end_frame - blink_start_frame
56
- blink_duration_seconds = blink_duration_frames / fps # Convert frames to seconds
57
- blink_durations.append(blink_duration_seconds) # Store blink duration in seconds
58
- total_blinks += 1
59
- blink_counter = 0
60
-
61
- frame_count += 1 # Increment the frame counter for each loop iteration
62
-
63
- return blink_durations, total_blinks
64
-
65
-
66
- def detect_smiles(faces,smile_cascade):
67
- smiles=[]
68
- count=0
69
- for face in faces:
70
- if face is not None:
71
- smile = smile_cascade.detectMultiScale(face, scaleFactor=1.8, minNeighbors=20, minSize=(25, 25))
72
- if len(smile) > 0:
73
  smiles.append(True)
74
- count+=1
 
 
75
  else:
76
  smiles.append(False)
77
  else:
78
  smiles.append(None)
79
- return smiles,count
80
-
81
- def cal_yawn(landmarks):
82
- # Corrected lip landmark indices for dlib's 68-point model
83
- top_lip_idx = [50, 51, 52, 53, 61, 62, 63]
84
- low_lip_idx = [56, 57, 58, 59, 65, 66, 67]
85
 
86
- top_lip = np.array([landmarks[idx] for idx in top_lip_idx])
87
- low_lip = np.array([landmarks[idx] for idx in low_lip_idx])
88
 
89
- top_mean = np.mean(top_lip, axis=0)
90
- low_mean = np.mean(low_lip, axis=0)
91
 
92
- distance = dist.euclidean(top_mean, low_mean)
93
- return distance
94
 
95
- def detect_yawn(landmarks,sizes):
96
- yawn=[]
97
- count=0
98
- normalized_yawn_thresh = 0.25
99
- normalized_lip_distances=[]
100
- for landmark,size in zip(landmarks,sizes):
101
- if landmark is not None:
102
- lip_dist = cal_yawn(landmark)
103
- face_size = dist.euclidean(landmark[8], landmark[27])
104
- normalized_lip_dist = lip_dist / face_size
105
- normalized_lip_distances.append(normalized_lip_dist)
106
- if normalized_lip_dist > normalized_yawn_thresh:
107
- yawn.append(True)
108
- count+=1
109
  else:
110
- yawn.append(False)
111
  else:
112
- normalized_lip_distances.append(None)
113
- yawn.append(None)
114
- return yawn,normalized_lip_distances,count
 
28
  ear = (A + B) / (2.0 * C) # EAR formula
29
  return ear
30
 
31
+ def euclidean_distance(p1, p2):
32
+ return np.linalg.norm(p1 - p2)
33
 
34
+ # Function to detect smiles based on mouth aspect ratio
35
+ def detect_smiles(landmarks_list, face_sizes, fps=30, consecutive_frames=2):
36
+ smile_ratios = [] # Store the smile ratios for plotting
37
+ smiles = []
38
+ smile_durations = [] # To store the duration of each smile
39
+ total_smiles = 0
40
+ smile_in_progress = False
41
+ smile_start_frame = None
42
+ avg_dynamic_threshold=[]
43
+ for frame_idx, (landmarks, face_size) in enumerate(zip(landmarks_list, face_sizes)):
44
+ if landmarks is not None:
45
+ # Use NumPy array indices for the relevant mouth landmarks
46
+ left_corner = np.array(landmarks[48])
47
+ right_corner = np.array(landmarks[54])
48
+ top_lip = np.array(landmarks[51])
49
+ bottom_lip = np.array(landmarks[57])
50
+
51
+ mouth_width = euclidean_distance(left_corner, right_corner)
52
+ mouth_height = euclidean_distance(top_lip, bottom_lip)
53
+
54
+ face_width, face_height = face_size # face_size is (width, height)
55
+
56
+ if face_width > 0 and face_height > 0:
57
+ normalized_mouth_width = mouth_width / face_width
58
+ normalized_mouth_height = mouth_height / face_height
59
  else:
60
+ normalized_mouth_width = 0
61
+ normalized_mouth_height = 0
62
+
63
+ smile_ratios.append(normalized_mouth_width)
64
+ dynamic_threshold = 0.2 + (0.05 * face_width / 100)
65
+ avg_dynamic_threshold.append(dynamic_threshold)
66
+ # print(dynamic_threshold)
67
+ # Check if the smile meets the threshold
68
+ if (normalized_mouth_width > dynamic_threshold) and (normalized_mouth_height > 0.06):
69
  smiles.append(True)
70
+ if not smile_in_progress:
71
+ smile_in_progress = True
72
+ smile_start_frame = frame_idx # Record the start of the smile
73
  else:
74
  smiles.append(False)
75
+ if smile_in_progress and (frame_idx - smile_start_frame >= consecutive_frames):
76
+ smile_in_progress = False
77
+ smile_end_frame = frame_idx
78
+ smile_duration = (smile_end_frame - smile_start_frame) / fps # Calculate smile duration
79
+ smile_durations.append(smile_duration)
80
+ total_smiles += 1 # Increment total smile count
81
  else:
82
  smiles.append(None)
83
+ try:
84
+ avg_thr=sum(avg_dynamic_threshold)/len(avg_dynamic_threshold)
85
+ except ZeroDivisionError:
86
+ avg_thr=0
87
+ return smiles, smile_ratios, total_smiles, smile_durations,avg_thr
 
88
 
 
 
89
 
90
+
91
+
92
 
93
+ # Function to detect blinks based on the eye aspect ratio (EAR)
94
+ def detect_blinks(landmarks_list, face_sizes, ear_threshold=0.24, consecutive_frames=2):
95
+ ear_ratios = [] # Store EAR for plotting
96
+ blinks = []
97
+
98
+ # Variables to monitor consecutive low EAR values
99
+ blink_count = 0
100
+ consec_low_ear = 0
101
+
102
+ for landmarks, face in zip(landmarks_list, face_sizes):
103
+ if landmarks is not None:
104
+ left_eye = landmarks[36:42] # Points 36-41 (inclusive) for the left eye
105
+ right_eye = landmarks[42:48]
106
+
107
+ def eye_aspect_ratio(eye):
108
+ A = euclidean_distance(eye[1], eye[5])
109
+ B = euclidean_distance(eye[2], eye[4])
110
+ C = euclidean_distance(eye[0], eye[3])
111
+ ear = (A + B) / (2.0 * C)
112
+ return ear
113
+
114
+ left_ear = eye_aspect_ratio(left_eye)
115
+ right_ear = eye_aspect_ratio(right_eye)
116
+ avg_ear = (left_ear + right_ear) / 2.0
117
+
118
+ ear_ratios.append(avg_ear)
119
+
120
+ if avg_ear < ear_threshold:
121
+ consec_low_ear += 1
122
+ else:
123
+ # If low EAR is detected for enough consecutive frames, count as a blink
124
+ if consec_low_ear >= consecutive_frames:
125
+ blink_count += 1
126
+ consec_low_ear = 0 # Reset the consecutive low EAR counter
127
+ else:
128
+ blinks.append(None)
129
+
130
+ return blink_count, ear_ratios
131
 
132
+ # Function to detect yawns based on the vertical distance between top and bottom lips
133
+
134
+ def detect_yawns(landmarks_list, face_sizes, fps=30, consecutive_frames=3):
135
+ yawn_ratios = [] # Store the yawn ratios for plotting
136
+ yawns = []
137
+ yawn_durations = [] # To store the duration of each yawn
138
+ total_yawns = 0
139
+ yawn_in_progress = False
140
+ yawn_start_frame = None
141
+
142
+ for frame_idx, (landmarks, face_size) in enumerate(zip(landmarks_list, face_sizes)):
143
+ if landmarks is not None:
144
+ top_lip = np.array(landmarks[51])
145
+ bottom_lip = np.array(landmarks[57])
146
+
147
+ mouth_height = euclidean_distance(top_lip, bottom_lip)
148
+ face_width, face_height = face_size # face_size is (width, height)
149
+
150
+ if face_height > 0:
151
+ normalized_mouth_height = mouth_height / face_height
152
  else:
153
+ normalized_mouth_height = 0
154
+
155
+ yawn_ratios.append(normalized_mouth_height)
156
+
157
+ # Check if the yawn meets the threshold
158
+ if normalized_mouth_height > 0.24:
159
+ yawns.append(True)
160
+ if not yawn_in_progress:
161
+ yawn_in_progress = True
162
+ yawn_start_frame = frame_idx # Record the start of the yawn
163
+ else:
164
+ yawns.append(False)
165
+ if yawn_in_progress and (frame_idx - yawn_start_frame >= consecutive_frames):
166
+ yawn_in_progress = False
167
+ yawn_end_frame = frame_idx
168
+ yawn_duration = (yawn_end_frame - yawn_start_frame) / fps # Calculate yawn duration
169
+ yawn_durations.append(yawn_duration)
170
+ total_yawns += 1 # Increment total yawn count
171
  else:
172
+ yawns.append(None)
173
+
174
+ return yawns, yawn_ratios, total_yawns, yawn_durations
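
Note: the new detectors all normalise mouth/eye geometry by face size and then threshold it (average EAR below 0.24 for at least consecutive_frames for blinks, a dynamic mouth-width ratio for smiles, a mouth-height ratio above 0.24 for yawns). A self-contained sketch of the EAR formula used by detect_blinks, on synthetic landmark points (illustration only, not repository data):

import numpy as np

def eye_aspect_ratio(eye):
    # eye: six (x, y) landmarks in dlib's 68-point ordering (p1..p6)
    A = np.linalg.norm(eye[1] - eye[5])  # first vertical distance
    B = np.linalg.norm(eye[2] - eye[4])  # second vertical distance
    C = np.linalg.norm(eye[0] - eye[3])  # horizontal distance
    return (A + B) / (2.0 * C)

open_eye = np.array([[0, 0], [1, -1], [2, -1], [3, 0], [2, 1], [1, 1]], dtype=float)
closed_eye = np.array([[0, 0], [1, -0.1], [2, -0.1], [3, 0], [2, 0.1], [1, 0.1]], dtype=float)
print(eye_aspect_ratio(open_eye))    # ~0.67 -> eye open
print(eye_aspect_ratio(closed_eye))  # ~0.07 -> below the 0.24 threshold, counts toward a blink
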
main.py CHANGED
@@ -6,12 +6,14 @@ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
6
  import logging
7
  logging.getLogger('absl').setLevel(logging.ERROR)
8
  from functions.models import models_dict
9
- from functions.helper import extract_faces_from_frames
10
- from functions.video import eyebrow,blinks,detect_yawn,detect_smiles
11
  from functions.valence_arousal import va_predict
12
  from functions.fer import fer_predict,plot_graph
 
13
  from moviepy.editor import VideoFileClip
14
  import json
 
15
  import pandas as pd
16
  from typing import Callable
17
  from functions.audio import extract_audio_features
@@ -57,7 +59,9 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
57
  os.makedirs(folder_path,exist_ok=True)
58
  meta_data_path=os.path.join(folder_path,'metadata.json')
59
  valence_plot=os.path.join(folder_path,"vas.png")
 
60
  df_path=os.path.join(folder_path,'data.csv')
 
61
 
62
  video_clip=VideoFileClip(video_path)
63
  video_clip=video_clip.set_fps(fps)
@@ -72,8 +76,8 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
72
 
73
 
74
  # faces=[extract_face(frame) for frame in tqdm(video_frames)]
75
- af=extract_audio_features(audio_path,asrmodel,asrproc,sentipipe)
76
-
77
 
78
  fer_emotions,class_wise_frame_count,em_tensors=fer_predict(faces,fps,fer_model)
79
  valence_list,arousal_list,stress_list=va_predict(valence_arousal_model,val_ar_feat_model,faces,list(em_tensors))
@@ -81,30 +85,35 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
81
 
82
  eyebrow_dist=eyebrow(landmarks,sizes)
83
  print('eyebrow done')
84
- blink_durations,total_blinks=blinks(landmarks,sizes,fps)
85
- print('blinks done')
86
- smiles,smile_count=detect_smiles(faces,smile_cascade)
87
- print('smiles done')
88
- yawn,normalized_lip_distances,yawn_count=detect_yawn(landmarks,sizes)
 
 
 
89
print('yawn done')
90
 
 
 
 
 
91
 
92
- y_vals = [valence_list, arousal_list, stress_list,eyebrow_dist]
93
- labels = ['Valence', 'Arousal', 'Stress',"EyeBrowDistance"]
94
- plot_graph(timestamps, y_vals, labels, valence_plot)
95
  print('graph_plotted')
96
  meta_data={}
 
97
  meta_data['facial_emotion_recognition'] = {
98
  "class_wise_frame_count": class_wise_frame_count,
99
  }
100
  meta_data['audio']=af
101
 
102
- meta_data['blinks']={
103
- 'blink_durations':blink_durations,
104
- 'total_blinks':total_blinks
105
- }
106
- meta_data['smile']=smile_count
107
- meta_data['yawn']=yawn_count
108
  with open(meta_data_path, 'w') as json_file:
109
  json.dump(meta_data, json_file, indent=4)
110
  df=pd.DataFrame(
 
6
  import logging
7
  logging.getLogger('absl').setLevel(logging.ERROR)
8
  from functions.models import models_dict
9
+ from functions.helper import extract_faces_from_frames,make_pdf
10
+ from functions.video import eyebrow,detect_blinks,detect_yawns,detect_smiles
11
  from functions.valence_arousal import va_predict
12
  from functions.fer import fer_predict,plot_graph
13
+ from functions.helper import plot_facial_expression_graphs
14
  from moviepy.editor import VideoFileClip
15
  import json
16
+ # from trash import detect_eyes_in_faces
17
  import pandas as pd
18
  from typing import Callable
19
  from functions.audio import extract_audio_features
 
59
  os.makedirs(folder_path,exist_ok=True)
60
  meta_data_path=os.path.join(folder_path,'metadata.json')
61
  valence_plot=os.path.join(folder_path,"vas.png")
62
+ word_cloud=os.path.join(folder_path,'wordcloud.jpg')
63
  df_path=os.path.join(folder_path,'data.csv')
64
+ pdf_filename = os.path.join(folder_path,"formatted_output_with_plots.pdf")
65
 
66
  video_clip=VideoFileClip(video_path)
67
  video_clip=video_clip.set_fps(fps)
 
76
 
77
 
78
  # faces=[extract_face(frame) for frame in tqdm(video_frames)]
79
+ af,pitches=extract_audio_features(audio_path,asrmodel,asrproc,sentipipe,duration,word_cloud)
80
+ pitches=[float(pitch) for pitch in pitches]
81
 
82
  fer_emotions,class_wise_frame_count,em_tensors=fer_predict(faces,fps,fer_model)
83
  valence_list,arousal_list,stress_list=va_predict(valence_arousal_model,val_ar_feat_model,faces,list(em_tensors))
 
85
 
86
  eyebrow_dist=eyebrow(landmarks,sizes)
87
  print('eyebrow done')
88
+
89
+ blink_count, ear_ratios=detect_blinks(landmarks,sizes)
90
+ ear_ratios=[float(ratio) for ratio in ear_ratios]
91
+ print('blinks done',blink_count)
92
+ smiles, smile_ratios, total_smiles, smile_durations,smile_threshold=detect_smiles(landmarks,sizes)
93
+ smile_ratios=[float(smile) for smile in smile_ratios]
94
+ print('smiles done',total_smiles)
95
+ yawns, yawn_ratios, total_yawns, yawn_durations=detect_yawns(landmarks,sizes)
96
print('yawn done')
97
 
98
+ thresholds=[smile_threshold,0.225,0.22]
99
+ buffer = plot_facial_expression_graphs(smile_ratios, ear_ratios, yawn_ratios, thresholds, 'path_to_save_plot.pdf')
100
+
101
+ # print("detect_eyes : ",detect_eyes_in_faces(faces))
102
 
103
+ y_vals = [valence_list, arousal_list, stress_list,eyebrow_dist,pitches]
104
+ labels = ['Valence', 'Arousal', 'Stress',"EyeBrowDistance","Pitch"]
105
+ buf=plot_graph(timestamps, y_vals, labels, valence_plot)
106
  print('graph_plotted')
107
  meta_data={}
108
+ meta_data['duration']=duration
109
  meta_data['facial_emotion_recognition'] = {
110
  "class_wise_frame_count": class_wise_frame_count,
111
  }
112
  meta_data['audio']=af
113
 
114
+
115
+ make_pdf(pdf_filename,meta_data,buf,buffer)
116
+
 
 
 
117
  with open(meta_data_path, 'w') as json_file:
118
  json.dump(meta_data, json_file, indent=4)
119
  df=pd.DataFrame(
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ