Upload 15 files
- app.py +1 -1
- functions/__pycache__/audio.cpython-312.pyc +0 -0
- functions/__pycache__/fer.cpython-312.pyc +0 -0
- functions/__pycache__/helper.cpython-312.pyc +0 -0
- functions/__pycache__/video.cpython-312.pyc +0 -0
- functions/audio.py +145 -99
- functions/fer.py +10 -4
- functions/helper.py +125 -2
- functions/video.py +131 -71
- main.py +27 -18
- requirements.txt +0 -0
app.py
CHANGED
@@ -70,5 +70,5 @@ if uploaded_file is not None:
 
     # Clean up temporary files
     os.remove(temp_file_path)
-    shutil.rmtree(output_folder)
+    # shutil.rmtree(output_folder)
    os.remove(zip_file_path)
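A minimal sketch, not part of the commit: rather than commenting out shutil.rmtree, the cleanup could be gated behind a flag so the intent stays explicit. temp_file_path, output_folder and zip_file_path are the names app.py already uses; KEEP_OUTPUTS is a hypothetical flag added here only for illustration.

import os
import shutil

KEEP_OUTPUTS = True   # hypothetical flag; set False to restore the pre-commit cleanup

os.remove(temp_file_path)
if not KEEP_OUTPUTS:
    shutil.rmtree(output_folder)
os.remove(zip_file_path)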
functions/__pycache__/audio.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/audio.cpython-312.pyc and b/functions/__pycache__/audio.cpython-312.pyc differ
functions/__pycache__/fer.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/fer.cpython-312.pyc and b/functions/__pycache__/fer.cpython-312.pyc differ
functions/__pycache__/helper.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/helper.cpython-312.pyc and b/functions/__pycache__/helper.cpython-312.pyc differ
functions/__pycache__/video.cpython-312.pyc
CHANGED
Binary files a/functions/__pycache__/video.cpython-312.pyc and b/functions/__pycache__/video.cpython-312.pyc differ
functions/audio.py
CHANGED
@@ -1,99 +1,145 @@
-import librosa
-import numpy as np
-import torch
-from collections import Counter
-import nltk
-[… old lines 6-99 were removed; their content is not preserved in this view …]
+import librosa
+import numpy as np
+import torch
+from collections import Counter
+import nltk
+import string
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+def get_pitch_list(y,sr):
+    hop_length = int(sr / 30) # hop_length determines how far apart the frames are
+
+    # Extract the pitch (F0) using librosa's piptrack method
+    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
+
+    # Get the pitch frequencies from the pitch array
+    pitch_frequencies = []
+
+    for t in range(pitches.shape[1]):
+        index = magnitudes[:, t].argmax() # Get the index of the maximum magnitude
+        pitch = pitches[index, t]
+
+        pitch_frequencies.append(pitch)
+
+    # Convert pitch_frequencies to a NumPy array
+    pitch_frequencies = np.array(pitch_frequencies)
+    print("shape : ",pitch_frequencies.shape)
+    return pitch_frequencies
+
+
+def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
+    y, sr = librosa.load(audio_path, sr=16000)
+    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
+    inputs = inputs.to(device, dtype=torch_dtype)
+    with torch.no_grad():
+        generated_ids = asrmodel.generate(inputs)
+    transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+    # Sound intensity (RMS)
+    rms = librosa.feature.rms(y=y)
+    sound_intensity = np.mean(rms)
+
+    # Pitch list
+    pitches=get_pitch_list(y,sr)
+
+    # Fundamental frequency (F0)
+    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
+    fundamental_frequency = np.nanmean(f0)
+
+    # Spectral energy (based on STFT)
+    S = np.abs(librosa.stft(y))
+    spectral_energy = np.mean(np.sum(S ** 2, axis=0))
+
+    # Spectral centroid
+    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+    avg_spectral_centroid = np.mean(spectral_centroid)
+
+    # Zero-crossing rate
+    zcr = librosa.feature.zero_crossing_rate(y)
+    zero_crossing_rate = np.mean(zcr)
+
+    # Pause detection
+    silence_threshold = -40
+    silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
+    pause_duration = 0
+    for start, end in silent_intervals:
+        pause_duration += (end - start) / sr
+
+    total_duration = librosa.get_duration(y=y, sr=sr)
+    pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute
+
+    # Transcript processing
+    words = nltk.word_tokenize(transcript)
+    words = [word.lower() for word in words if word not in string.punctuation]
+    num_words = len(words)
+    unique_words = len(set(words))
+    word_frequencies = Counter(words)
+
+    # Duration in minutes
+    duration_minutes = total_duration / 60
+    avg_words_per_minute = num_words / duration_minutes
+    avg_unique_words_per_minute = unique_words / duration_minutes
+
+    # Filler word detection
+    filler_words = [
+        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
+        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
+        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
+        'totally', 'honestly', 'seriously', 'alright'
+    ]
+    filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
+    filler_words_per_minute = filler_word_count / duration_minutes
+
+    # POS tagging
+    pos_tags = nltk.pos_tag(words)
+    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
+    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
+    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
+
+    # Sentiment analysis
+    sentiment = sentipipe(transcript)
+    sentiment_mapping = {
+        "LABEL_0": "Negative",
+        "LABEL_1": "Neutral",
+        "LABEL_2": "Positive"
+    }
+    sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']]
+
+    # Generate Word Cloud and Save it as an Image
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
+
+    # Save the Word Cloud to the provided path
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')
+    plt.savefig(wordcloud_path, format='png')
+    plt.close()
+
+    print("Nouns: ", nouns)
+    print("Adjectives: ", adjectives)
+    print("Verbs: ", verbs)
+    print("Sentiment: ", sentiment)
+
+    return {
+        "transcript": transcript,
+        "sentiment": sentiment,
+        "sound_intensity": float(sound_intensity),
+        "fundamental_frequency": float(fundamental_frequency),
+        "spectral_energy": float(spectral_energy),
+        "spectral_centroid": float(avg_spectral_centroid),
+        "zero_crossing_rate": float(zero_crossing_rate),
+        "avg_words_per_minute": float(avg_words_per_minute),
+        "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
+        "unique_word_count": int(unique_words),
+        "filler_words_per_minute": float(filler_words_per_minute),
+        "noun_count": len(nouns),
+        "adjective_count": len(adjectives),
+        "verb_count": len(verbs),
+        "pause_rate": float(pause_rate)
+    },pitches
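For orientation, a minimal usage sketch (not part of the commit) of the new six-argument extract_audio_features. The Whisper and sentiment checkpoints named below are assumptions; the Space actually loads its models through functions/models.py, and sample.wav / wordcloud.png are placeholder paths.

import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from functions.audio import extract_audio_features

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Assumed checkpoints: any Whisper ASR model plus a sentiment model that emits LABEL_0/1/2
asrproc = WhisperProcessor.from_pretrained("openai/whisper-base")
asrmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base", torch_dtype=dtype).to(device)
sentipipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

features, pitches = extract_audio_features(
    audio_path="sample.wav",        # placeholder audio file
    asrmodel=asrmodel,
    asrproc=asrproc,
    sentipipe=sentipipe,
    duration=42.0,                  # seconds; main.py passes the clip duration here
    wordcloud_path="wordcloud.png", # where the word-cloud PNG is written
)
print(features["transcript"], features["pause_rate"], len(pitches))

The function returns the feature dictionary plus the raw pitch track; main.py casts the pitches to plain floats before plotting them alongside valence, arousal and stress.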
functions/fer.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')
+from io import BytesIO
 
 import torch.nn.functional as F
 import pandas as pd
@@ -113,13 +114,14 @@ def plot_graph(x, y_vals, labels, path, calib_vals=None):
     - path: Path to save the combined plot.
     - calib_vals: List of calibration values for each variable (optional).
     """
+    buf = BytesIO()
     plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
 
     # Iterate over y-values, labels, and calibration values to create subplots
     for i, (y, label) in enumerate(zip(y_vals, labels)):
         y = [value if isinstance(value, (int, float)) else np.nan for value in y]
-
-
+        # Create a subplot (n rows, 1 column, and the current subplot index)
+
         plt.subplot(len(y_vals), 1, i+1)
         plt.plot(range(len(x)), y, linestyle='-')
 
@@ -133,5 +135,9 @@ def plot_graph(x, y_vals, labels, path, calib_vals=None):
         plt.legend()
 
     plt.tight_layout() # Adjust layout to prevent overlap
-    plt.savefig(
-    plt.clf() # Clear the figure after saving
+    plt.savefig(buf, format='png')
+    plt.clf() # Clear the figure after saving
+    buf.seek(0)
+    return buf
+
+
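A small sketch, not part of the commit, showing how the buffer returned by the reworked plot_graph can be consumed. The series below are synthetic placeholders; the path argument is still accepted by the signature even though the figure is now written to an in-memory buffer.

from functions.fer import plot_graph

frames = list(range(10))                               # placeholder frame indices
y_vals = [[0.1 * i for i in frames], [1.0 - 0.05 * i for i in frames]]
labels = ["Valence", "Arousal"]

buf = plot_graph(frames, y_vals, labels, "unused.png") # returns a BytesIO holding the PNG
with open("vas.png", "wb") as f:                       # persist it, or hand it to make_pdf
    f.write(buf.getvalue())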
functions/helper.py
CHANGED
@@ -2,8 +2,13 @@ import cv2
 import numpy as np
 import dlib
 from tqdm import tqdm
-
-
+from reportlab.lib.pagesizes import A4
+from reportlab.lib import colors
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib.units import inch
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer,Image
+from io import BytesIO
+import matplotlib.pyplot as plt
 def extract_face(image, net, predictor):
     (h, w) = image.shape[:2]
     blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
@@ -56,3 +61,121 @@ def extract_faces_from_frames(frames, net, predictor):
 
     return faces_list, landmarks_list, sizes_list
 
+def make_pdf(file_path,data,buf,buf2):
+    doc = SimpleDocTemplate(file_path, pagesize=A4)
+
+    # Define styles
+    styles = getSampleStyleSheet()
+    content = []
+
+    # Adding title
+    content.append(Paragraph("Facial Emotion Recognition Report", styles['Title']))
+    content.append(Spacer(1, 12))
+
+    # Section 1: Facial Emotion Recognition
+    content.append(Paragraph("Facial Emotion Recognition", styles['Heading2']))
+    table_data = [["Emotion", "Frame Count"]]
+    for emotion, count in data["facial_emotion_recognition"]["class_wise_frame_count"].items():
+        table_data.append([emotion.capitalize(), str(count)])
+
+    table = Table(table_data, hAlign='LEFT')
+    table.setStyle(TableStyle([
+        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
+        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+        ('GRID', (0, 0), (-1, -1), 1, colors.black),
+    ]))
+    content.append(table)
+    content.append(Spacer(1, 12))
+
+    # Section 2: Audio Analysis
+    content.append(Paragraph("Audio Analysis", styles['Heading2']))
+    content.append(Paragraph(f"Transcript: {data['audio']['transcript']}", styles['BodyText']))
+
+    sentiment = data['audio']['sentiment'][0]
+    content.append(Paragraph(f"Sentiment: {sentiment['label']} (Score: {sentiment['score']})", styles['BodyText']))
+
+    audio_features = [
+        f"Video Duration:{data['duration']}",
+        f"Sound Intensity: {data['audio']['sound_intensity']}",
+        f"Fundamental Frequency: {data['audio']['fundamental_frequency']}",
+        f"Spectral Energy: {data['audio']['spectral_energy']}",
+        f"Spectral Centroid: {data['audio']['spectral_centroid']}",
+        f"Zero Crossing Rate: {data['audio']['zero_crossing_rate']}",
+        f"Average Words per Minute: {data['audio']['avg_words_per_minute'] if data['duration']>60 else -1}",
+        f"Average Unique Words per Minute: {data['audio']['avg_unique_words_per_minute'] if data['duration']>60 else -1}",
+        f"Unique Word Count: {data['audio']['unique_word_count']}",
+        f"Filler Words per Minute: {data['audio']['filler_words_per_minute']}",
+        f"Noun Count: {data['audio']['noun_count']}",
+        f"Adjective Count: {data['audio']['adjective_count']}",
+        f"Verb Count: {data['audio']['verb_count']}",
+        f"Pause Rate: {data['audio']['pause_rate']}"
+    ]
+
+    for feature in audio_features:
+        content.append(Paragraph(feature, styles['BodyText']))
+    content.append(Spacer(1, 12))
+
+    plot_image = Image(buf)
+    plot_image.drawHeight = 600 # Adjust height
+    plot_image.drawWidth = 600 # Adjust width
+    content.append(plot_image)
+    plot_image = Image(buf2)
+    plot_image.drawHeight = 600 # Adjust height
+    plot_image.drawWidth = 600 # Adjust width
+    content.append(plot_image)
+    # Build the PDF
+    doc.build(content)
+
+
+
+def plot_facial_expression_graphs(smile_data, ear_data, yawn_data, thresholds, path):
+    """
+    Plots multiple subplots (smile, EAR, and yawn ratios) in one figure.
+
+    Parameters:
+    - smile_data: List of smile ratios.
+    - ear_data: List of eye aspect ratios (EAR).
+    - yawn_data: List of yawn ratios.
+    - thresholds: List containing thresholds for smile, EAR, and yawn.
+    - path: Path to save the combined plot.
+
+    Returns:
+    - buf: BytesIO buffer containing the saved plot.
+    """
+    buf = BytesIO()
+    plt.figure(figsize=(12, 8)) # Create a figure of appropriate size
+
+    # Plot smile data
+    plt.subplot(3, 1, 1)
+    plt.plot(smile_data, label='Smile Ratio (Width/Face Width)')
+    plt.axhline(y=thresholds[0], color='black', linestyle='--', label='Threshold')
+    plt.title('Smile Ratio Over Time')
+    plt.ylabel('Ratio')
+    plt.legend()
+
+    # Plot EAR data
+    plt.subplot(3, 1, 2)
+    plt.plot(ear_data, label='Eye Aspect Ratio (EAR)', color='orange')
+    plt.axhline(y=thresholds[1], color='black', linestyle='--', label='Threshold')
+    plt.title('Eye Aspect Ratio (EAR) Over Time')
+    plt.ylabel('Ratio')
+    plt.legend()
+
+    # Plot yawn data
+    plt.subplot(3, 1, 3)
+    plt.plot(yawn_data, label='Yawn Ratio (Mouth Height/Face Height)', color='red')
+    plt.axhline(y=thresholds[2], color='black', linestyle='--', label='Threshold')
+    plt.title('Yawn Ratio Over Time')
+    plt.xlabel('Frames')
+    plt.ylabel('Ratio')
+    plt.legend()
+
+    plt.tight_layout() # Adjust layout to prevent overlap
+    plt.savefig(buf, format='png') # Save to buffer
+    plt.clf() # Clear the figure after saving
+    buf.seek(0) # Rewind the buffer to the beginning
+    return buf
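A minimal sketch, not part of the commit, of calling the new plot_facial_expression_graphs with synthetic ratio series. The EAR and yawn thresholds mirror the values main.py passes; the smile threshold there is computed dynamically, so 0.23 is a stand-in. The path argument is accepted although the figure comes back as an in-memory buffer.

from functions.helper import plot_facial_expression_graphs

smile_ratios = [0.18, 0.22, 0.27, 0.21]   # smile width / face width per frame
ear_ratios   = [0.30, 0.28, 0.20, 0.31]   # eye aspect ratio per frame
yawn_ratios  = [0.10, 0.12, 0.26, 0.11]   # mouth height / face height per frame

buf = plot_facial_expression_graphs(
    smile_ratios, ear_ratios, yawn_ratios,
    thresholds=[0.23, 0.225, 0.22],       # smile (stand-in), EAR, yawn thresholds
    path="unused.png",
)
with open("expressions.png", "wb") as f:  # the buffer can also go straight into make_pdf
    f.write(buf.getvalue())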
functions/video.py
CHANGED
@@ -28,87 +28,147 @@ def eye_aspect_ratio(eye):
     ear = (A + B) / (2.0 * C) # EAR formula
     return ear
 
-def
-    blink_counter = 0
-    total_blinks = 0
-    EYE_AR_THRESH = 0.24 # EAR threshold for blink detection
-    EYE_AR_CONSEC_FRAMES = 4 # Consecutive frames for blink detection
-
-    frame_count = 0 # Initialize frame counter
-
-    for landmark, size in zip(landmarks, sizes):
-        if landmark is not None:
-            leftEye = landmark[lStart:lEnd]
-            rightEye = landmark[rStart:rEnd]
-
-            leftEAR = eye_aspect_ratio(leftEye)
-            rightEAR = eye_aspect_ratio(rightEye)
-[… several removed lines are not preserved in this view …]
-    return blink_durations, total_blinks
-
-def detect_smiles(faces,smile_cascade):
-    smiles=[]
-    count=0
-    for face in faces:
-        if face is not None:
-            smile = smile_cascade.detectMultiScale(face, scaleFactor=1.8, minNeighbors=20, minSize=(25, 25))
-            if len(smile) > 0:
-                smiles.append(True)
-            else:
-                smiles.append(False)
-        else:
-            smiles.append(None)
-[… several removed lines are not preserved in this view …]
-    low_lip_idx = [56, 57, 58, 59, 65, 66, 67]
-    top_lip = np.array([landmarks[idx] for idx in top_lip_idx])
-    low_lip = np.array([landmarks[idx] for idx in low_lip_idx])
-[… the remaining removed lines (old lines 88-114), ending in a trailing return, are not preserved …]
+def euclidean_distance(p1, p2):
+    return np.linalg.norm(p1 - p2)
+
+# Function to detect smiles based on mouth aspect ratio
+def detect_smiles(landmarks_list, face_sizes, fps=30, consecutive_frames=2):
+    smile_ratios = [] # Store the smile ratios for plotting
+    smiles = []
+    smile_durations = [] # To store the duration of each smile
+    total_smiles = 0
+    smile_in_progress = False
+    smile_start_frame = None
+    avg_dynamic_threshold=[]
+    for frame_idx, (landmarks, face_size) in enumerate(zip(landmarks_list, face_sizes)):
+        if landmarks is not None:
+            # Use NumPy array indices for the relevant mouth landmarks
+            left_corner = np.array(landmarks[48])
+            right_corner = np.array(landmarks[54])
+            top_lip = np.array(landmarks[51])
+            bottom_lip = np.array(landmarks[57])
+
+            mouth_width = euclidean_distance(left_corner, right_corner)
+            mouth_height = euclidean_distance(top_lip, bottom_lip)
+
+            face_width, face_height = face_size # face_size is (width, height)
+
+            if face_width > 0 and face_height > 0:
+                normalized_mouth_width = mouth_width / face_width
+                normalized_mouth_height = mouth_height / face_height
+            else:
+                normalized_mouth_width = 0
+                normalized_mouth_height = 0
+
+            smile_ratios.append(normalized_mouth_width)
+            dynamic_threshold = 0.2 + (0.05 * face_width / 100)
+            avg_dynamic_threshold.append(dynamic_threshold)
+            # print(dynamic_threshold)
+            # Check if the smile meets the threshold
+            if (normalized_mouth_width > dynamic_threshold) and (normalized_mouth_height > 0.06):
+                smiles.append(True)
+                if not smile_in_progress:
+                    smile_in_progress = True
+                    smile_start_frame = frame_idx # Record the start of the smile
+            else:
+                smiles.append(False)
+                if smile_in_progress and (frame_idx - smile_start_frame >= consecutive_frames):
+                    smile_in_progress = False
+                    smile_end_frame = frame_idx
+                    smile_duration = (smile_end_frame - smile_start_frame) / fps # Calculate smile duration
+                    smile_durations.append(smile_duration)
+                    total_smiles += 1 # Increment total smile count
+        else:
+            smiles.append(None)
+    try:
+        avg_thr=sum(avg_dynamic_threshold)/len(avg_dynamic_threshold)
+    except:
+        avg_thr=0
+    return smiles, smile_ratios, total_smiles, smile_durations,avg_thr
+
+
+# Function to detect blinks based on the eye aspect ratio (EAR)
+import numpy as np
+
+# Function to detect blinks based on the eye aspect ratio (EAR)
+def detect_blinks(landmarks_list, face_sizes, ear_threshold=0.24, consecutive_frames=2):
+    ear_ratios = [] # Store EAR for plotting
+    blinks = []
+
+    # Variables to monitor consecutive low EAR values
+    blink_count = 0
+    consec_low_ear = 0
+
+    for landmarks, face in zip(landmarks_list, face_sizes):
+        if landmarks is not None:
+            left_eye = landmarks[36:42] # Points 36-41 (inclusive) for the left eye
+            right_eye = landmarks[42:48]
+
+            def eye_aspect_ratio(eye):
+                A = euclidean_distance(eye[1], eye[5])
+                B = euclidean_distance(eye[2], eye[4])
+                C = euclidean_distance(eye[0], eye[3])
+                ear = (A + B) / (2.0 * C)
+                return ear
+
+            left_ear = eye_aspect_ratio(left_eye)
+            right_ear = eye_aspect_ratio(right_eye)
+            avg_ear = (left_ear + right_ear) / 2.0
+
+            ear_ratios.append(avg_ear)
+
+            if avg_ear < ear_threshold:
+                consec_low_ear += 1
+            else:
+                # If low EAR is detected for enough consecutive frames, count as a blink
+                if consec_low_ear >= consecutive_frames:
+                    blink_count += 1
+                consec_low_ear = 0 # Reset the consecutive low EAR counter
+        else:
+            blinks.append(None)
+
+    return blink_count, ear_ratios
+
+# Function to detect yawns based on the vertical distance between top and bottom lips
+# Function to detect yawns based on the vertical distance between top and bottom lips
+def detect_yawns(landmarks_list, face_sizes, fps=30, consecutive_frames=3):
+    yawn_ratios = [] # Store the yawn ratios for plotting
+    yawns = []
+    yawn_durations = [] # To store the duration of each yawn
+    total_yawns = 0
+    yawn_in_progress = False
+    yawn_start_frame = None
+
+    for frame_idx, (landmarks, face_size) in enumerate(zip(landmarks_list, face_sizes)):
+        if landmarks is not None:
+            top_lip = np.array(landmarks[51])
+            bottom_lip = np.array(landmarks[57])
+
+            mouth_height = euclidean_distance(top_lip, bottom_lip)
+            face_width, face_height = face_size # face_size is (width, height)
+
+            if face_height > 0:
+                normalized_mouth_height = mouth_height / face_height
+            else:
+                normalized_mouth_height = 0
+
+            yawn_ratios.append(normalized_mouth_height)
+
+            # Check if the yawn meets the threshold
+            if normalized_mouth_height > 0.24:
+                yawns.append(True)
+                if not yawn_in_progress:
+                    yawn_in_progress = True
+                    yawn_start_frame = frame_idx # Record the start of the yawn
+            else:
+                yawns.append(False)
+                if yawn_in_progress and (frame_idx - yawn_start_frame >= consecutive_frames):
+                    yawn_in_progress = False
+                    yawn_end_frame = frame_idx
+                    yawn_duration = (yawn_end_frame - yawn_start_frame) / fps # Calculate yawn duration
+                    yawn_durations.append(yawn_duration)
+                    total_yawns += 1 # Increment total yawn count
+        else:
+            yawns.append(None)
+
+    return yawns, yawn_ratios, total_yawns, yawn_durations
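A minimal sketch, not part of the commit, driving the new detectors with synthetic 68-point landmark arrays in the (x, y) layout dlib's shape predictor produces; in the Space these come from extract_faces_from_frames, and the random values here are placeholders.

import numpy as np
from functions.video import detect_smiles, detect_blinks, detect_yawns

rng = np.random.default_rng(0)
landmarks_list = [rng.uniform(0, 200, size=(68, 2)) for _ in range(30)]  # one landmark array per frame
face_sizes = [(200, 200)] * 30                                           # (width, height) per frame

smiles, smile_ratios, total_smiles, smile_durations, avg_thr = detect_smiles(landmarks_list, face_sizes, fps=30)
blink_count, ear_ratios = detect_blinks(landmarks_list, face_sizes)      # default ear_threshold=0.24
yawns, yawn_ratios, total_yawns, yawn_durations = detect_yawns(landmarks_list, face_sizes, fps=30)
print(total_smiles, blink_count, total_yawns)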
main.py
CHANGED
@@ -6,12 +6,14 @@ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 import logging
 logging.getLogger('absl').setLevel(logging.ERROR)
 from functions.models import models_dict
-from functions.helper import extract_faces_from_frames
-from functions.video import eyebrow,
+from functions.helper import extract_faces_from_frames,make_pdf
+from functions.video import eyebrow,detect_blinks,detect_yawns,detect_smiles
 from functions.valence_arousal import va_predict
 from functions.fer import fer_predict,plot_graph
+from functions.helper import plot_facial_expression_graphs
 from moviepy.editor import VideoFileClip
 import json
+# from trash import detect_eyes_in_faces
 import pandas as pd
 from typing import Callable
 from functions.audio import extract_audio_features
@@ -57,7 +59,9 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
     os.makedirs(folder_path,exist_ok=True)
     meta_data_path=os.path.join(folder_path,'metadata.json')
     valence_plot=os.path.join(folder_path,"vas.png")
+    word_cloud=os.path.join(folder_path,'wordcloud.jpg')
     df_path=os.path.join(folder_path,'data.csv')
+    pdf_filename = os.path.join(folder_path,"formatted_output_with_plots.pdf")
 
     video_clip=VideoFileClip(video_path)
     video_clip=video_clip.set_fps(fps)
@@ -72,8 +76,8 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
 
 
     # faces=[extract_face(frame) for frame in tqdm(video_frames)]
-    af=extract_audio_features(audio_path,asrmodel,asrproc,sentipipe)
-
+    af,pitches=extract_audio_features(audio_path,asrmodel,asrproc,sentipipe,duration,word_cloud)
+    pitches=[float(pitch) for pitch in pitches]
 
     fer_emotions,class_wise_frame_count,em_tensors=fer_predict(faces,fps,fer_model)
     valence_list,arousal_list,stress_list=va_predict(valence_arousal_model,val_ar_feat_model,faces,list(em_tensors))
@@ -81,30 +85,35 @@ def analyze_live_video(video_path: str, uid: str, user_id: str, count: int, fina
 
     eyebrow_dist=eyebrow(landmarks,sizes)
     print('eyebrow done')
-
-
-
-    print('
-
+
+    blink_count, ear_ratios=detect_blinks(landmarks,sizes,fps)
+    ear_ratios=[float(pitch) for pitch in ear_ratios]
+    print('blinks done',blink_count)
+    smiles, smile_ratios, total_smiles, smile_durations,smile_threshold=detect_smiles(landmarks,sizes)
+    smile_ratios=[float(smile) for smile in smile_ratios]
+    print('smiles done',total_smiles)
+    yawns, yawn_ratios, total_yawns, yawn_durations=detect_yawns(landmarks,sizes)
     print('ywan done')
 
+    thresholds=[smile_threshold,0.225,0.22]
+    buffer = plot_facial_expression_graphs(smile_ratios, ear_ratios, yawn_ratios, thresholds, 'path_to_save_plot.pdf')
+
+    # print("detect_eyes : ",detect_eyes_in_faces(faces))
 
-    y_vals = [valence_list, arousal_list, stress_list,eyebrow_dist]
-    labels = ['Valence', 'Arousal', 'Stress',"EyeBrowDistance"]
-    plot_graph(timestamps, y_vals, labels, valence_plot)
+    y_vals = [valence_list, arousal_list, stress_list,eyebrow_dist,pitches]
+    labels = ['Valence', 'Arousal', 'Stress',"EyeBrowDistance","Pitch"]
+    buf=plot_graph(timestamps, y_vals, labels, valence_plot)
     print('graph_plotted')
     meta_data={}
+    meta_data['duration']=duration
     meta_data['facial_emotion_recognition'] = {
         "class_wise_frame_count": class_wise_frame_count,
     }
     meta_data['audio']=af
 
-
-
-
-    }
-    meta_data['smile']=smile_count
-    meta_data['yawn']=yawn_count
+
+    make_pdf(pdf_filename,meta_data,buf,buffer)
+
     with open(meta_data_path, 'w') as json_file:
         json.dump(meta_data, json_file, indent=4)
     df=pd.DataFrame(
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ