Rahulk2197 committed on
Commit e61da93 · verified · 1 Parent(s): dd3c6f1

Update functions/audio.py

Files changed (1)
  1. functions/audio.py +147 -145
functions/audio.py CHANGED
@@ -1,145 +1,147 @@
 import librosa
 import numpy as np
 import torch
 from collections import Counter
 import nltk
 import string
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 
 nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('averaged_perceptron_tagger_eng')
 nltk.download('averaged_perceptron_tagger')
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
 def get_pitch_list(y,sr):
     hop_length = int(sr / 30)  # hop_length determines how far apart the frames are
 
     # Extract the pitch (F0) using librosa's piptrack method
     pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
 
     # Get the pitch frequencies from the pitch array
     pitch_frequencies = []
 
     for t in range(pitches.shape[1]):
         index = magnitudes[:, t].argmax()  # Get the index of the maximum magnitude
         pitch = pitches[index, t]
 
         pitch_frequencies.append(pitch)
 
     # Convert pitch_frequencies to a NumPy array
     pitch_frequencies = np.array(pitch_frequencies)
     print("shape : ", pitch_frequencies.shape)
     return pitch_frequencies
 
 
 def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
     y, sr = librosa.load(audio_path, sr=16000)
     inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
     inputs = inputs.to(device, dtype=torch_dtype)
     with torch.no_grad():
         generated_ids = asrmodel.generate(inputs)
     transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
     # Sound intensity (RMS)
     rms = librosa.feature.rms(y=y)
     sound_intensity = np.mean(rms)
 
     # Pitch list
     pitches = get_pitch_list(y, sr)
 
     # Fundamental frequency (F0)
     f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
     fundamental_frequency = np.nanmean(f0)
 
     # Spectral energy (based on STFT)
     S = np.abs(librosa.stft(y))
     spectral_energy = np.mean(np.sum(S ** 2, axis=0))
 
     # Spectral centroid
     spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
     avg_spectral_centroid = np.mean(spectral_centroid)
 
     # Zero-crossing rate
     zcr = librosa.feature.zero_crossing_rate(y)
     zero_crossing_rate = np.mean(zcr)
 
     # Pause detection
     silence_threshold = -40
     silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
     pause_duration = 0
     for start, end in silent_intervals:
         pause_duration += (end - start) / sr
 
     total_duration = librosa.get_duration(y=y, sr=sr)
     pause_rate = (pause_duration / total_duration) * 60  # Convert to pauses per minute
 
     # Transcript processing
     words = nltk.word_tokenize(transcript)
     words = [word.lower() for word in words if word not in string.punctuation]
     num_words = len(words)
     unique_words = len(set(words))
     word_frequencies = Counter(words)
 
     # Duration in minutes
     duration_minutes = total_duration / 60
     avg_words_per_minute = num_words / duration_minutes
     avg_unique_words_per_minute = unique_words / duration_minutes
 
     # Filler word detection
     filler_words = [
         'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
         'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
         'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
         'totally', 'honestly', 'seriously', 'alright'
     ]
     filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
     filler_words_per_minute = filler_word_count / duration_minutes
 
     # POS tagging
     pos_tags = nltk.pos_tag(words)
     nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
     adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
     verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
 
     # Sentiment analysis
     sentiment = sentipipe(transcript)
     sentiment_mapping = {
         "LABEL_0": "Negative",
         "LABEL_1": "Neutral",
         "LABEL_2": "Positive"
     }
     sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']]
 
     # Generate Word Cloud and Save it as an Image
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
 
     # Save the Word Cloud to the provided path
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
     plt.savefig(wordcloud_path, format='png')
     plt.close()
 
     print("Nouns: ", nouns)
     print("Adjectives: ", adjectives)
     print("Verbs: ", verbs)
     print("Sentiment: ", sentiment)
 
     return {
         "transcript": transcript,
         "sentiment": sentiment,
         "sound_intensity": float(sound_intensity),
         "fundamental_frequency": float(fundamental_frequency),
         "spectral_energy": float(spectral_energy),
         "spectral_centroid": float(avg_spectral_centroid),
         "zero_crossing_rate": float(zero_crossing_rate),
         "avg_words_per_minute": float(avg_words_per_minute),
         "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
         "unique_word_count": int(unique_words),
         "filler_words_per_minute": float(filler_words_per_minute),
         "noun_count": len(nouns),
         "adjective_count": len(adjectives),
         "verb_count": len(verbs),
         "pause_rate": float(pause_rate)
     }, pitches
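
What the commit actually changes is the two extra nltk.download calls. Recent NLTK releases resolve word_tokenize through the punkt_tab resource and pos_tag through averaged_perceptron_tagger_eng, while older releases still look up the unsuffixed punkt and averaged_perceptron_tagger names, so fetching all four keeps the module working across versions. A minimal sketch of that setup (the helper name is invented here, not part of the repo):

import nltk

def ensure_nltk_resources():
    # nltk.download skips resources that are already up to date,
    # so fetching every name is cheap and version-agnostic.
    for resource in ("punkt", "punkt_tab",
                     "averaged_perceptron_tagger",
                     "averaged_perceptron_tagger_eng"):
        nltk.download(resource, quiet=True)

ensure_nltk_resources()
print(nltk.pos_tag(nltk.word_tokenize("Tokenizer and tagger data are in place.")))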
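
For context, a hedged usage sketch of the updated module. It assumes asrmodel/asrproc are a Whisper-style checkpoint and processor from transformers and sentipipe is a text-classification pipeline whose raw labels are LABEL_0/1/2 (the mapping in the code matches three-class sentiment models such as cardiffnlp/twitter-roberta-base-sentiment); the checkpoint names and file paths below are placeholders, not taken from this repo:

import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from functions.audio import extract_audio_features  # assumes the repo root is on sys.path

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

asrproc = WhisperProcessor.from_pretrained("openai/whisper-small")
asrmodel = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small", torch_dtype=dtype).to(device)
sentipipe = pipeline("text-classification",
                     model="cardiffnlp/twitter-roberta-base-sentiment")

features, pitches = extract_audio_features(
    "sample.wav", asrmodel, asrproc, sentipipe,
    duration=None,            # accepted but unused in this version of the function
    wordcloud_path="wordcloud.png")
print(features["transcript"], features["pause_rate"])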
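
One behavior carried over unchanged from the previous version is worth flagging: librosa.effects.split returns the non-silent intervals, and its top_db argument is a positive number of decibels below the reference peak. With top_db=-40 the non-silence test can never be satisfied, so the loop sums zero intervals and pause_rate always comes out 0; note also that (pause_duration / total_duration) * 60 measures seconds of pause per minute rather than a count of pauses. A possible correction, sketched on the assumption that pause time should be total duration minus voiced duration (the names and the 40 dB threshold are illustrative, not the committed behavior):

import librosa
import numpy as np

def pause_stats(y: np.ndarray, sr: int, top_db: float = 40.0):
    # librosa.effects.split yields (start, end) sample indices of NON-silent spans
    non_silent = librosa.effects.split(y, top_db=top_db)
    voiced = sum(int(end - start) for start, end in non_silent) / sr
    total = librosa.get_duration(y=y, sr=sr)
    pause = max(total - voiced, 0.0)
    rate = (pause / total) * 60 if total else 0.0  # seconds of pause per minute
    return pause, rate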
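
Finally, a quick sanity check for get_pitch_list (illustrative; the test tone and expected values are not from the repo). Because hop_length = sr / 30, the function emits roughly 30 pitch estimates per second, so a two-second 440 Hz tone should yield about 60 frames whose voiced estimates cluster near 440 Hz:

import numpy as np
from functions.audio import get_pitch_list

sr = 16000
t = np.linspace(0, 2.0, int(sr * 2.0), endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

pitches = get_pitch_list(tone, sr)
print(pitches.shape)                    # roughly (61,)
print(np.median(pitches[pitches > 0]))  # close to 440.0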