Rahulk2197 committed on
Commit e61da93 · verified · 1 Parent(s): dd3c6f1

Update functions/audio.py

Files changed (1)
  1. functions/audio.py +147 -145
functions/audio.py CHANGED
@@ -1,145 +1,147 @@
 import librosa
 import numpy as np
 import torch
 from collections import Counter
 import nltk
 import string
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 
 nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('averaged_perceptron_tagger_eng')
 nltk.download('averaged_perceptron_tagger')
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
 def get_pitch_list(y,sr):
     hop_length = int(sr / 30)  # hop_length determines how far apart the frames are
 
     # Extract the pitch (F0) using librosa's piptrack method
     pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
 
     # Get the pitch frequencies from the pitch array
     pitch_frequencies = []
 
     for t in range(pitches.shape[1]):
         index = magnitudes[:, t].argmax()  # Get the index of the maximum magnitude
         pitch = pitches[index, t]
 
         pitch_frequencies.append(pitch)
 
     # Convert pitch_frequencies to a NumPy array
     pitch_frequencies = np.array(pitch_frequencies)
     print("shape : ", pitch_frequencies.shape)
     return pitch_frequencies
 
 
 def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
     y, sr = librosa.load(audio_path, sr=16000)
     inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
     inputs = inputs.to(device, dtype=torch_dtype)
     with torch.no_grad():
         generated_ids = asrmodel.generate(inputs)
     transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
     # Sound intensity (RMS)
     rms = librosa.feature.rms(y=y)
     sound_intensity = np.mean(rms)
 
     # Pitch list
     pitches = get_pitch_list(y, sr)
 
     # Fundamental frequency (F0)
     f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
     fundamental_frequency = np.nanmean(f0)
 
     # Spectral energy (based on STFT)
     S = np.abs(librosa.stft(y))
     spectral_energy = np.mean(np.sum(S ** 2, axis=0))
 
     # Spectral centroid
     spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
     avg_spectral_centroid = np.mean(spectral_centroid)
 
     # Zero-crossing rate
     zcr = librosa.feature.zero_crossing_rate(y)
     zero_crossing_rate = np.mean(zcr)
 
     # Pause detection
     silence_threshold = -40
     silent_intervals = librosa.effects.split(y, top_db=silence_threshold)
     pause_duration = 0
     for start, end in silent_intervals:
         pause_duration += (end - start) / sr
 
     total_duration = librosa.get_duration(y=y, sr=sr)
     pause_rate = (pause_duration / total_duration) * 60  # Convert to pauses per minute
 
     # Transcript processing
     words = nltk.word_tokenize(transcript)
     words = [word.lower() for word in words if word not in string.punctuation]
     num_words = len(words)
     unique_words = len(set(words))
     word_frequencies = Counter(words)
 
     # Duration in minutes
     duration_minutes = total_duration / 60
     avg_words_per_minute = num_words / duration_minutes
     avg_unique_words_per_minute = unique_words / duration_minutes
 
     # Filler word detection
     filler_words = [
         'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
         'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
         'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
         'totally', 'honestly', 'seriously', 'alright'
     ]
     filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
     filler_words_per_minute = filler_word_count / duration_minutes
 
     # POS tagging
     pos_tags = nltk.pos_tag(words)
     nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
     adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
     verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
 
     # Sentiment analysis
     sentiment = sentipipe(transcript)
     sentiment_mapping = {
         "LABEL_0": "Negative",
         "LABEL_1": "Neutral",
         "LABEL_2": "Positive"
     }
     sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']]
 
     # Generate Word Cloud and Save it as an Image
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
 
     # Save the Word Cloud to the provided path
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
     plt.savefig(wordcloud_path, format='png')
     plt.close()
 
     print("Nouns: ", nouns)
     print("Adjectives: ", adjectives)
     print("Verbs: ", verbs)
     print("Sentiment: ", sentiment)
 
     return {
         "transcript": transcript,
         "sentiment": sentiment,
         "sound_intensity": float(sound_intensity),
         "fundamental_frequency": float(fundamental_frequency),
         "spectral_energy": float(spectral_energy),
         "spectral_centroid": float(avg_spectral_centroid),
         "zero_crossing_rate": float(zero_crossing_rate),
         "avg_words_per_minute": float(avg_words_per_minute),
         "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
         "unique_word_count": int(unique_words),
         "filler_words_per_minute": float(filler_words_per_minute),
         "noun_count": len(nouns),
         "adjective_count": len(adjectives),
         "verb_count": len(verbs),
         "pause_rate": float(pause_rate)
     }, pitches
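
What the commit actually changes is the two extra nltk.download calls. Recent NLTK releases resolve word_tokenize through the punkt_tab resource and pos_tag through averaged_perceptron_tagger_eng, while older releases still look up the unsuffixed punkt and averaged_perceptron_tagger names, so fetching all four keeps the module working across versions. A minimal sketch of that setup (the helper name is invented here, not part of the repo):

import nltk

def ensure_nltk_resources():
    # nltk.download skips resources that are already up to date,
    # so fetching every name is cheap and version-agnostic.
    for resource in ("punkt", "punkt_tab",
                     "averaged_perceptron_tagger",
                     "averaged_perceptron_tagger_eng"):
        nltk.download(resource, quiet=True)

ensure_nltk_resources()
print(nltk.pos_tag(nltk.word_tokenize("Tokenizer and tagger data are in place.")))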
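
For context, a hedged usage sketch of the updated module. It assumes asrmodel/asrproc are a Whisper-style checkpoint and processor from transformers and sentipipe is a text-classification pipeline whose raw labels are LABEL_0/1/2 (the mapping in the code matches three-class sentiment models such as cardiffnlp/twitter-roberta-base-sentiment); the checkpoint names and file paths below are placeholders, not taken from this repo:

import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from functions.audio import extract_audio_features  # assumes the repo root is on sys.path

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

asrproc = WhisperProcessor.from_pretrained("openai/whisper-small")
asrmodel = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small", torch_dtype=dtype).to(device)
sentipipe = pipeline("text-classification",
                     model="cardiffnlp/twitter-roberta-base-sentiment")

features, pitches = extract_audio_features(
    "sample.wav", asrmodel, asrproc, sentipipe,
    duration=None,            # accepted but unused in this version of the function
    wordcloud_path="wordcloud.png")
print(features["transcript"], features["pause_rate"])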
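
One behavior carried over unchanged from the previous version is worth flagging: librosa.effects.split returns the non-silent intervals, and its top_db argument is a positive number of decibels below the reference peak. With top_db=-40 the non-silence test can never be satisfied, so the loop sums zero intervals and pause_rate always comes out 0; note also that (pause_duration / total_duration) * 60 measures seconds of pause per minute rather than a count of pauses. A possible correction, sketched on the assumption that pause time should be total duration minus voiced duration (the names and the 40 dB threshold are illustrative, not the committed behavior):

import librosa
import numpy as np

def pause_stats(y: np.ndarray, sr: int, top_db: float = 40.0):
    # librosa.effects.split yields (start, end) sample indices of NON-silent spans
    non_silent = librosa.effects.split(y, top_db=top_db)
    voiced = sum(int(end - start) for start, end in non_silent) / sr
    total = librosa.get_duration(y=y, sr=sr)
    pause = max(total - voiced, 0.0)
    rate = (pause / total) * 60 if total else 0.0  # seconds of pause per minute
    return pause, rate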
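
Finally, a quick sanity check for get_pitch_list (illustrative; the test tone and expected values are not from the repo). Because hop_length = sr / 30, the function emits roughly 30 pitch estimates per second, so a two-second 440 Hz tone should yield about 60 frames whose voiced estimates cluster near 440 Hz:

import numpy as np
from functions.audio import get_pitch_list

sr = 16000
t = np.linspace(0, 2.0, int(sr * 2.0), endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

pitches = get_pitch_list(tone, sr)
print(pitches.shape)                    # roughly (61,)
print(np.median(pitches[pitches > 0]))  # close to 440.0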