AyeshaAmeen
committed on
Upload 3 files
- app.py +234 -0
- download_spacy_model.py +11 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,234 @@
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1a3BQS9Nu4qUbxFVu7gP9XtVhZA0c-ldN
"""

import assemblyai as aai
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import spacy
import gradio as gr
from pydub import AudioSegment
import os
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import torch
import numpy as np
import requests
from tempfile import NamedTemporaryFile
from yt_dlp import YoutubeDL
from urllib.parse import urlparse
from sklearn.cluster import AgglomerativeClustering

# Step 1: Set AssemblyAI API Key (replace the placeholder with your own key)
aai.settings.api_key = "your_assemblyai_api_key"
transcriber = aai.Transcriber()

def transcribe_audio(audio_file_path):
    transcript = transcriber.transcribe(audio_file_path)
    transcription_text = transcript.text if hasattr(transcript, 'text') else ""
    transcription_words = transcript.words if hasattr(transcript, 'words') else []
    return transcription_text, transcription_words

# Step 2: Language Translation (English and Urdu) with chunking
def translate_text(text, target_language):
    translator = GoogleTranslator(source='auto', target=target_language)
    chunk_size = 4999  # Stay below the 5,000-character limit per translation request
    translated_chunks = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        translated_chunk = translator.translate(chunk)
        translated_chunks.append(translated_chunk)
    translated_text = " ".join(translated_chunks)
    return translated_text

# Step 3: Summarization with T5 Model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model_t5 = T5ForConditionalGeneration.from_pretrained('t5-base')

def summarize_text(text, source_language, target_language):
    if source_language == 'urdu':
        text = translate_text(text, 'en')  # Translate to English for summarization
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model_t5.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    if source_language == 'urdu':
        summary = translate_text(summary, target_language)  # Translate back to Urdu
    return summary

# Step 4: Key Points Extraction with spaCy
# Download the model on first run if it is not installed (see download_spacy_model.py)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def extract_key_points(text):
    doc = nlp(text)
    tasks = []
    for ent in doc.ents:
        if ent.label_ in ["TASK", "DATE", "PERSON", "ORG"]:
            tasks.append(ent.text)
    return tasks

# Step 5: Speaker Identification using silero VAD and resemblyzer
def identify_speakers(audio_file_path):
    wav_fpath = Path(audio_file_path)
    wav = preprocess_wav(wav_fpath)

    # Load the silero VAD model and utilities
    vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True)
    (get_speech_timestamps, _, _, _, _) = utils
    sampling_rate = 16000  # preprocess_wav resamples audio to 16 kHz

    # Get speech timestamps using silero VAD
    speech_timestamps = get_speech_timestamps(wav, vad_model, sampling_rate=sampling_rate)

    encoder = VoiceEncoder()
    speaker_segments = []

    for ts in speech_timestamps:
        start, end = ts['start'], ts['end']
        segment = wav[start:end]
        speaker_embeds = encoder.embed_utterance(segment)
        speaker_segments.append((start / sampling_rate, end / sampling_rate, speaker_embeds))

    # Use AgglomerativeClustering to cluster the speaker embeddings
    embeddings = np.vstack([seg[2] for seg in speaker_segments])
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.75).fit(embeddings)
    speaker_labels = clustering.labels_

    # Merge adjacent segments identified as the same speaker
    merged_segments = []
    for i, (start_time, end_time, _) in enumerate(speaker_segments):
        label = speaker_labels[i]
        if merged_segments and merged_segments[-1][0] == label:
            merged_segments[-1] = (label, merged_segments[-1][1], end_time)
        else:
            merged_segments.append((label, start_time, end_time))

    return merged_segments, len(np.unique(speaker_labels))

# Step 6: Sentiment Analysis using transformers
model_sentiment = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
tokenizer_sentiment = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def analyze_sentiment(text):
    max_length = 512  # Maximum sequence length accepted by the model
    inputs = tokenizer_sentiment(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    outputs = model_sentiment(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment = torch.argmax(probs, dim=1).item()
    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_map[sentiment]

# Ensure the output directory exists
output_dir = "/content"
os.makedirs(output_dir, exist_ok=True)

# Step 7: Download audio from YouTube using yt-dlp
def download_audio_from_youtube(url):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': '/tmp/%(id)s.%(ext)s',
        'quiet': True
    }
    with YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        audio_file = ydl.prepare_filename(info_dict)
        base, ext = os.path.splitext(audio_file)
        audio_file = base + '.wav'
    return audio_file

# Step 8: Gradio Interface Setup
def process_meeting(file, url, language):
    audio_path = None
    if file is not None:
        # gr.File may yield a tempfile-like object or a plain path string depending on the Gradio version
        file_path = file.name if hasattr(file, 'name') else file
        audio_path = os.path.join(output_dir, "uploaded_audio.wav")

        # Convert video to audio if necessary
        if file_path.endswith(('.mp4', '.avi', '.mov', '.mkv')):
            video = AudioSegment.from_file(file_path)
            video.export(audio_path, format="wav")
        else:
            audio_path = file_path
    elif url:  # ignore an empty URL textbox
        parsed_url = urlparse(url)
        if "youtube.com" in parsed_url.netloc or "youtu.be" in parsed_url.netloc:
            audio_path = download_audio_from_youtube(url)
        else:
            response = requests.get(url)
            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
                temp_file.write(response.content)
                audio_path = temp_file.name

    if audio_path is None:
        # Return one value per Gradio output component
        return "Please provide either a file or a URL.", "", "", "", "", ""

    transcription, words = transcribe_audio(audio_path)

    # Step 2: Translation based on user-selected language
    if language == "urdu":
        translated_text = translate_text(transcription, 'ur')
    else:  # default to English
        translated_text = transcription

    # Step 3: Summarization and Key Points Extraction
    summary = summarize_text(translated_text, language, 'ur')
    key_points = extract_key_points(translated_text)

    # Step 4: Speaker Identification
    speakers, num_speakers = identify_speakers(audio_path)

    # Map speakers to their spoken text (AssemblyAI word timestamps are in milliseconds)
    speaker_transcripts = {i: [] for i in range(num_speakers)}

    for label, start_time, end_time in speakers:
        segment = [word.text for word in words if start_time <= word.start / 1000 <= end_time]
        text_segment = " ".join(segment)
        speaker_transcripts[label].append(text_segment)

    speaker_details = ""
    for label, segments in speaker_transcripts.items():
        speaker_name = f"Speaker {label + 1}"
        speaker_details += f"{speaker_name}:\n"
        speaker_details += "\n".join(segments) + "\n\n"

    # Step 5: Sentiment Analysis
    sentiment = analyze_sentiment(transcription)

    speaker_details = f"Total number of speakers: {num_speakers}\n" + speaker_details

    return transcription, translated_text, key_points, summary, speaker_details, sentiment

# Step 9: Launch Gradio Interface with Scrollbars
iface = gr.Interface(
    fn=process_meeting,
    inputs=[
        gr.File(label="Upload Meeting Recording"),
        gr.Textbox(label="Enter Meeting URL"),
        gr.Radio(["english", "urdu"], label="Select Summary Language")
    ],
    outputs=[
        gr.Textbox(label="Transcription", lines=20),
        gr.Textbox(label="Translated Text", lines=20),
        gr.Textbox(label="Key Points", lines=20),
        gr.Textbox(label="Summary", lines=20),
        gr.Textbox(label="Speakers", lines=20),
        gr.Textbox(label="Sentiment", lines=1)
    ],
    title="Smart AI Meeting Assistant",
    description="""
    <div style='text-align: center;'>by Ayesha Ameen & Sana Sadiq</div>
    <br>Upload your meeting recording or enter a publicly accessible URL and choose the summary language (English or Urdu).
    """,
)

if __name__ == "__main__":
    iface.launch(share=True, debug=True)
download_spacy_model.py
ADDED
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
"""download_spacy_model.py.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1a3BQS9Nu4qUbxFVu7gP9XtVhZA0c-ldN
"""

import spacy
spacy.cli.download("en_core_web_sm")
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio
transformers
assemblyai
deep-translator
spacy
pydub
torch
resemblyzer
yt-dlp
scikit-learn