import os
import time
import gradio as gr
import numpy as np
import librosa
import soundfile as sf
from twilio.rest import Client
from twilio.twiml.voice_response import VoiceResponse, Dial
import requests
from datetime import datetime
import tempfile
from nemo.collections.asr.models import ASRModel
import torch
import gradio.themes as gr_themes
import csv
from pathlib import Path
import shutil
import gc
import re
import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from docx import Document
from pydub import AudioSegment
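
# The Twilio helpers below expect these environment variables to be set:
#   TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN, TWILIO_PHONE_NUMBER
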
# ========== Twilio Functions ==========
def get_twilio_credentials():
    account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
    auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
    twilio_number = os.environ.get("TWILIO_PHONE_NUMBER")
    return account_sid, auth_token, twilio_number

def make_conference_call(phone_number, conference_code, wait_time=30):
    try:
        account_sid, auth_token, twilio_number = get_twilio_credentials()
        if not all([account_sid, auth_token, twilio_number]):
            return None, "Twilio credentials not found. Please set environment variables."
        client = Client(account_sid, auth_token)
        response = VoiceResponse()
        response.say("Joining conference call. This call will be recorded for diarization.")
        response.pause(length=2)
        if conference_code:
            for digit in conference_code:
                if digit.isdigit() or digit in ['*', '#']:
                    response.play(digits=digit)
                    response.pause(length=1)
        response.record(timeout=0, transcribe=False, recording_status_callback="/recording-status")
        dial = Dial()
        dial.conference('ConferenceRoom', record='record-from-start', recording_status_callback="/recording-status")
        response.append(dial)
        call = client.calls.create(
            to=phone_number,
            from_=twilio_number,
            twiml=str(response),
            record=True
        )
        return call.sid, f"Call initiated with SID: {call.sid}. Wait for the call to complete before retrieving the recording."
    except Exception as e:
        return None, f"Error initiating call: {str(e)}"

def check_call_status(call_sid):
    try:
        account_sid, auth_token, _ = get_twilio_credentials()
        if not all([account_sid, auth_token]):
            return None, "Twilio credentials not found. Please set environment variables."
        client = Client(account_sid, auth_token)
        call = client.calls(call_sid).fetch()
        if call.status in ['in-progress', 'queued', 'ringing']:
            return None, f"Call is still {call.status}. Please check again later."
        recordings = client.recordings.list(call_sid=call_sid)
        if not recordings:
            return None, "No recordings found for this call yet. Please check again later."
        recording = recordings[0]
        recording_url = f"https://api.twilio.com/2010-04-01/Accounts/{account_sid}/Recordings/{recording.sid}.wav"
        response = requests.get(recording_url, auth=(account_sid, auth_token))
        if response.status_code != 200:
            return None, f"Failed to download recording: {response.status_code}"
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        temp_file.write(response.content)
        temp_file.close()
        return temp_file.name, f"Recording downloaded successfully: {temp_file.name}"
    except Exception as e:
        return None, f"Error checking call status: {str(e)}"

# ========== Audio Processing ==========
def upsample_to_16k(input_wav):
    try:
        y, sr = librosa.load(input_wav, sr=None)
        if sr != 16000:
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        output_file = f"16k_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
        sf.write(output_file, y, 16000)
        return output_file, f"Audio upsampled to 16kHz: {output_file}"
    except Exception as e:
        return None, f"Error upsampling audio: {str(e)}"

# ========== ASR and Meeting Minutes Setup ==========
QWEN_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL)
qwen_model = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
if torch.cuda.is_available():
    qwen_model = qwen_model.cuda()
qwen_pipe = pipeline(
    "text-generation",
    model=qwen_model,
    tokenizer=qwen_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.3,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
model = ASRModel.from_pretrained(model_name=MODEL_NAME)
model.eval()
model_lock = threading.Lock()
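# The lock above serializes access to the shared ASR model so concurrent
# Gradio requests do not move or reconfigure it mid-transcription.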

def start_session(request: gr.Request):
    session_hash = request.session_hash
    session_dir = Path(f'/tmp/{session_hash}')
    session_dir.mkdir(parents=True, exist_ok=True)
    print(f"Session with hash {session_hash} started.")
    return session_dir.as_posix()

def end_session(request: gr.Request):
    session_hash = request.session_hash
    session_dir = Path(f'/tmp/{session_hash}')
    if session_dir.exists():
        shutil.rmtree(session_dir)
    print(f"Session with hash {session_hash} ended.")

def get_audio_segment(audio_path, start_second, end_second):
    if not audio_path or not Path(audio_path).exists():
        print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
        return None
    try:
        start_ms = int(start_second * 1000)
        end_ms = int(end_second * 1000)
        start_ms = max(0, start_ms)
        if end_ms <= start_ms:
            end_ms = start_ms + 100
        audio = AudioSegment.from_file(audio_path)
        clipped_audio = audio[start_ms:end_ms]
        samples = np.array(clipped_audio.get_array_of_samples())
        if clipped_audio.channels == 2:
            samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)
        frame_rate = clipped_audio.frame_rate
        if frame_rate <= 0:
            frame_rate = audio.frame_rate
        if samples.size == 0:
            return None
        return (frame_rate, samples)
    except Exception as e:
        print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
        return None
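# get_audio_segment returns a (sample_rate, numpy_samples) tuple, which is the
# in-memory format gr.Audio accepts for playback.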

def format_srt_time(seconds: float) -> str:
    import datetime
    sanitized_total_seconds = max(0.0, seconds)
    delta = datetime.timedelta(seconds=sanitized_total_seconds)
    total_int_seconds = int(delta.total_seconds())
    hours = total_int_seconds // 3600
    remainder_seconds_after_hours = total_int_seconds % 3600
    minutes = remainder_seconds_after_hours // 60
    seconds_part = remainder_seconds_after_hours % 60
    milliseconds = delta.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{seconds_part:02d},{milliseconds:03d}"

def generate_srt_content(segment_timestamps: list) -> str:
    srt_content = []
    for i, ts in enumerate(segment_timestamps):
        start_time = format_srt_time(ts['start'])
        end_time = format_srt_time(ts['end'])
        text = ts['segment']
        srt_content.append(str(i + 1))
        srt_content.append(f"{start_time} --> {end_time}")
        srt_content.append(text)
        srt_content.append("")
    return "\n".join(srt_content)

def get_transcripts_and_raw_times(audio_path, session_dir):
    import gradio as gr
    if not audio_path:
        gr.Error("No audio file path provided for transcription.", duration=None)
        return [], [], None, gr.DownloadButton(label="Download Transcript (CSV)", visible=False), gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
    vis_data = [["N/A", "N/A", "Processing failed"]]
    raw_times_data = [[0.0, 0.0]]
    processed_audio_path = None
    csv_file_path = None
    srt_file_path = None
    original_path_name = Path(audio_path).name
    audio_name = Path(audio_path).stem
    csv_button_update = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
    srt_button_update = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
    try:
        gr.Info(f"Upsampling and loading audio: {original_path_name}", duration=2)
        upsampled_path, upsample_msg = upsample_to_16k(audio_path)
        if not upsampled_path:
            gr.Error(upsample_msg, duration=None)
            return [["Error", "Error", upsample_msg]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
        audio = AudioSegment.from_file(upsampled_path)
        duration_sec = audio.duration_seconds
        info_path_name = Path(upsampled_path).name
        long_audio_settings_applied = False
        try:
            with model_lock:
                model.to(device)
                model.to(torch.float32)
                gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
                if duration_sec > 480:  # 8 minutes
                    try:
                        gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
                        print("Applying long audio settings: Local Attention and Chunking.")
                        model.change_attention_model("rel_pos_local_attn", [256, 256])
                        model.change_subsampling_conv_chunking_factor(1)  # 1 = auto select
                        long_audio_settings_applied = True
                    except Exception as setting_e:
                        gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
                        print(f"Warning: Failed to apply long audio settings: {setting_e}")
                model.to(torch.bfloat16)
                output = model.transcribe([upsampled_path], timestamps=True)
            if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
                gr.Error("Transcription failed or produced unexpected output format.", duration=None)
                return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
            segment_timestamps = output[0].timestamp['segment']
            csv_headers = ["Start (s)", "End (s)", "Segment"]
            vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
            raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
            try:
                csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
                # newline="" is the mode the csv module expects for writer file handles.
                with open(csv_file_path, 'w', newline="", encoding="utf-8") as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(csv_headers)
                    writer.writerows(vis_data)
                print(f"CSV transcript saved to temporary file: {csv_file_path}")
                csv_button_update = gr.DownloadButton(value=csv_file_path, visible=True, label="Download Transcript (CSV)")
            except Exception as csv_e:
                gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
                print(f"Error writing CSV: {csv_e}")
            if segment_timestamps:
                try:
                    srt_content = generate_srt_content(segment_timestamps)
                    srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
                    with open(srt_file_path, 'w', encoding='utf-8') as f:
                        f.write(srt_content)
                    print(f"SRT transcript saved to temporary file: {srt_file_path}")
                    srt_button_update = gr.DownloadButton(value=srt_file_path, visible=True, label="Download Transcript (SRT)")
                except Exception as srt_e:
                    gr.Warning(f"Failed to create transcript SRT file: {srt_e}", duration=5)
                    print(f"Error writing SRT: {srt_e}")
            gr.Info("Transcription complete.", duration=2)
            return vis_data, raw_times_data, upsampled_path, csv_button_update, srt_button_update
        except torch.cuda.OutOfMemoryError as e:
            error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
            print(f"CUDA OutOfMemoryError: {e}")
            gr.Error(error_msg, duration=None)
            return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
        except FileNotFoundError:
            error_msg = f"Audio file for transcription not found: {Path(upsampled_path).name}."
            print(f"Error: Transcribe audio file not found at path: {upsampled_path}")
            gr.Error(error_msg, duration=None)
            return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
        except Exception as e:
            error_msg = f"Transcription failed: {e}"
            print(f"Error during transcription processing: {e}")
            gr.Error(error_msg, duration=None)
            vis_data = [["Error", "Error", error_msg]]
            raw_times_data = [[0.0, 0.0]]
            return vis_data, raw_times_data, audio_path, csv_button_update, srt_button_update
        finally:
            with model_lock:
                try:
                    if long_audio_settings_applied:
                        try:
                            print("Reverting long audio settings.")
                            model.change_attention_model("rel_pos")
                            model.change_subsampling_conv_chunking_factor(-1)
                            long_audio_settings_applied = False
                        except Exception as revert_e:
                            print(f"Warning: Failed to revert long audio settings: {revert_e}")
                            gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
                    # model is a module-level global, so check globals() (locals() would never match).
                    if 'model' in globals() and hasattr(model, 'cpu'):
                        if device == 'cuda':
                            model.cpu()
                    gc.collect()
                    if device == 'cuda':
                        torch.cuda.empty_cache()
                except Exception as cleanup_e:
                    print(f"Error during model cleanup: {cleanup_e}")
                    gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
    finally:
        if processed_audio_path and os.path.exists(processed_audio_path):
            try:
                os.remove(processed_audio_path)
                print(f"Temporary audio file {processed_audio_path} removed.")
            except Exception as e:
                print(f"Error removing temporary audio file {processed_audio_path}: {e}")

def strip_markdown(text):
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
    text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)
    text = re.sub(r'`(.+?)`', r'\1', text)
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^-\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'---', '', text)
    return text.strip()
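# Example: strip_markdown("## **Action Items**") returns "Action Items".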

def generate_meeting_minutes(session_dir):
    try:
        csv_files = sorted(Path(session_dir).glob("transcription_*.csv"), key=os.path.getmtime, reverse=True)
        if not csv_files:
            return gr.update(value="No transcript CSV found. Please transcribe first.", visible=True), gr.update(visible=False)
        csv_path = csv_files[0]
        with open(csv_path, "r", encoding="utf-8") as f:
            transcript = f.read()
        prompt = (
            "You are an expert meeting minutes assistant. "
            "Given the following transcript CSV (with start and end times and segments), "
            "summarize the meeting into structured minutes. "
            "Provide the minutes only and nothing else, no intro, no outro, no comments, just the minutes. "
            "Include: Attendees (if mentioned), Topics, Discussion Points, Decisions, Action Items, and Next Steps. "
            "Be concise and use bullet points where possible.\n\n"
            "Transcript CSV:\n"
            f"{transcript}\n"
            "Structured Meeting Minutes:"
        )
        print("Sending prompt to Qwen2.5-1.5B-Instruct...")
        out = qwen_pipe(prompt)
        minutes = out[0]["generated_text"][len(prompt):].strip()
        clean_minutes = strip_markdown(minutes)
        docx_file = Path(session_dir) / "meeting_minutes.docx"
        doc = Document()
        for line in clean_minutes.splitlines():
            doc.add_paragraph(line)
        doc.save(docx_file)
        print("Minutes generated and saved to:", docx_file)
        # Return updates so the minutes textbox and the .docx download button become visible.
        return gr.update(value=minutes, visible=True), gr.update(value=str(docx_file), visible=True)
    except Exception as e:
        print("Error in generate_meeting_minutes:", e)
        return gr.update(value=f"Error generating minutes: {e}", visible=True), gr.update(visible=False)

def hangup_call(call_sid):
    try:
        account_sid, auth_token, _ = get_twilio_credentials()
        if not all([account_sid, auth_token]):
            return "Twilio credentials not found. Please set environment variables."
        client = Client(account_sid, auth_token)
        call = client.calls(call_sid).update(status="completed")
        return f"Call {call_sid} has been hung up."
    except Exception as e:
        return f"Error hanging up call: {str(e)}"

# ========== Gradio UI ==========
nvidia_theme = gr_themes.Default(
    primary_hue=gr_themes.Color(
        c50="#E6F1D9", c100="#CEE3B3", c200="#B5D58C", c300="#9CC766", c400="#84B940",
        c500="#76B900", c600="#68A600", c700="#5A9200", c800="#4C7E00", c900="#3E6A00", c950="#2F5600"
    ),
    neutral_hue="gray",
    font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
).set()

with gr.Blocks(theme=nvidia_theme) as demo:
    current_audio_path_state = gr.State(None)
    raw_timestamps_list_state = gr.State([])
    session_dir = gr.State()
    demo.load(start_session, outputs=[session_dir])

    # ====== Twilio Tab ======
    with gr.Tab("Twilio Call & Recording"):
        gr.Markdown("### 1. Make Twilio Call and Record")
        phone_number = gr.Textbox(label="Phone Number (E.164)", placeholder="+15551234567")
        conference_code = gr.Textbox(label="Conference Code (optional)", placeholder="123456#")
        call_btn = gr.Button("Make Call")
        call_sid = gr.Textbox(label="Call SID", interactive=False)
        call_status = gr.Textbox(label="Call Status", interactive=False)
        call_btn.click(
            make_conference_call,
            inputs=[phone_number, conference_code],
            outputs=[call_sid, call_status]
        )
        hangup_btn = gr.Button("Hangup Call")
        hangup_status = gr.Textbox(label="Hangup Status", interactive=False)
        hangup_btn.click(
            hangup_call,
            inputs=[call_sid],
            outputs=[hangup_status]
        )
        gr.Markdown("### 2. Retrieve Recording")
        sid_input = gr.Textbox(label="Call SID")
        get_recording_btn = gr.Button("Get Recording")
        recording_path = gr.Textbox(label="Recording File Path", interactive=False)
        recording_status = gr.Textbox(label="Recording Status", interactive=False)
        get_recording_btn.click(
            check_call_status,
            inputs=[sid_input],
            outputs=[recording_path, recording_status]
        )
        gr.Markdown("### 3. Transcribe and Analyze Processed Audio")
        transcribe_btn = gr.Button("Transcribe Processed Recording")
        vis_timestamps_df = gr.DataFrame(
            headers=["Start (s)", "End (s)", "Segment"],
            datatype=["number", "number", "str"],
            wrap=True,
            label="Transcription Segments"
        )
        download_btn_csv = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
        download_btn_srt = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
        transcribe_btn.click(
            get_transcripts_and_raw_times,
            inputs=[recording_path, session_dir],
            outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
        )

    # ====== Your Existing UI ======
    gr.Markdown("---")
    gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")
    with gr.Row():
        gen_minutes_btn = gr.Button("Generate Meeting Minutes", variant="primary")
    minutes_output = gr.Textbox(label="Structured Meeting Minutes", visible=False, lines=15)
    minutes_download = gr.DownloadButton(label="Download Meeting Minutes (.docx)", visible=False)
    with gr.Row():
        download_btn_csv = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
        download_btn_srt = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)
    vis_timestamps_df = gr.DataFrame(
        headers=["Start (s)", "End (s)", "Segment"],
        datatype=["number", "number", "str"],
        wrap=True,
        label="Transcription Segments"
    )
    selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
    mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
    file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
    file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
    mic_transcribe_btn.click(
        get_transcripts_and_raw_times,
        inputs=[mic_input, session_dir],
        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
        api_name="transcribe_mic"
    )
    file_transcribe_btn.click(
        get_transcripts_and_raw_times,
        inputs=[file_input, session_dir],
        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
        api_name="transcribe_file"
    )
    gen_minutes_btn.click(
        generate_meeting_minutes,
        inputs=[session_dir],
        outputs=[minutes_output, minutes_download],
    )

    def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
        if not isinstance(raw_ts_list, list):
            return gr.Audio(value=None, label="Selected Segment")
        if not current_audio_path:
            return gr.Audio(value=None, label="Selected Segment")
        selected_index = evt.index[0]
        if selected_index < 0 or selected_index >= len(raw_ts_list):
            return gr.Audio(value=None, label="Selected Segment")
        if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
            return gr.Audio(value=None, label="Selected Segment")
        start_time_s, end_time_s = raw_ts_list[selected_index]
        segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
        if segment_data:
            return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
        else:
            return gr.Audio(value=None, label="Selected Segment")

    vis_timestamps_df.select(
        play_segment,
        inputs=[raw_timestamps_list_state, current_audio_path_state],
        outputs=[selected_segment_player],
    )
    demo.unload(end_session)
if __name__ == "__main__": | |
print("Launching Gradio Demo...") | |
demo.queue() | |
demo.launch() |