import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
from pydub import AudioSegment
import tempfile
import os
import subprocess
import re
# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
MODEL_NAME = "nyrahealth/CrisperWhisper"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_NAME} for verbatim transcription...")

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Create pipeline optimized for verbatim output
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=8,  # Reduced batch size for stability
    torch_dtype=torch_dtype,
    device=device,
)

print("Model loaded successfully!")
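# Illustrative call (hypothetical local file; shown as a comment so nothing runs at import):
#   result = pipe("sample.wav", generate_kwargs={"task": "transcribe"})
#   print(result["text"])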
def get_audio_duration(audio_path):
    """Get duration of audio file in seconds."""
    try:
        audio = AudioSegment.from_file(audio_path)
        return len(audio) / 1000.0  # pydub reports length in milliseconds
    except Exception:
        return None
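# Illustrative: get_audio_duration("clip.wav") -> 12.34 (seconds), or None if
# pydub/ffmpeg cannot read the file. ("clip.wav" is a hypothetical path.)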
def slice_audio(audio_path, chunk_duration=300):
    """
    Slice audio into chunks of specified duration (in seconds).
    Default is 5 minutes (300 seconds) per chunk.
    """
    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    chunk_duration_ms = chunk_duration * 1000

    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            chunk.export(temp_file.name, format="wav")
            chunks.append(temp_file.name)
    return chunks
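# Illustrative behavior (hypothetical file): a 12-minute recording yields three
# temp WAVs covering 0-300s, 300-600s, and 600-720s.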
def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
    """
    Transcribe a single audio chunk with CrisperWhisper.
    This model is specifically trained for verbatim transcription.
    """
    try:
        generate_kwargs = {"task": task}
        if language:
            generate_kwargs["language"] = language

        # Only request timestamps if asked, and handle the potential error.
        # Note: return_timestamps is a pipeline kwarg, not a generate kwarg.
        if return_timestamps:
            try:
                return pipe(audio_input, return_timestamps="word", generate_kwargs=generate_kwargs)
            except RuntimeError as e:
                if "size of tensor" in str(e):
                    # Fallback to chunk-level timestamps if word-level fails
                    print("Word-level timestamps failed, trying chunk-level...")
                    return pipe(audio_input, return_timestamps=True, generate_kwargs=generate_kwargs)
                raise
        # No timestamps requested
        return pipe(audio_input, generate_kwargs=generate_kwargs)
    except Exception as e:
        # Last-resort fallback: try with minimal parameters
        print(f"Error with generate_kwargs: {e}")
        try:
            return pipe(audio_input)
        except Exception as e2:
            raise Exception(f"Transcription failed: {e2}") from e2
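# Expected result shape with word timestamps (illustrative values only):
#   {"text": " Um, so I was thinking...",
#    "chunks": [{"text": " Um,", "timestamp": (0.0, 0.42)}, ...]}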
def create_srt_file(transcription_data, output_path):
    """
    Create an SRT subtitle file from transcription data.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        counter = 1
        for item in transcription_data:
            start_time = item['start']
            end_time = item['end']
            text = item['text'].strip()
            if text:  # Only add non-empty subtitles
                # Convert seconds to SRT time format (HH:MM:SS,mmm)
                start_srt = format_timestamp_srt(start_time)
                end_srt = format_timestamp_srt(end_time)
                f.write(f"{counter}\n")
                f.write(f"{start_srt} --> {end_srt}\n")
                f.write(f"{text}\n\n")
                counter += 1
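# Illustrative input/output (hypothetical values):
#   create_srt_file([{"start": 1.5, "end": 3.2, "text": "Um, so I was thinking that"}], "out.srt")
# writes:
#   1
#   00:00:01,500 --> 00:00:03,200
#   Um, so I was thinking that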
def format_timestamp_srt(seconds):
    """Convert seconds to SRT timestamp format."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
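# Sanity check (illustrative): 3725.5 seconds is 1h 2m 5.5s.
assert format_timestamp_srt(3725.5) == "01:02:05,500"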
def extract_audio_from_video(video_path):
    """Extract audio from video file using ffmpeg."""
    try:
        audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        # Use ffmpeg directly for more reliable extraction
        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-vn',                   # No video
            '-acodec', 'pcm_s16le',
            '-ar', '16000',          # 16kHz sample rate for Whisper
            '-ac', '1',              # Mono
            '-y',                    # Overwrite output
            audio_path
        ]
        subprocess.run(cmd, check=True, capture_output=True)
        return audio_path
    except Exception as e:
        raise Exception(f"Failed to extract audio: {str(e)}")
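# Equivalent command line (hypothetical paths):
#   ffmpeg -i input.mp4 -vn -acodec pcm_s16le -ar 16000 -ac 1 -y audio.wav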
def burn_subtitles_to_video(video_path, srt_path, progress=gr.Progress()):
    """
    Burn subtitles into video using ffmpeg.
    """
    try:
        progress(0.7, desc="Creating video with subtitles...")
        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name

        # Escape the SRT path for the ffmpeg subtitles filter
        srt_escaped = srt_path.replace('\\', '\\\\').replace(':', '\\:')

        # Use ffmpeg to burn subtitles
        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-vf', f"subtitles={srt_escaped}:force_style='FontName=Arial,FontSize=24,PrimaryColour=&HFFFFFF,OutlineColour=&H000000,Outline=2,Alignment=2,MarginV=50'",
            '-c:a', 'copy',
            '-y',
            output_path
        ]
        subprocess.run(cmd, check=True, capture_output=True)

        progress(1.0, desc="Done!")
        return output_path
    except Exception as e:
        raise Exception(f"Failed to create subtitled video: {str(e)}")
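# Note: the subtitles filter requires an ffmpeg build with libass.
# Equivalent command line (hypothetical paths):
#   ffmpeg -i in.mp4 -vf "subtitles=subs.srt:force_style='FontSize=24'" -c:a copy -y out.mp4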
def merge_subtitle_segments(segments, max_duration=5.0, max_words=15):
    """
    Merge small subtitle segments into larger, more readable ones.
    """
    if not segments:
        return []

    merged = []
    # Start with the first segment
    current_segment = segments[0].copy()

    for i in range(1, len(segments)):
        next_segment = segments[i]

        # Combine text and calculate the new word count
        new_text = current_segment['text'] + " " + next_segment['text'].lstrip()
        new_word_count = len(new_text.split())

        # Calculate the new duration
        new_duration = next_segment['end'] - current_segment['start']

        # If merging doesn't exceed limits, merge
        if new_duration <= max_duration and new_word_count <= max_words:
            current_segment['end'] = next_segment['end']
            current_segment['text'] = new_text
        else:
            # Otherwise, save the current segment and start a new one
            merged.append(current_segment)
            current_segment = next_segment.copy()

    # Don't forget the last segment
    merged.append(current_segment)
    return merged
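# Illustrative merge (hypothetical word-level input):
#   [{"start": 0.0, "end": 0.4, "text": "Um,"}, {"start": 0.4, "end": 0.9, "text": " so"}]
#   -> [{"start": 0.0, "end": 0.9, "text": "Um, so"}]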
@spaces.GPU  # Request ZeroGPU hardware while this runs; the `spaces` import is otherwise unused
def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
    """
    Process video: extract audio, transcribe, and add subtitles.
    """
    if video_path is None:
        return None, "Please provide a video file.", None

    temp_files = []
    srt_path = None  # Initialize to prevent NameError in the finally block
    try:
        # Extract audio from video
        progress(0, desc="Extracting audio from video...")
        audio_path = extract_audio_from_video(video_path)
        temp_files.append(audio_path)

        # Check audio duration
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk

        if duration and duration > chunk_duration:
            progress(0.1, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]

        # Transcribe each chunk with timestamps
        all_transcriptions = []
        total_chunks = len(audio_chunks)
        for idx, chunk_path in enumerate(audio_chunks):
            progress(0.1 + (idx / total_chunks) * 0.5, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps=True)
            if "chunks" in result:
                chunk_offset = idx * chunk_duration
                for word_chunk in result["chunks"]:
                    start = word_chunk["timestamp"][0]
                    end = word_chunk["timestamp"][1]
                    if start is not None and end is not None:
                        all_transcriptions.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"]
                        })

        if not all_transcriptions:
            return None, "No transcription data available. Timestamps may have failed.", None

        # Merge close timestamps for better subtitle readability
        progress(0.6, desc="Optimizing subtitle timing...")
        merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)

        # Generate the full text transcript
        full_text = "".join([t["text"] for t in merged_transcriptions]).strip()
        transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
        transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"

        # Create the SRT file (needed for all output formats). It is deliberately
        # NOT added to temp_files: it must survive so Gradio can serve it.
        srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
        create_srt_file(merged_transcriptions, srt_path)

        if subtitle_format == "burned":
            # Burn subtitles into the video
            output_video = burn_subtitles_to_video(video_path, srt_path, progress)
            return output_video, transcript_output, None
        elif subtitle_format == "srt":
            # Return the SRT file only
            progress(0.7, desc="Creating SRT subtitle file...")
            return None, transcript_output, srt_path
        else:  # both
            progress(0.7, desc="Creating video with subtitles and SRT file...")
            output_video = burn_subtitles_to_video(video_path, srt_path, progress)
            return output_video, transcript_output, srt_path
    except Exception as e:
        return None, f"Error processing video: {str(e)}", None
    finally:
        # Clean up temporary audio files; the SRT output is kept for download
        for temp_file in temp_files:
            try:
                if temp_file != srt_path and os.path.exists(temp_file):
                    os.unlink(temp_file)
            except OSError:
                pass
@spaces.GPU  # Request ZeroGPU hardware while this runs
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, export_srt=False, progress=gr.Progress()):
    """
    Transcribe audio with VERY VERBATIM output using CrisperWhisper.
    This model transcribes every spoken word exactly as it is, including fillers, stutters, and false starts.
    """
    if audio is None:
        return "Please provide an audio file or recording.", None

    # SRT export requires timestamps, so force them on
    if export_srt:
        return_timestamps = True

    temp_files = []
    try:
        # Handle different audio input formats
        if isinstance(audio, str):
            audio_path = audio
        elif isinstance(audio, tuple):
            sr, audio_data = audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                import scipy.io.wavfile
                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
                audio_path = temp_file.name
            temp_files.append(audio_path)
        else:
            return "Unsupported audio format.", None

        # Check audio duration and slice if necessary
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk

        if duration and duration > chunk_duration:
            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]

        # Process each chunk
        all_word_chunks = []
        full_text_parts = []
        total_chunks = len(audio_chunks)

        for idx, chunk_path in enumerate(audio_chunks):
            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
            full_text_parts.append(result["text"])

            if return_timestamps and "chunks" in result:
                chunk_offset = idx * chunk_duration
                for word_chunk in result["chunks"]:
                    start = word_chunk["timestamp"][0]
                    end = word_chunk["timestamp"][1]
                    if start is not None and end is not None:
                        all_word_chunks.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"]
                        })

        # Combine all transcriptions
        full_text = "".join(full_text_parts).strip()
        output = f"**Verbatim Transcription:**\n{full_text}\n"

        srt_file_path = None
        if return_timestamps and all_word_chunks and not export_srt:
            # Timestamps were requested for display rather than SRT export
            output += "\n**Word-level Timestamps:**\n"
            for ts in all_word_chunks:
                output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s]{ts['text']}\n"

        # Generate the SRT file if requested
        if export_srt:
            if all_word_chunks:
                merged_transcriptions = merge_subtitle_segments(all_word_chunks, max_duration=5.0, max_words=15)
                srt_file_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
                create_srt_file(merged_transcriptions, srt_file_path)
            else:
                output += "\n**Warning:** Could not generate SRT file as word-level timestamps were not available."

        if duration:
            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"

        return output, srt_file_path
    except Exception as e:
        return f"Error during transcription: {str(e)}", None
    finally:
        # Clean up temporary chunk files
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except OSError:
                pass
# Language options for manual selection
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}
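# Illustrative lookup: LANGUAGES["French"] -> "fr". "Auto-detect" maps to None,
# which leaves language identification to the model.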
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text
        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.

        ## 🔥 TRUE Verbatim Transcription
        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:
        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
        - ✅ **Hesitations**: pauses, breath sounds, stutters
        - ✅ **False Starts**: "I was- I went to the store"
        - ✅ **Repetitions**: "I I I think that..."
        - ✅ **Disfluencies**: Every non-fluent speech element
        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
        - ✅ **Multilingual**: Supports 99+ languages
        - ✅ **Long Audio Support**: Automatic 5-minute chunking
        - ✅ **Video Subtitles**: Automatic caption generation with burned-in or SRT output

        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews,
        conversational AI training, video subtitling, or any use case requiring exact speech capture.
        """
    )
    with gr.Tabs():
        # Audio Tab
        with gr.Tab("🎤 Audio Transcription"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Audio Input"
                    )
                    with gr.Row():
                        task_radio = gr.Radio(
                            choices=["transcribe", "translate"],
                            value="transcribe",
                            label="Task",
                            info="Transcribe verbatim or translate to English"
                        )
                        language_dropdown = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language",
                            info="Select language or use auto-detect"
                        )
                    timestamps_checkbox = gr.Checkbox(
                        label="Show word-level timestamps in text output",
                        value=False,
                        info="Display precise timing for each word"
                    )
                    export_srt_checkbox = gr.Checkbox(
                        label="Export as SRT file",
                        value=False,
                        info="Generate downloadable SRT subtitle file"
                    )
                    transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
                with gr.Column():
                    output_text = gr.Textbox(
                        label="Verbatim Transcription (includes all um, uh, hesitations)",
                        lines=18,
                        show_copy_button=True,
                        placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
                    )
                    output_audio_srt = gr.File(
                        label="Download SRT Subtitles",
                        interactive=False,
                        visible=False
                    )
            gr.Markdown(
                """
                ### Why CrisperWhisper for Verbatim?
                **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
                - ❌ Removes "um", "uh", "ah"
                - ❌ Omits false starts
                - ❌ Skips repetitions
                - ❌ Ignores stutters

                **CrisperWhisper** is specifically trained for verbatim transcription:
                - ✅ Keeps every filler word
                - ✅ Preserves all disfluencies
                - ✅ Captures exact speech patterns
                - ✅ Accurate timestamps around hesitations
                - ✅ Export as SRT file for use in video editors, YouTube, etc.
                """
            )
        # Video Tab
        with gr.Tab("🎬 Video Subtitles"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(
                        label="Video Input",
                        sources=["upload"]
                    )
                    with gr.Row():
                        video_task_radio = gr.Radio(
                            choices=["transcribe", "translate"],
                            value="transcribe",
                            label="Task",
                            info="Transcribe verbatim or translate to English"
                        )
                        video_language_dropdown = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language",
                            info="Select language or use auto-detect"
                        )
                    subtitle_format_radio = gr.Radio(
                        choices=[
                            ("Burned-in subtitles (permanent)", "burned"),
                            ("SRT file only (external subtitles)", "srt"),
                            ("Both burned-in video + SRT file", "both")
                        ],
                        value="burned",
                        label="Subtitle Format",
                        info="Choose output format"
                    )
                    process_video_btn = gr.Button("🎬 Generate Subtitles", variant="primary", size="lg")
                with gr.Column():
                    output_video = gr.Video(
                        label="Video with Subtitles",
                        interactive=False
                    )
                    video_transcript = gr.Textbox(
                        label="Verbatim Transcript",
                        lines=10,
                        show_copy_button=True,
                        placeholder="Transcript will appear here..."
                    )
                    output_srt = gr.File(
                        label="Download SRT Subtitles",
                        interactive=False
                    )
            gr.Markdown(
                """
                ### Video Subtitle Features
                - **Burned-in Subtitles**: Permanently embedded in video (white text with black outline)
                - **SRT File**: Standard subtitle file with timestamps (HH:MM:SS,mmm format)
                  - Compatible with YouTube, VLC, Premiere Pro, Final Cut, DaVinci Resolve
                  - Easy to edit timings and text in any text editor
                  - Can be translated and re-synced
                - **Verbatim Captions**: All hesitations, fillers, and disfluencies included
                - **Smart Timing**: Automatically merges short segments for readability
                - **Long Video Support**: Handles videos of any length (automatic chunking)

                ### SRT File Format Example
                ```
                1
                00:00:01,500 --> 00:00:03,200
                Um, so I was thinking that

                2
                00:00:03,200 --> 00:00:05,800
                we could, uh, go to the store
                ```

                ### Tips
                - Use "Burned-in" for sharing videos with guaranteed subtitle visibility
                - Use "SRT file" for flexible editing, translation, and platform uploads
                - Use "Both" for maximum flexibility
                - SRT files work with all major video platforms and editors
                - Subtitles are positioned at the bottom center of the video
                """
            )
    gr.Markdown(
        """
        ### Use Cases
        - **Legal/Court Transcription**: Exact wording required by law
        - **Linguistic Research**: Study of natural speech patterns and disfluencies
        - **Medical/Therapy Sessions**: Capturing patient speech patterns
        - **Interview Transcription**: Preserving speaker mannerisms
        - **Conversational AI Training**: Realistic dialogue data
        - **Accessibility**: Complete transcripts and captions for deaf and hard-of-hearing users
        - **Video Content**: YouTube, social media, educational content with accurate captions
        - **Language Learning**: Analyzing natural spoken language

        ### Tips for Best Results
        - Clear audio with minimal background noise works best
        - The model captures quiet speech - ensure consistent audio levels
        - Manual language selection can improve accuracy
        - Long files are automatically processed in 5-minute chunks
        - For videos, good audio quality yields the most accurate subtitles
        """
    )
    # Set up event handlers
    def transcribe_wrapper(audio, task, timestamps, export_srt, language_name, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        transcript, srt_file = transcribe_audio(audio, task, timestamps, language_code, export_srt, progress)
        # Control visibility of the SRT download
        srt_visible = gr.update(visible=srt_file is not None, value=srt_file)
        return transcript, srt_visible

    def video_wrapper(video, task, language_name, subtitle_format, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        return process_video(video, task, language_code, subtitle_format, progress)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, export_srt_checkbox, language_dropdown],
        outputs=[output_text, output_audio_srt]
    )

    process_video_btn.click(
        fn=video_wrapper,
        inputs=[video_input, video_task_radio, video_language_dropdown, subtitle_format_radio],
        outputs=[output_video, video_transcript, output_srt]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()