import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
from pydub import AudioSegment
import tempfile
import os
import subprocess
import re

# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
MODEL_NAME = "nyrahealth/CrisperWhisper"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_NAME} for verbatim transcription...")

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Create pipeline optimized for verbatim output
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=8,  # Reduced batch size for stability
    torch_dtype=torch_dtype,
    device=device,
)

print("Model loaded successfully!")


def get_audio_duration(audio_path):
    """Get duration of audio file in seconds."""
    try:
        audio = AudioSegment.from_file(audio_path)
        return len(audio) / 1000.0
    except Exception:
        return None


def slice_audio(audio_path, chunk_duration=300):
    """
    Slice audio into chunks of specified duration (in seconds).
    Default is 5 minutes (300 seconds) per chunk.
    """
    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    chunk_duration_ms = chunk_duration * 1000

    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            chunk.export(temp_file.name, format="wav")
            chunks.append(temp_file.name)

    return chunks


@spaces.GPU
def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
    """
    Transcribe a single audio chunk with CrisperWhisper.
    This model is specifically trained for verbatim transcription.
    """
    try:
        generate_kwargs = {
            "task": task,
        }
        if language:
            generate_kwargs["language"] = language

        # Only add timestamps if requested and handle the potential error
        if return_timestamps:
            try:
                generate_kwargs["return_timestamps"] = "word"
                result = pipe(audio_input, generate_kwargs=generate_kwargs)
                return result
            except RuntimeError as e:
                if "size of tensor" in str(e):
                    # Fallback to chunk-level timestamps if word-level fails
                    print("Word-level timestamps failed, trying chunk-level...")
                    generate_kwargs["return_timestamps"] = True
                    result = pipe(audio_input, generate_kwargs=generate_kwargs)
                    return result
                raise
        else:
            # No timestamps requested
            result = pipe(audio_input, generate_kwargs=generate_kwargs)
            return result
    except Exception as e:
        # Last resort fallback: try with minimal parameters
        print(f"Error with generate_kwargs: {e}")
        try:
            result = pipe(audio_input)
            return result
        except Exception as e2:
            raise Exception(f"Transcription failed: {str(e2)}")
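
# For reference, a successful call with word-level timestamps is expected to
# return the usual Hugging Face ASR pipeline structure, roughly as follows
# (illustrative values, not real output):
#
#   {
#       "text": " So, um, I was thinking...",
#       "chunks": [
#           {"text": " So,", "timestamp": (0.0, 0.42)},
#           {"text": " um,", "timestamp": (0.55, 0.81)},
#           ...
#       ],
#   }
#
# The callers below rely only on "text", and on "chunks" when timestamps were
# requested; if the chunk-level fallback kicks in, each chunk covers a phrase
# rather than a single word.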
""" with open(output_path, 'w', encoding='utf-8') as f: counter = 1 for item in transcription_data: start_time = item['start'] end_time = item['end'] text = item['text'].strip() if text: # Only add non-empty subtitles # Convert seconds to SRT time format (HH:MM:SS,mmm) start_srt = format_timestamp_srt(start_time) end_srt = format_timestamp_srt(end_time) f.write(f"{counter}\n") f.write(f"{start_srt} --> {end_srt}\n") f.write(f"{text}\n\n") counter += 1 def format_timestamp_srt(seconds): """Convert seconds to SRT timestamp format.""" hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) millis = int((seconds % 1) * 1000) return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" def extract_audio_from_video(video_path): """Extract audio from video file using ffmpeg.""" try: audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name # Use ffmpeg directly for more reliable extraction cmd = [ 'ffmpeg', '-i', video_path, '-vn', # No video '-acodec', 'pcm_s16le', '-ar', '16000', # 16kHz sample rate for Whisper '-ac', '1', # Mono '-y', # Overwrite output audio_path ] subprocess.run(cmd, check=True, capture_output=True) return audio_path except Exception as e: raise Exception(f"Failed to extract audio: {str(e)}") def burn_subtitles_to_video(video_path, srt_path, progress=gr.Progress()): """ Burn subtitles into video using ffmpeg. """ try: progress(0.7, desc="Creating video with subtitles...") output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name # Escape the SRT path for ffmpeg filter srt_escaped = srt_path.replace('\\', '\\\\').replace(':', '\\:') # Use ffmpeg to burn subtitles cmd = [ 'ffmpeg', '-i', video_path, '-vf', f"subtitles={srt_escaped}:force_style='FontName=Arial,FontSize=24,PrimaryColour=&HFFFFFF,OutlineColour=&H000000,Outline=2,Alignment=2,MarginV=50'", '-c:a', 'copy', '-y', output_path ] subprocess.run(cmd, check=True, capture_output=True) progress(1.0, desc="Done!") return output_path except Exception as e: raise Exception(f"Failed to create subtitled video: {str(e)}") def merge_subtitle_segments(segments, max_duration=5.0, max_words=15): """ Merge small subtitle segments into larger, more readable ones. """ if not segments: return [] merged = [] # Start with the first segment current_segment = segments[0].copy() for i in range(1, len(segments)): next_segment = segments[i] # Combine text and calculate new word count new_text = current_segment['text'] + " " + next_segment['text'].lstrip() new_word_count = len(new_text.split()) # Calculate new duration new_duration = next_segment['end'] - current_segment['start'] # If merging doesn't exceed limits, merge if new_duration <= max_duration and new_word_count <= max_words: current_segment['end'] = next_segment['end'] current_segment['text'] = new_text else: # Otherwise, save the current segment and start a new one merged.append(current_segment) current_segment = next_segment.copy() # Don't forget the last segment merged.append(current_segment) return merged @spaces.GPU def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()): """ Process video: extract audio, transcribe, and add subtitles. 
""" if video_path is None: return None, "Please provide a video file.", None temp_files = [] srt_path = None # Initialize to prevent NameError in finally block try: # Extract audio from video progress(0, desc="Extracting audio from video...") audio_path = extract_audio_from_video(video_path) temp_files.append(audio_path) # Check audio duration duration = get_audio_duration(audio_path) chunk_duration = 300 # 5 minutes per chunk if duration and duration > chunk_duration: progress(0.1, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...") audio_chunks = slice_audio(audio_path, chunk_duration) temp_files.extend(audio_chunks) else: audio_chunks = [audio_path] # Transcribe each chunk with timestamps all_transcriptions = [] total_chunks = len(audio_chunks) for idx, chunk_path in enumerate(audio_chunks): progress(0.1 + (idx / total_chunks) * 0.5, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...") result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps=True) if "chunks" in result: chunk_offset = idx * chunk_duration for word_chunk in result["chunks"]: start = word_chunk["timestamp"][0] end = word_chunk["timestamp"][1] if start is not None and end is not None: all_transcriptions.append({ "start": start + chunk_offset, "end": end + chunk_offset, "text": word_chunk["text"] }) if not all_transcriptions: return None, "No transcription data available. Timestamps may have failed.", None # Merge close timestamps for better subtitle readability progress(0.6, desc="Optimizing subtitle timing...") merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15) # Generate full text transcript full_text = "".join([t["text"] for t in merged_transcriptions]).strip() transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n" transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*" # Create SRT file (needed for all formats) srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name create_srt_file(merged_transcriptions, srt_path) temp_files.append(srt_path) if subtitle_format == "burned": # Burn subtitles into video output_video = burn_subtitles_to_video(video_path, srt_path, progress) return output_video, transcript_output, None elif subtitle_format == "srt": # Return SRT file only progress(0.7, desc="Creating SRT subtitle file...") return None, transcript_output, srt_path else: # both progress(0.7, desc="Creating video with subtitles and SRT file...") output_video = burn_subtitles_to_video(video_path, srt_path, progress) # Create a copy of SRT for download srt_download = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name import shutil shutil.copy(srt_path, srt_download) return output_video, transcript_output, srt_download except Exception as e: return None, f"Error processing video: {str(e)}", None finally: # Clean up temporary audio files (keep video and srt outputs) for temp_file in temp_files: try: # srt_path could be None if an error occurs early if srt_path and os.path.exists(temp_file) and temp_file != srt_path: os.unlink(temp_file) elif os.path.exists(temp_file): os.unlink(temp_file) except: pass def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, export_srt=False, progress=gr.Progress()): """ Transcribe audio with VERY VERBATIM output using CrisperWhisper. This model transcribes every spoken word exactly as it is, including fillers, stutters, and false starts. 
""" if audio is None: return "Please provide an audio file or recording.", None # If SRT export is requested, we must generate timestamps. if export_srt: return_timestamps = True temp_files = [] try: # Handle different audio input formats if isinstance(audio, str): audio_path = audio elif isinstance(audio, tuple): sr, audio_data = audio with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: import scipy.io.wavfile scipy.io.wavfile.write(temp_file.name, sr, audio_data) audio_path = temp_file.name temp_files.append(audio_path) else: return "Unsupported audio format.", None # Check audio duration and slice if necessary duration = get_audio_duration(audio_path) chunk_duration = 300 # 5 minutes per chunk if duration and duration > chunk_duration: progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...") audio_chunks = slice_audio(audio_path, chunk_duration) temp_files.extend(audio_chunks) else: audio_chunks = [audio_path] # Process each chunk all_word_chunks = [] full_text_parts = [] total_chunks = len(audio_chunks) for idx, chunk_path in enumerate(audio_chunks): progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...") result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps) full_text_parts.append(result["text"]) if return_timestamps and "chunks" in result: chunk_offset = idx * chunk_duration for word_chunk in result["chunks"]: start = word_chunk["timestamp"][0] end = word_chunk["timestamp"][1] if start is not None and end is not None: all_word_chunks.append({ "start": start + chunk_offset, "end": end + chunk_offset, "text": word_chunk["text"] }) # Combine all transcriptions full_text = "".join(full_text_parts).strip() output = f"**Verbatim Transcription:**\n{full_text}\n" srt_file_path = None if return_timestamps and all_word_chunks: # If timestamps are requested but not for SRT, display them in the textbox if not export_srt: output += "\n**Word-level Timestamps:**\n" for ts in all_word_chunks: output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s]{ts['text']}\n" # Generate SRT file if requested if export_srt: if all_word_chunks: merged_transcriptions = merge_subtitle_segments(all_word_chunks, max_duration=5.0, max_words=15) srt_file = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name create_srt_file(merged_transcriptions, srt_file) srt_file_path = srt_file else: output += "\n**Warning:** Could not generate SRT file as word-level timestamps were not available." 
        if duration:
            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"

        return output, srt_file_path

    except Exception as e:
        return f"Error during transcription: {str(e)}", None
    finally:
        # Clean up temporary files
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except Exception:
                pass


# Language options for manual selection
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text

        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.

        ## 🔥 TRUE Verbatim Transcription

        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:
        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
        - ✅ **Hesitations**: pauses, breath sounds, stutters
        - ✅ **False Starts**: "I was- I went to the store"
        - ✅ **Repetitions**: "I I I think that..."
        - ✅ **Disfluencies**: Every non-fluent speech element
        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
        - ✅ **Multilingual**: Supports 99+ languages
        - ✅ **Long Audio Support**: Automatic 5-minute chunking
        - ✅ **Video Subtitles**: Automatic caption generation with burned-in or SRT output

        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews,
        conversational AI training, video subtitling, or any use case requiring exact speech capture.
        """
    )

    with gr.Tabs():
        # Audio Tab
        with gr.Tab("🎤 Audio Transcription"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Audio Input"
                    )

                    with gr.Row():
                        task_radio = gr.Radio(
                            choices=["transcribe", "translate"],
                            value="transcribe",
                            label="Task",
                            info="Transcribe verbatim or translate to English"
                        )
                        language_dropdown = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language",
                            info="Select language or use auto-detect"
                        )

                    timestamps_checkbox = gr.Checkbox(
                        label="Show word-level timestamps in text output",
                        value=False,
                        info="Display precise timing for each word"
                    )

                    export_srt_checkbox = gr.Checkbox(
                        label="Export as SRT file",
                        value=False,
                        info="Generate downloadable SRT subtitle file"
                    )

                    transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")

                with gr.Column():
                    output_text = gr.Textbox(
                        label="Verbatim Transcription (includes all um, uh, hesitations)",
                        lines=18,
                        show_copy_button=True,
                        placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
                    )
                    output_audio_srt = gr.File(
                        label="Download SRT Subtitles",
                        interactive=False,
                        visible=False
                    )

            gr.Markdown(
                """
                ### Why CrisperWhisper for Verbatim?

                **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
                - ❌ Removes "um", "uh", "ah"
                - ❌ Omits false starts
                - ❌ Skips repetitions
                - ❌ Ignores stutters

                **CrisperWhisper** is specifically trained for verbatim transcription:
                - ✅ Keeps every filler word
                - ✅ Preserves all disfluencies
                - ✅ Captures exact speech patterns
                - ✅ Accurate timestamps around hesitations
                - ✅ Export as SRT file for use in video editors, YouTube, etc.
                """
            )

        # Video Tab
        with gr.Tab("🎬 Video Subtitles"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(
                        label="Video Input",
                        sources=["upload"]
                    )

                    with gr.Row():
                        video_task_radio = gr.Radio(
                            choices=["transcribe", "translate"],
                            value="transcribe",
                            label="Task",
                            info="Transcribe verbatim or translate to English"
                        )
                        video_language_dropdown = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language",
                            info="Select language or use auto-detect"
                        )

                    subtitle_format_radio = gr.Radio(
                        choices=[
                            ("Burned-in subtitles (permanent)", "burned"),
                            ("SRT file only (external subtitles)", "srt"),
                            ("Both burned-in video + SRT file", "both")
                        ],
                        value="burned",
                        label="Subtitle Format",
                        info="Choose output format"
                    )

                    process_video_btn = gr.Button("🎬 Generate Subtitles", variant="primary", size="lg")

                with gr.Column():
                    output_video = gr.Video(
                        label="Video with Subtitles",
                        interactive=False
                    )
                    video_transcript = gr.Textbox(
                        label="Verbatim Transcript",
                        lines=10,
                        show_copy_button=True,
                        placeholder="Transcript will appear here..."
                    )
                    output_srt = gr.File(
                        label="Download SRT Subtitles",
                        interactive=False
                    )

            gr.Markdown(
                """
                ### Video Subtitle Features
                - **Burned-in Subtitles**: Permanently embedded in video (white text with black outline)
                - **SRT File**: Standard subtitle file with timestamps (HH:MM:SS,mmm format)
                  - Compatible with YouTube, VLC, Premiere Pro, Final Cut, DaVinci Resolve
                  - Easy to edit timings and text in any text editor
                  - Can be translated and re-synced
                - **Verbatim Captions**: All hesitations, fillers, and disfluencies included
                - **Smart Timing**: Automatically merges short segments for readability
                - **Long Video Support**: Handles videos of any length (automatic chunking)

                ### SRT File Format Example
                ```
                1
                00:00:01,500 --> 00:00:03,200
                Um, so I was thinking that

                2
                00:00:03,200 --> 00:00:05,800
                we could, uh, go to the store
                ```

                ### Tips
                - Use "Burned-in" for sharing videos with guaranteed subtitle visibility
                - Use "SRT file" for flexible editing, translation, and platform uploads
                - Use "Both" to have maximum flexibility
                - SRT files work with all major video platforms and editors
                - Subtitles are positioned at the bottom center of the video
                """
            )

    gr.Markdown(
        """
        ### Use Cases
        - **Legal/Court Transcription**: Exact wording required by law
        - **Linguistic Research**: Study of natural speech patterns and disfluencies
        - **Medical/Therapy Sessions**: Capturing patient speech patterns
        - **Interview Transcription**: Preserving speaker mannerisms
        - **Conversational AI Training**: Realistic dialogue data
        - **Accessibility**: Complete transcripts and captions for deaf/hard-of-hearing
        - **Video Content**: YouTube, social media, educational content with accurate captions
        - **Language Learning**: Analyzing natural spoken language

        ### Tips for Best Results
        - Clear audio with minimal background noise works best
        - The model captures quiet speech - ensure consistent audio levels
        - Manual language selection can improve accuracy
        - Long files are automatically processed in 5-minute chunks
        - For videos, ensure good audio quality for best subtitle accuracy
        """
    )
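
    # Event wiring notes: the wrapper functions below adapt raw UI values
    # before calling the processing functions. The human-readable language
    # name from the dropdown is mapped to a Whisper language code via
    # LANGUAGES, and the audio wrapper also toggles visibility of the SRT
    # download component depending on whether an SRT file was produced.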

    # Set up event handlers
    def transcribe_wrapper(audio, task, timestamps, export_srt, language_name, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        transcript, srt_file = transcribe_audio(audio, task, timestamps, language_code, export_srt, progress)
        # Control visibility of SRT download
        srt_visible = gr.update(visible=srt_file is not None, value=srt_file)
        return transcript, srt_visible

    def video_wrapper(video, task, language_name, subtitle_format, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        return process_video(video, task, language_code, subtitle_format, progress)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, export_srt_checkbox, language_dropdown],
        outputs=[output_text, output_audio_srt]
    )

    process_video_btn.click(
        fn=video_wrapper,
        inputs=[video_input, video_task_radio, video_language_dropdown, subtitle_format_radio],
        outputs=[output_video, video_transcript, output_srt]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
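
# Illustrative, non-executed example of driving the pipeline without the UI
# (the file name below is a placeholder):
#
#   text, srt = transcribe_audio("interview.wav", task="transcribe", export_srt=True)
#   print(text)  # verbatim transcript, fillers and hesitations included
#   print(srt)   # path to the generated .srt file, or None if timestamps failed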