rafaaa2105 committed (verified)
Commit 706ca1e · 1 Parent(s): dd41807

Update app.py

Files changed (1):
  app.py: +338 -58
app.py CHANGED
@@ -6,6 +6,8 @@ import numpy as np
 from pydub import AudioSegment
 import tempfile
 import os
+from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
+import re
 
 # Model configuration - Using CrisperWhisper for TRUE verbatim transcription
 # CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
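
Note on the new imports: `moviepy.editor` is the moviepy 1.x entry point; moviepy 2.x removed that module and renamed much of the API used later in this diff (`fontsize` to `font_size`, `.set_position()` to `.with_position()`). A minimal guard, assuming the Space is meant to pin moviepy 1.x (this snippet is illustrative, not part of the commit):

# Illustrative guard (not in the commit): the code in this diff relies on the
# moviepy 1.x API (`moviepy.editor`, TextClip(fontsize=...), .set_position()),
# which moviepy 2.x removed or renamed, so fail fast with a clear message.
try:
    from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
except ImportError as exc:
    raise ImportError(
        "This app assumes moviepy 1.x; install it with `pip install 'moviepy<2'`"
    ) from exc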
@@ -109,6 +111,199 @@ def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
         except Exception as e2:
             raise Exception(f"Transcription failed: {str(e2)}")
 
+def create_srt_file(transcription_data, output_path):
+    """
+    Create an SRT subtitle file from transcription data.
+    """
+    with open(output_path, 'w', encoding='utf-8') as f:
+        counter = 1
+        for item in transcription_data:
+            start_time = item['start']
+            end_time = item['end']
+            text = item['text'].strip()
+
+            if text:  # Only add non-empty subtitles
+                # Convert seconds to SRT time format (HH:MM:SS,mmm)
+                start_srt = format_timestamp_srt(start_time)
+                end_srt = format_timestamp_srt(end_time)
+
+                f.write(f"{counter}\n")
+                f.write(f"{start_srt} --> {end_srt}\n")
+                f.write(f"{text}\n\n")
+                counter += 1
+
+def format_timestamp_srt(seconds):
+    """Convert seconds to SRT timestamp format."""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    millis = int((seconds % 1) * 1000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
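
To make the SRT conventions above concrete, here is a small sanity check of the two new helpers, using made-up cue data (expected output shown in comments):

# Quick sanity check of create_srt_file / format_timestamp_srt from the diff
# above; the cue data is invented for illustration.
import tempfile

segments = [
    {"start": 0.0, "end": 1.8, "text": " Um, so, uh,"},
    {"start": 1.8, "end": 4.25, "text": " I was- I was thinking"},
]

print(format_timestamp_srt(3725.5))  # -> 01:02:05,500

path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
create_srt_file(segments, path)
print(open(path, encoding="utf-8").read())
# 1
# 00:00:00,000 --> 00:00:01,800
# Um, so, uh,
#
# 2
# 00:00:01,800 --> 00:00:04,250
# I was- I was thinking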
+def extract_audio_from_video(video_path):
+    """Extract audio from video file."""
+    try:
+        video = VideoFileClip(video_path)
+        audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+        video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
+        video.close()
+        return audio_path
+    except Exception as e:
+        raise Exception(f"Failed to extract audio: {str(e)}")
+
+def burn_subtitles_to_video(video_path, transcription_data, progress=gr.Progress()):
+    """
+    Burn subtitles directly into the video.
+    """
+    try:
+        progress(0.1, desc="Loading video...")
+        video = VideoFileClip(video_path)
+
+        progress(0.3, desc="Creating subtitle clips...")
+        subtitle_clips = []
+
+        for item in transcription_data:
+            start_time = item['start']
+            end_time = item['end']
+            text = item['text'].strip()
+
+            if text and end_time > start_time:
+                # Create text clip with styling
+                txt_clip = (TextClip(
+                    text,
+                    fontsize=40,
+                    color='white',
+                    font='Arial-Bold',
+                    stroke_color='black',
+                    stroke_width=2,
+                    method='caption',
+                    size=(video.w * 0.9, None),
+                    align='center'
+                )
+                .set_position(('center', video.h * 0.85))
+                .set_start(start_time)
+                .set_duration(end_time - start_time))
+
+                subtitle_clips.append(txt_clip)
+
+        progress(0.6, desc="Compositing video with subtitles...")
+        # Composite video with subtitles
+        final_video = CompositeVideoClip([video] + subtitle_clips)
+
+        progress(0.8, desc="Rendering final video...")
+        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+        final_video.write_videofile(
+            output_path,
+            codec='libx264',
+            audio_codec='aac',
+            temp_audiofile=tempfile.NamedTemporaryFile(suffix=".m4a", delete=False).name,
+            remove_temp=True,
+            verbose=False,
+            logger=None
+        )
+
+        video.close()
+        final_video.close()
+
+        progress(1.0, desc="Done!")
+        return output_path
+
+    except Exception as e:
+        raise Exception(f"Failed to create subtitled video: {str(e)}")
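
One environmental assumption behind `burn_subtitles_to_video`: in moviepy 1.x, `TextClip(..., method='caption')` shells out to ImageMagick, which must be installed in the Space image. If moviepy cannot auto-detect the binary, it can be pointed at it explicitly; an illustrative sketch (the binary path is an assumption, not something this commit sets):

# Illustrative setup (not in the commit): moviepy 1.x renders TextClip via
# ImageMagick, so the runtime image must provide the `convert` binary.
from moviepy.config import change_settings

change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})  # assumed path

On some images ImageMagick's security policy (policy.xml) also blocks the temporary text files moviepy writes, which surfaces as "convert: not authorized" errors and has to be relaxed separately.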
+@spaces.GPU
+def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
+    """
+    Process video: extract audio, transcribe, and add subtitles.
+    """
+    if video_path is None:
+        return None, "Please provide a video file.", None
+
+    temp_files = []
+
+    try:
+        # Extract audio from video
+        progress(0, desc="Extracting audio from video...")
+        audio_path = extract_audio_from_video(video_path)
+        temp_files.append(audio_path)
+
+        # Check audio duration
+        duration = get_audio_duration(audio_path)
+        chunk_duration = 300  # 5 minutes per chunk
+
+        if duration and duration > chunk_duration:
+            progress(0.1, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
+            audio_chunks = slice_audio(audio_path, chunk_duration)
+            temp_files.extend(audio_chunks)
+        else:
+            audio_chunks = [audio_path]
+
+        # Transcribe each chunk with timestamps
+        all_transcriptions = []
+        total_chunks = len(audio_chunks)
+
+        for idx, chunk_path in enumerate(audio_chunks):
+            progress(0.1 + (idx / total_chunks) * 0.5, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
+
+            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps=True)
+
+            if "chunks" in result:
+                chunk_offset = idx * chunk_duration
+
+                for word_chunk in result["chunks"]:
+                    start = word_chunk["timestamp"][0]
+                    end = word_chunk["timestamp"][1]
+                    if start is not None and end is not None:
+                        all_transcriptions.append({
+                            "start": start + chunk_offset,
+                            "end": end + chunk_offset,
+                            "text": word_chunk["text"]
+                        })
+
+        if not all_transcriptions:
+            return None, "No transcription data available. Timestamps may have failed.", None
+
+        # Merge close timestamps for better subtitle readability
+        progress(0.6, desc="Optimizing subtitle timing...")
+        merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)
+
+        # Generate full text transcript
+        full_text = " ".join([t["text"] for t in merged_transcriptions])
+        transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
+        transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"
+
+        if subtitle_format == "burned":
+            # Burn subtitles into video
+            progress(0.7, desc="Creating video with burned-in subtitles...")
+            output_video = burn_subtitles_to_video(video_path, merged_transcriptions, progress)
+            return output_video, transcript_output, None
+
+        elif subtitle_format == "srt":
+            # Create SRT file
+            progress(0.7, desc="Creating SRT subtitle file...")
+            srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
+            create_srt_file(merged_transcriptions, srt_path)
+            return None, transcript_output, srt_path
+
+        else:  # both
+            progress(0.7, desc="Creating video with subtitles and SRT file...")
+            output_video = burn_subtitles_to_video(video_path, merged_transcriptions, progress)
+            srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
+            create_srt_file(merged_transcriptions, srt_path)
+            return output_video, transcript_output, srt_path
+
+    except Exception as e:
+        return None, f"Error processing video: {str(e)}", None
+
+    finally:
+        # Clean up temporary audio files (keep video and srt outputs)
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except:
+                pass
+
 def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
     Transcribe audio with VERY VERBATIM output using CrisperWhisper.
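
Note that `process_video` calls `merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)`, which does not appear anywhere in this diff (it presumably lives elsewhere in app.py). A minimal sketch of what such a helper plausibly does, assuming the `{"start", "end", "text"}` dicts built above: greedily grow a cue until it would exceed `max_duration` seconds or `max_words` words, then start a new one.

def merge_subtitle_segments(transcriptions, max_duration=5.0, max_words=15):
    """Greedily merge word-level chunks into subtitle-sized segments.

    Sketch only: the real helper is defined elsewhere in app.py; this
    assumes the {"start", "end", "text"} dicts built in process_video.
    """
    merged = []
    current = None
    for item in transcriptions:
        if current is None:
            current = {"start": item["start"], "end": item["end"], "text": item["text"]}
            continue
        too_long = item["end"] - current["start"] > max_duration
        too_wordy = len((current["text"] + " " + item["text"]).split()) > max_words
        if too_long or too_wordy:
            # Close the current cue and start a new one with this chunk
            merged.append(current)
            current = {"start": item["start"], "end": item["end"], "text": item["text"]}
        else:
            # Extend the current cue in time and text
            current["end"] = item["end"]
            current["text"] = current["text"].rstrip() + " " + item["text"].strip()
    if current is not None:
        merged.append(current)
    return merged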
@@ -263,75 +458,149 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
     - ✅ **Multilingual**: Supports 99+ languages
     - ✅ **Long Audio Support**: Automatic 5-minute chunking
+    - ✅ **Video Subtitles**: Automatic caption generation with burned-in or SRT output
 
     **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews,
-    conversational AI training, or any use case requiring exact speech capture.
+    conversational AI training, video subtitling, or any use case requiring exact speech capture.
     """
     )
 
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
-                label="Audio Input"
-            )
-
-            with gr.Row():
-                task_radio = gr.Radio(
-                    choices=["transcribe", "translate"],
-                    value="transcribe",
-                    label="Task",
-                    info="Transcribe verbatim or translate to English"
-                )
-
-                language_dropdown = gr.Dropdown(
-                    choices=list(LANGUAGES.keys()),
-                    value="Auto-detect",
-                    label="Language",
-                    info="Select language or use auto-detect"
-                )
-
-            timestamps_checkbox = gr.Checkbox(
-                label="Show word-level timestamps",
-                value=True,
-                info="Display precise timing for each word"
-            )
-
-            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
-
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="Verbatim Transcription (includes all um, uh, hesitations)",
-                lines=20,
-                show_copy_button=True,
-                placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
-            )
+    with gr.Tabs():
+        # Audio Tab
+        with gr.Tab("🎀 Audio Transcription"):
+            with gr.Row():
+                with gr.Column():
+                    audio_input = gr.Audio(
+                        sources=["upload", "microphone"],
+                        type="filepath",
+                        label="Audio Input"
+                    )
+
+                    with gr.Row():
+                        task_radio = gr.Radio(
+                            choices=["transcribe", "translate"],
+                            value="transcribe",
+                            label="Task",
+                            info="Transcribe verbatim or translate to English"
+                        )
+
+                        language_dropdown = gr.Dropdown(
+                            choices=list(LANGUAGES.keys()),
+                            value="Auto-detect",
+                            label="Language",
+                            info="Select language or use auto-detect"
+                        )
+
+                    timestamps_checkbox = gr.Checkbox(
+                        label="Show word-level timestamps",
+                        value=True,
+                        info="Display precise timing for each word"
+                    )
+
+                    transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
+
+                with gr.Column():
+                    output_text = gr.Textbox(
+                        label="Verbatim Transcription (includes all um, uh, hesitations)",
+                        lines=20,
+                        show_copy_button=True,
+                        placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
+                    )
+
+            gr.Markdown(
+                """
+                ### Why CrisperWhisper for Verbatim?
+
+                **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
+                - ❌ Removes "um", "uh", "ah"
+                - ❌ Omits false starts
+                - ❌ Skips repetitions
+                - ❌ Ignores stutters
+
+                **CrisperWhisper** is specifically trained for verbatim transcription:
+                - ✅ Keeps every filler word
+                - ✅ Preserves all disfluencies
+                - ✅ Captures exact speech patterns
+                - ✅ Accurate timestamps around hesitations
+                """
+            )
+
+        # Video Tab
+        with gr.Tab("🎬 Video Subtitles"):
+            with gr.Row():
+                with gr.Column():
+                    video_input = gr.Video(
+                        label="Video Input",
+                        sources=["upload"]
+                    )
+
+                    with gr.Row():
+                        video_task_radio = gr.Radio(
+                            choices=["transcribe", "translate"],
+                            value="transcribe",
+                            label="Task",
+                            info="Transcribe verbatim or translate to English"
+                        )
+
+                        video_language_dropdown = gr.Dropdown(
+                            choices=list(LANGUAGES.keys()),
+                            value="Auto-detect",
+                            label="Language",
+                            info="Select language or use auto-detect"
+                        )
+
+                    subtitle_format_radio = gr.Radio(
+                        choices=[
+                            ("Burned-in subtitles (permanent)", "burned"),
+                            ("SRT file only (external subtitles)", "srt"),
+                            ("Both burned-in video + SRT file", "both")
+                        ],
+                        value="burned",
+                        label="Subtitle Format",
+                        info="Choose output format"
+                    )
+
+                    process_video_btn = gr.Button("🎬 Generate Subtitles", variant="primary", size="lg")
+
+                with gr.Column():
+                    output_video = gr.Video(
+                        label="Video with Subtitles",
+                        interactive=False
+                    )
+
+                    video_transcript = gr.Textbox(
+                        label="Verbatim Transcript",
+                        lines=10,
+                        show_copy_button=True,
+                        placeholder="Transcript will appear here..."
+                    )
+
+                    output_srt = gr.File(
+                        label="Download SRT Subtitles",
+                        interactive=False
+                    )
+
+            gr.Markdown(
+                """
+                ### Video Subtitle Features
+
+                - **Burned-in Subtitles**: Permanently embedded in video (white text with black outline)
+                - **SRT File**: External subtitle file compatible with video players and editing software
+                - **Verbatim Captions**: All hesitations, fillers, and disfluencies included
+                - **Smart Timing**: Automatically merges short segments for readability
+                - **Long Video Support**: Handles videos of any length (automatic chunking)
+
+                ### Tips
+
+                - Use "Burned-in" for sharing videos with guaranteed subtitle visibility
+                - Use "SRT file" for flexible editing and translation
+                - Use "Both" to have both options available
+                - Subtitles are positioned at the bottom center of the video
+                """
+            )
 
     gr.Markdown(
         """
-    ### Why CrisperWhisper for Verbatim?
-
-    **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
-    - ❌ Removes "um", "uh", "ah"
-    - ❌ Omits false starts
-    - ❌ Skips repetitions
-    - ❌ Ignores stutters
-
-    **CrisperWhisper** is specifically trained for verbatim transcription:
-    - ✅ Keeps every filler word
-    - ✅ Preserves all disfluencies
-    - ✅ Captures exact speech patterns
-    - ✅ Accurate timestamps around hesitations
-
-    ### Example Comparison
-
-    **Input Audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
-
-    **Standard Whisper:** "So I was thinking that we could go to the store"
-
-    **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
-
     ### Use Cases
 
     - **Legal/Court Transcription**: Exact wording required by law
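
A detail worth noting in the new video tab: `subtitle_format_radio` uses Gradio's `(label, value)` choice tuples, so with recent Gradio versions the event handler receives the short value ("burned", "srt", or "both"), not the display label. A standalone mini-demo of that behavior (hypothetical, not part of the commit):

# Minimal standalone illustration (not in the commit) of Gradio's
# (label, value) choice tuples: the handler receives "burned"/"srt"/"both".
import gradio as gr

with gr.Blocks() as mini_demo:
    fmt = gr.Radio(
        choices=[("Burned-in subtitles (permanent)", "burned"),
                 ("SRT file only (external subtitles)", "srt"),
                 ("Both burned-in video + SRT file", "both")],
        value="burned",
        label="Subtitle Format",
    )
    echo = gr.Textbox(label="Selected value")
    fmt.change(fn=lambda v: v, inputs=fmt, outputs=echo)

if __name__ == "__main__":
    mini_demo.launch()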
@@ -339,7 +608,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - **Medical/Therapy Sessions**: Capturing patient speech patterns
     - **Interview Transcription**: Preserving speaker mannerisms
     - **Conversational AI Training**: Realistic dialogue data
-    - **Accessibility**: Providing complete transcripts for deaf/hard-of-hearing
+    - **Accessibility**: Complete transcripts and captions for deaf/hard-of-hearing
+    - **Video Content**: YouTube, social media, educational content with accurate captions
     - **Language Learning**: Analyzing natural spoken language
 
     ### Tips for Best Results
@@ -348,20 +618,30 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - The model captures quiet speech - ensure consistent audio levels
     - Manual language selection can improve accuracy
     - Long files are automatically processed in 5-minute chunks
-    - Timestamps help identify exact moments of hesitations
+    - For videos, ensure good audio quality for best subtitle accuracy
     """
     )
 
-    # Set up event handler
+    # Set up event handlers
     def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
         language_code = LANGUAGES[language_name]
         return transcribe_audio(audio, task, timestamps, language_code, progress)
 
+    def video_wrapper(video, task, language_name, subtitle_format, progress=gr.Progress()):
+        language_code = LANGUAGES[language_name]
+        return process_video(video, task, language_code, subtitle_format, progress)
+
     transcribe_btn.click(
         fn=transcribe_wrapper,
         inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
         outputs=output_text
     )
+
+    process_video_btn.click(
+        fn=video_wrapper,
+        inputs=[video_input, video_task_radio, video_language_dropdown, subtitle_format_radio],
+        outputs=[output_video, video_transcript, output_srt]
+    )
 
 # Launch the app
 if __name__ == "__main__":