Update app.py
app.py CHANGED
@@ -190,6 +190,41 @@ def burn_subtitles_to_video(video_path, srt_path, progress=gr.Progress()):
     except Exception as e:
         raise Exception(f"Failed to create subtitled video: {str(e)}")
 
+def merge_subtitle_segments(segments, max_duration=5.0, max_words=15):
+    """
+    Merge small subtitle segments into larger, more readable ones.
+    """
+    if not segments:
+        return []
+
+    merged = []
+    # Start with the first segment
+    current_segment = segments[0].copy()
+
+    for i in range(1, len(segments)):
+        next_segment = segments[i]
+
+        # Combine text and calculate new word count
+        new_text = current_segment['text'] + " " + next_segment['text'].lstrip()
+        new_word_count = len(new_text.split())
+
+        # Calculate new duration
+        new_duration = next_segment['end'] - current_segment['start']
+
+        # If merging doesn't exceed limits, merge
+        if new_duration <= max_duration and new_word_count <= max_words:
+            current_segment['end'] = next_segment['end']
+            current_segment['text'] = new_text
+        else:
+            # Otherwise, save the current segment and start a new one
+            merged.append(current_segment)
+            current_segment = next_segment.copy()
+
+    # Don't forget the last segment
+    merged.append(current_segment)
+
+    return merged
+
 @spaces.GPU
 def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
     """
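Note: a quick sanity check of the new merging helper (a minimal sketch; the segment dicts use the same start/end/text shape that transcribe_audio builds below, with the leading spaces CrisperWhisper word tokens carry):

# Minimal sketch exercising merge_subtitle_segments; inputs are assumed examples.
segments = [
    {"start": 0.0, "end": 0.4, "text": " um"},
    {"start": 0.4, "end": 0.9, "text": " so"},
    {"start": 0.9, "end": 1.6, "text": " hello"},
    {"start": 6.2, "end": 6.8, "text": " everyone"},
]
merged = merge_subtitle_segments(segments, max_duration=5.0, max_words=15)
# The first three words merge into one 1.6 s segment; " everyone" starts a
# new segment because 6.8 - 0.0 would exceed max_duration.
assert len(merged) == 2
assert merged[0]["text"] == " um so hello"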
@@ -199,6 +234,7 @@ def process_video(video_path, task="transcribe", language=None, subtitle_format=
         return None, "Please provide a video file.", None
 
     temp_files = []
+    srt_path = None  # Initialize to prevent NameError in finally block
 
     try:
         # Extract audio from video
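Note: the srt_path pre-initialization above guards the cleanup code later in this function: if extraction or transcription fails before srt_path is assigned, comparing temp files against it would otherwise raise NameError. The pattern, stripped to its essentials (hypothetical illustration, not app code):

srt_path = None  # bind the name before any work that can fail
try:
    raise RuntimeError("simulated early failure, before srt_path is set")
except RuntimeError:
    pass
finally:
    # Safe: the name exists even though the happy path never ran.
    if srt_path is not None:
        print("would clean up around", srt_path)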
@@ -247,7 +283,7 @@ def process_video(video_path, task="transcribe", language=None, subtitle_format=
         merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)
 
         # Generate full text transcript
-        full_text = "
+        full_text = "".join([t["text"] for t in merged_transcriptions]).strip()
         transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
         transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"
 
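Note: the switch to a bare "".join works because CrisperWhisper word chunks keep their leading spaces, so plain concatenation already yields correct spacing; joining with " " would double it. A micro-example under that assumption:

parts = [" um", " so", " hello"]  # leading spaces as emitted per word
print("".join(parts).strip())    # -> "um so hello"
print(" ".join(parts).strip())   # -> "um  so  hello" (doubled spaces)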
@@ -282,24 +318,26 @@ def process_video(video_path, task="transcribe", language=None, subtitle_format=
         # Clean up temporary audio files (keep video and srt outputs)
         for temp_file in temp_files:
             try:
-
+                # srt_path could be None if an error occurs early
+                if srt_path and os.path.exists(temp_file) and temp_file != srt_path:
                     os.unlink(temp_file)
+                elif os.path.exists(temp_file):
+                    os.unlink(temp_file)
             except:
                 pass
 
-def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
+def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, export_srt=False, progress=gr.Progress()):
     """
     Transcribe audio with VERY VERBATIM output using CrisperWhisper.
-
-    - Fillers (um, uh, ah, er, mm)
-    - Pauses and hesitations
-    - Stutters and repetitions
-    - False starts
-    - Non-standard utterances
+    This model transcribes every spoken word exactly as it is, including fillers, stutters, and false starts.
     """
     if audio is None:
-        return "Please provide an audio file or recording."
-
+        return "Please provide an audio file or recording.", None
+
+    # If SRT export is requested, we must generate timestamps.
+    if export_srt:
+        return_timestamps = True
+
     temp_files = []
 
     try:
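Note: transcribe_audio now returns a (text, srt_path) pair on every code path, so the Gradio event that calls it needs a second output component. A hedged wiring sketch; the component names here are illustrative assumptions, not the app's actual variables:

srt_file_output = gr.File(label="SRT subtitle file")
transcribe_btn.click(
    fn=transcribe_audio,
    inputs=[audio_input, task_dropdown, timestamps_checkbox,
            language_dropdown, export_srt_checkbox],
    outputs=[text_output, srt_file_output],  # second slot receives the SRT path or None
)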
@@ -314,8 +352,8 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
             audio_path = temp_file.name
             temp_files.append(audio_path)
         else:
-            return "Unsupported audio format."
-
+            return "Unsupported audio format.", None
+
         # Check audio duration and slice if necessary
         duration = get_audio_duration(audio_path)
         chunk_duration = 300 # 5 minutes per chunk
@@ -326,9 +364,10 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
             temp_files.extend(audio_chunks)
         else:
             audio_chunks = [audio_path]
-
+
         # Process each chunk
-        all_transcriptions = []
+        all_word_chunks = []
+        full_text_parts = []
         total_chunks = len(audio_chunks)
 
         for idx, chunk_path in enumerate(audio_chunks):
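Note: the next hunk shifts each word's timestamps by its chunk's start offset so all times are global to the full recording; the arithmetic, assuming the fixed non-overlapping 300-second slices set up above:

chunk_duration = 300                 # seconds per chunk, as set above
idx = 2                              # third chunk
chunk_offset = idx * chunk_duration  # this chunk starts 600 s into the recording
start, end = 14.7, 15.1              # word times local to the chunk
print(start + chunk_offset, end + chunk_offset)  # 614.7 615.1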
@@ -336,50 +375,51 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
 
             result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
 
+            full_text_parts.append(result["text"])
+
             if return_timestamps and "chunks" in result:
                 chunk_offset = idx * chunk_duration
-                chunk_text = result["text"]
-                timestamp_text = []
-
                 for word_chunk in result["chunks"]:
                     start = word_chunk["timestamp"][0]
                     end = word_chunk["timestamp"][1]
                     if start is not None and end is not None:
-                        timestamp_text.append({
+                        all_word_chunks.append({
                             "start": start + chunk_offset,
                             "end": end + chunk_offset,
                             "text": word_chunk["text"]
                         })
-
-                all_transcriptions.append({
-                    "text": chunk_text,
-                    "timestamps": timestamp_text
-                })
-            else:
-                all_transcriptions.append({
-                    "text": result["text"],
-                    "timestamps": []
-                })
-
+
         # Combine all transcriptions
-        full_text = "
+        full_text = "".join(full_text_parts).strip()
 
         output = f"**Verbatim Transcription:**\n{full_text}\n"
-
-
-
-
-
-
-
+        srt_file_path = None
+
+        if return_timestamps and all_word_chunks:
+            # If timestamps are requested but not for SRT, display them in the textbox
+            if not export_srt:
+                output += "\n**Word-level Timestamps:**\n"
+                for ts in all_word_chunks:
+                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s]{ts['text']}\n"
+
+        # Generate SRT file if requested
+        if export_srt:
+            if all_word_chunks:
+                merged_transcriptions = merge_subtitle_segments(all_word_chunks, max_duration=5.0, max_words=15)
+                srt_file = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
+                create_srt_file(merged_transcriptions, srt_file)
+                srt_file_path = srt_file
+            else:
+                output += "\n**Warning:** Could not generate SRT file as word-level timestamps were not available."
+
         if duration:
             output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"
-
-        return output
+
+        return output, srt_file_path
 
     except Exception as e:
-        return f"Error during transcription: {str(e)}"
-
+        return f"Error during transcription: {str(e)}", None
+
     finally:
         # Clean up temporary files
         for temp_file in temp_files:
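Note: create_srt_file is defined elsewhere in app.py and is not part of this diff; for orientation, a minimal sketch of writing the merged segments as standard SubRip (an assumption about its behavior, not its actual code):

def write_srt_sketch(segments, path):
    # SubRip entry: 1-based index, "HH:MM:SS,mmm --> HH:MM:SS,mmm", text, blank line.
    def ts(seconds):
        total_ms = int(round(seconds * 1000))
        h, rem = divmod(total_ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            f.write(f"{i}\n{ts(seg['start'])} --> {ts(seg['end'])}\n{seg['text'].strip()}\n\n")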
@@ -475,8 +515,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             )
 
             timestamps_checkbox = gr.Checkbox(
-                label="Show word-level timestamps",
-                value=
+                label="Show word-level timestamps in text output",
+                value=False,
                 info="Display precise timing for each word"
             )
 
@@ -661,4 +701,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()