rafaaa2105 committed (verified)
Commit 706ca1e · 1 Parent(s): dd41807

Update app.py

Files changed (1):
  app.py: +338 -58
app.py CHANGED
@@ -6,6 +6,8 @@ import numpy as np
 from pydub import AudioSegment
 import tempfile
 import os
+from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
+import re
 
 # Model configuration - Using CrisperWhisper for TRUE verbatim transcription
 # CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
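
Note on the new imports: `moviepy.editor` is the moviepy 1.x entry point; moviepy 2.x removed that module and renamed much of the API used later in this diff (`fontsize` to `font_size`, `.set_position()` to `.with_position()`). A minimal guard, assuming the Space is meant to pin moviepy 1.x (this snippet is illustrative, not part of the commit):

# Illustrative guard (not in the commit): the code in this diff relies on the
# moviepy 1.x API (`moviepy.editor`, TextClip(fontsize=...), .set_position()),
# which moviepy 2.x removed or renamed, so fail fast with a clear message.
try:
    from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
except ImportError as exc:
    raise ImportError(
        "This app assumes moviepy 1.x; install it with `pip install 'moviepy<2'`"
    ) from exc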
@@ -109,6 +111,199 @@ def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
         except Exception as e2:
             raise Exception(f"Transcription failed: {str(e2)}")
 
+def create_srt_file(transcription_data, output_path):
+    """
+    Create an SRT subtitle file from transcription data.
+    """
+    with open(output_path, 'w', encoding='utf-8') as f:
+        counter = 1
+        for item in transcription_data:
+            start_time = item['start']
+            end_time = item['end']
+            text = item['text'].strip()
+
+            if text:  # Only add non-empty subtitles
+                # Convert seconds to SRT time format (HH:MM:SS,mmm)
+                start_srt = format_timestamp_srt(start_time)
+                end_srt = format_timestamp_srt(end_time)
+
+                f.write(f"{counter}\n")
+                f.write(f"{start_srt} --> {end_srt}\n")
+                f.write(f"{text}\n\n")
+                counter += 1
+
+def format_timestamp_srt(seconds):
+    """Convert seconds to SRT timestamp format."""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    millis = int((seconds % 1) * 1000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
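
To make the SRT conventions above concrete, here is a small sanity check of the two new helpers, using made-up cue data (expected output shown in comments):

# Quick sanity check of create_srt_file / format_timestamp_srt from the diff
# above; the cue data is invented for illustration.
import tempfile

segments = [
    {"start": 0.0, "end": 1.8, "text": " Um, so, uh,"},
    {"start": 1.8, "end": 4.25, "text": " I was- I was thinking"},
]

print(format_timestamp_srt(3725.5))  # -> 01:02:05,500

path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
create_srt_file(segments, path)
print(open(path, encoding="utf-8").read())
# 1
# 00:00:00,000 --> 00:00:01,800
# Um, so, uh,
#
# 2
# 00:00:01,800 --> 00:00:04,250
# I was- I was thinking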
+def extract_audio_from_video(video_path):
+    """Extract audio from video file."""
+    try:
+        video = VideoFileClip(video_path)
+        audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+        video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
+        video.close()
+        return audio_path
+    except Exception as e:
+        raise Exception(f"Failed to extract audio: {str(e)}")
+
+def burn_subtitles_to_video(video_path, transcription_data, progress=gr.Progress()):
+    """
+    Burn subtitles directly into the video.
+    """
+    try:
+        progress(0.1, desc="Loading video...")
+        video = VideoFileClip(video_path)
+
+        progress(0.3, desc="Creating subtitle clips...")
+        subtitle_clips = []
+
+        for item in transcription_data:
+            start_time = item['start']
+            end_time = item['end']
+            text = item['text'].strip()
+
+            if text and end_time > start_time:
+                # Create text clip with styling
+                txt_clip = (TextClip(
+                    text,
+                    fontsize=40,
+                    color='white',
+                    font='Arial-Bold',
+                    stroke_color='black',
+                    stroke_width=2,
+                    method='caption',
+                    size=(video.w * 0.9, None),
+                    align='center'
+                )
+                .set_position(('center', video.h * 0.85))
+                .set_start(start_time)
+                .set_duration(end_time - start_time))
+
+                subtitle_clips.append(txt_clip)
+
+        progress(0.6, desc="Compositing video with subtitles...")
+        # Composite video with subtitles
+        final_video = CompositeVideoClip([video] + subtitle_clips)
+
+        progress(0.8, desc="Rendering final video...")
+        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+        final_video.write_videofile(
+            output_path,
+            codec='libx264',
+            audio_codec='aac',
+            temp_audiofile=tempfile.NamedTemporaryFile(suffix=".m4a", delete=False).name,
+            remove_temp=True,
+            verbose=False,
+            logger=None
+        )
+
+        video.close()
+        final_video.close()
+
+        progress(1.0, desc="Done!")
+        return output_path
+
+    except Exception as e:
+        raise Exception(f"Failed to create subtitled video: {str(e)}")
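
One environmental assumption behind `burn_subtitles_to_video`: in moviepy 1.x, `TextClip(..., method='caption')` shells out to ImageMagick, which must be installed in the Space image. If moviepy cannot auto-detect the binary, it can be pointed at it explicitly; an illustrative sketch (the binary path is an assumption, not something this commit sets):

# Illustrative setup (not in the commit): moviepy 1.x renders TextClip via
# ImageMagick, so the runtime image must provide the `convert` binary.
from moviepy.config import change_settings

change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})  # assumed path

On some images ImageMagick's security policy (policy.xml) also blocks the temporary text files moviepy writes, which surfaces as "convert: not authorized" errors and has to be relaxed separately.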
+@spaces.GPU
+def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
+    """
+    Process video: extract audio, transcribe, and add subtitles.
+    """
+    if video_path is None:
+        return None, "Please provide a video file.", None
+
+    temp_files = []
+
+    try:
+        # Extract audio from video
+        progress(0, desc="Extracting audio from video...")
+        audio_path = extract_audio_from_video(video_path)
+        temp_files.append(audio_path)
+
+        # Check audio duration
+        duration = get_audio_duration(audio_path)
+        chunk_duration = 300  # 5 minutes per chunk
+
+        if duration and duration > chunk_duration:
+            progress(0.1, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
+            audio_chunks = slice_audio(audio_path, chunk_duration)
+            temp_files.extend(audio_chunks)
+        else:
+            audio_chunks = [audio_path]
+
+        # Transcribe each chunk with timestamps
+        all_transcriptions = []
+        total_chunks = len(audio_chunks)
+
+        for idx, chunk_path in enumerate(audio_chunks):
+            progress(0.1 + (idx / total_chunks) * 0.5, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
+
+            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps=True)
+
+            if "chunks" in result:
+                chunk_offset = idx * chunk_duration
+
+                for word_chunk in result["chunks"]:
+                    start = word_chunk["timestamp"][0]
+                    end = word_chunk["timestamp"][1]
+                    if start is not None and end is not None:
+                        all_transcriptions.append({
+                            "start": start + chunk_offset,
+                            "end": end + chunk_offset,
+                            "text": word_chunk["text"]
+                        })
+
+        if not all_transcriptions:
+            return None, "No transcription data available. Timestamps may have failed.", None
+
+        # Merge close timestamps for better subtitle readability
+        progress(0.6, desc="Optimizing subtitle timing...")
+        merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)
+
+        # Generate full text transcript
+        full_text = " ".join([t["text"] for t in merged_transcriptions])
+        transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
+        transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"
+
+        if subtitle_format == "burned":
+            # Burn subtitles into video
+            progress(0.7, desc="Creating video with burned-in subtitles...")
+            output_video = burn_subtitles_to_video(video_path, merged_transcriptions, progress)
+            return output_video, transcript_output, None
+
+        elif subtitle_format == "srt":
+            # Create SRT file
+            progress(0.7, desc="Creating SRT subtitle file...")
+            srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
+            create_srt_file(merged_transcriptions, srt_path)
+            return None, transcript_output, srt_path
+
+        else:  # both
+            progress(0.7, desc="Creating video with subtitles and SRT file...")
+            output_video = burn_subtitles_to_video(video_path, merged_transcriptions, progress)
+            srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
+            create_srt_file(merged_transcriptions, srt_path)
+            return output_video, transcript_output, srt_path
+
+    except Exception as e:
+        return None, f"Error processing video: {str(e)}", None
+
+    finally:
+        # Clean up temporary audio files (keep video and srt outputs)
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except:
+                pass
+
 def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
     Transcribe audio with VERY VERBATIM output using CrisperWhisper.
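
Note that `process_video` calls `merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)`, which does not appear anywhere in this diff (it presumably lives elsewhere in app.py). A minimal sketch of what such a helper plausibly does, assuming the `{"start", "end", "text"}` dicts built above: greedily grow a cue until it would exceed `max_duration` seconds or `max_words` words, then start a new one.

def merge_subtitle_segments(transcriptions, max_duration=5.0, max_words=15):
    """Greedily merge word-level chunks into subtitle-sized segments.

    Sketch only: the real helper is defined elsewhere in app.py; this
    assumes the {"start", "end", "text"} dicts built in process_video.
    """
    merged = []
    current = None
    for item in transcriptions:
        if current is None:
            current = {"start": item["start"], "end": item["end"], "text": item["text"]}
            continue
        too_long = item["end"] - current["start"] > max_duration
        too_wordy = len((current["text"] + " " + item["text"]).split()) > max_words
        if too_long or too_wordy:
            # Close the current cue and start a new one with this chunk
            merged.append(current)
            current = {"start": item["start"], "end": item["end"], "text": item["text"]}
        else:
            # Extend the current cue in time and text
            current["end"] = item["end"]
            current["text"] = current["text"].rstrip() + " " + item["text"].strip()
    if current is not None:
        merged.append(current)
    return merged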
@@ -263,75 +458,149 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
     - ✅ **Multilingual**: Supports 99+ languages
     - ✅ **Long Audio Support**: Automatic 5-minute chunking
+    - ✅ **Video Subtitles**: Automatic caption generation with burned-in or SRT output
 
     **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews,
-    conversational AI training, or any use case requiring exact speech capture.
+    conversational AI training, video subtitling, or any use case requiring exact speech capture.
     """
     )
 
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
-                label="Audio Input"
-            )
-
-            with gr.Row():
-                task_radio = gr.Radio(
-                    choices=["transcribe", "translate"],
-                    value="transcribe",
-                    label="Task",
-                    info="Transcribe verbatim or translate to English"
-                )
-
-                language_dropdown = gr.Dropdown(
-                    choices=list(LANGUAGES.keys()),
-                    value="Auto-detect",
-                    label="Language",
-                    info="Select language or use auto-detect"
-                )
-
-            timestamps_checkbox = gr.Checkbox(
-                label="Show word-level timestamps",
-                value=True,
-                info="Display precise timing for each word"
-            )
-
-            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
-
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="Verbatim Transcription (includes all um, uh, hesitations)",
-                lines=20,
-                show_copy_button=True,
-                placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
-            )
+    with gr.Tabs():
+        # Audio Tab
+        with gr.Tab("🎀 Audio Transcription"):
+            with gr.Row():
+                with gr.Column():
+                    audio_input = gr.Audio(
+                        sources=["upload", "microphone"],
+                        type="filepath",
+                        label="Audio Input"
+                    )
+
+                    with gr.Row():
+                        task_radio = gr.Radio(
+                            choices=["transcribe", "translate"],
+                            value="transcribe",
+                            label="Task",
+                            info="Transcribe verbatim or translate to English"
+                        )
+
+                        language_dropdown = gr.Dropdown(
+                            choices=list(LANGUAGES.keys()),
+                            value="Auto-detect",
+                            label="Language",
+                            info="Select language or use auto-detect"
+                        )
+
+                    timestamps_checkbox = gr.Checkbox(
+                        label="Show word-level timestamps",
+                        value=True,
+                        info="Display precise timing for each word"
+                    )
+
+                    transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
+
+                with gr.Column():
+                    output_text = gr.Textbox(
+                        label="Verbatim Transcription (includes all um, uh, hesitations)",
+                        lines=20,
+                        show_copy_button=True,
+                        placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
+                    )
+
+            gr.Markdown(
+                """
+                ### Why CrisperWhisper for Verbatim?
+
+                **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
+                - ❌ Removes "um", "uh", "ah"
+                - ❌ Omits false starts
+                - ❌ Skips repetitions
+                - ❌ Ignores stutters
+
+                **CrisperWhisper** is specifically trained for verbatim transcription:
+                - ✅ Keeps every filler word
+                - ✅ Preserves all disfluencies
+                - ✅ Captures exact speech patterns
+                - ✅ Accurate timestamps around hesitations
+                """
+            )
+
+        # Video Tab
+        with gr.Tab("🎬 Video Subtitles"):
+            with gr.Row():
+                with gr.Column():
+                    video_input = gr.Video(
+                        label="Video Input",
+                        sources=["upload"]
+                    )
+
+                    with gr.Row():
+                        video_task_radio = gr.Radio(
+                            choices=["transcribe", "translate"],
+                            value="transcribe",
+                            label="Task",
+                            info="Transcribe verbatim or translate to English"
+                        )
+
+                        video_language_dropdown = gr.Dropdown(
+                            choices=list(LANGUAGES.keys()),
+                            value="Auto-detect",
+                            label="Language",
+                            info="Select language or use auto-detect"
+                        )
+
+                    subtitle_format_radio = gr.Radio(
+                        choices=[
+                            ("Burned-in subtitles (permanent)", "burned"),
+                            ("SRT file only (external subtitles)", "srt"),
+                            ("Both burned-in video + SRT file", "both")
+                        ],
+                        value="burned",
+                        label="Subtitle Format",
+                        info="Choose output format"
+                    )
+
+                    process_video_btn = gr.Button("🎬 Generate Subtitles", variant="primary", size="lg")
+
+                with gr.Column():
+                    output_video = gr.Video(
+                        label="Video with Subtitles",
+                        interactive=False
+                    )
+
+                    video_transcript = gr.Textbox(
+                        label="Verbatim Transcript",
+                        lines=10,
+                        show_copy_button=True,
+                        placeholder="Transcript will appear here..."
+                    )
+
+                    output_srt = gr.File(
+                        label="Download SRT Subtitles",
+                        interactive=False
+                    )
+
+            gr.Markdown(
+                """
+                ### Video Subtitle Features
+
+                - **Burned-in Subtitles**: Permanently embedded in video (white text with black outline)
+                - **SRT File**: External subtitle file compatible with video players and editing software
+                - **Verbatim Captions**: All hesitations, fillers, and disfluencies included
+                - **Smart Timing**: Automatically merges short segments for readability
+                - **Long Video Support**: Handles videos of any length (automatic chunking)
+
+                ### Tips
+
+                - Use "Burned-in" for sharing videos with guaranteed subtitle visibility
+                - Use "SRT file" for flexible editing and translation
+                - Use "Both" to have both options available
+                - Subtitles are positioned at the bottom center of the video
+                """
+            )
 
     gr.Markdown(
         """
-    ### Why CrisperWhisper for Verbatim?
-
-    **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
-    - ❌ Removes "um", "uh", "ah"
-    - ❌ Omits false starts
-    - ❌ Skips repetitions
-    - ❌ Ignores stutters
-
-    **CrisperWhisper** is specifically trained for verbatim transcription:
-    - ✅ Keeps every filler word
-    - ✅ Preserves all disfluencies
-    - ✅ Captures exact speech patterns
-    - ✅ Accurate timestamps around hesitations
-
-    ### Example Comparison
-
-    **Input Audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
-
-    **Standard Whisper:** "So I was thinking that we could go to the store"
-
-    **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
-
     ### Use Cases
 
     - **Legal/Court Transcription**: Exact wording required by law
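
A detail worth noting in the new video tab: `subtitle_format_radio` uses Gradio's `(label, value)` choice tuples, so with recent Gradio versions the event handler receives the short value ("burned", "srt", or "both"), not the display label. A standalone mini-demo of that behavior (hypothetical, not part of the commit):

# Minimal standalone illustration (not in the commit) of Gradio's
# (label, value) choice tuples: the handler receives "burned"/"srt"/"both".
import gradio as gr

with gr.Blocks() as mini_demo:
    fmt = gr.Radio(
        choices=[("Burned-in subtitles (permanent)", "burned"),
                 ("SRT file only (external subtitles)", "srt"),
                 ("Both burned-in video + SRT file", "both")],
        value="burned",
        label="Subtitle Format",
    )
    echo = gr.Textbox(label="Selected value")
    fmt.change(fn=lambda v: v, inputs=fmt, outputs=echo)

if __name__ == "__main__":
    mini_demo.launch()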
@@ -339,7 +608,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - **Medical/Therapy Sessions**: Capturing patient speech patterns
     - **Interview Transcription**: Preserving speaker mannerisms
     - **Conversational AI Training**: Realistic dialogue data
-    - **Accessibility**: Providing complete transcripts for deaf/hard-of-hearing
+    - **Accessibility**: Complete transcripts and captions for deaf/hard-of-hearing
+    - **Video Content**: YouTube, social media, educational content with accurate captions
     - **Language Learning**: Analyzing natural spoken language
 
     ### Tips for Best Results
@@ -348,20 +618,30 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - The model captures quiet speech - ensure consistent audio levels
     - Manual language selection can improve accuracy
     - Long files are automatically processed in 5-minute chunks
-    - Timestamps help identify exact moments of hesitations
+    - For videos, ensure good audio quality for best subtitle accuracy
     """
     )
 
-    # Set up event handler
+    # Set up event handlers
     def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
         language_code = LANGUAGES[language_name]
         return transcribe_audio(audio, task, timestamps, language_code, progress)
 
+    def video_wrapper(video, task, language_name, subtitle_format, progress=gr.Progress()):
+        language_code = LANGUAGES[language_name]
+        return process_video(video, task, language_code, subtitle_format, progress)
+
     transcribe_btn.click(
         fn=transcribe_wrapper,
         inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
         outputs=output_text
     )
+
+    process_video_btn.click(
+        fn=video_wrapper,
+        inputs=[video_input, video_task_radio, video_language_dropdown, subtitle_format_radio],
+        outputs=[output_video, video_transcript, output_srt]
+    )
 
 # Launch the app
 if __name__ == "__main__":