rafaaa2105 committed on
Commit
b9a9520
·
verified ·
1 Parent(s): 3d89603

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -45
app.py CHANGED
@@ -190,6 +190,41 @@ def burn_subtitles_to_video(video_path, srt_path, progress=gr.Progress()):
190
  except Exception as e:
191
  raise Exception(f"Failed to create subtitled video: {str(e)}")
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  @spaces.GPU
194
  def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
195
  """
@@ -199,6 +234,7 @@ def process_video(video_path, task="transcribe", language=None, subtitle_format=
199
  return None, "Please provide a video file.", None
200
 
201
  temp_files = []
 
202
 
203
  try:
204
  # Extract audio from video
@@ -247,7 +283,7 @@ def process_video(video_path, task="transcribe", language=None, subtitle_format=
247
  merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)
248
 
249
  # Generate full text transcript
250
- full_text = " ".join([t["text"] for t in merged_transcriptions])
251
  transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
252
  transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"
253
 
@@ -282,24 +318,26 @@ def process_video(video_path, task="transcribe", language=None, subtitle_format=
282
  # Clean up temporary audio files (keep video and srt outputs)
283
  for temp_file in temp_files:
284
  try:
285
- if os.path.exists(temp_file) and temp_file not in [srt_path]:
 
286
  os.unlink(temp_file)
 
 
287
  except:
288
  pass
289
 
290
- def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
291
  """
292
  Transcribe audio with VERY VERBATIM output using CrisperWhisper.
293
- CrisperWhisper transcribes every spoken word exactly as it is, including:
294
- - Fillers (um, uh, ah, er, mm)
295
- - Pauses and hesitations
296
- - Stutters and repetitions
297
- - False starts
298
- - Non-standard utterances
299
  """
300
  if audio is None:
301
- return "Please provide an audio file or recording."
302
-
 
 
 
 
303
  temp_files = []
304
 
305
  try:
@@ -314,8 +352,8 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
314
  audio_path = temp_file.name
315
  temp_files.append(audio_path)
316
  else:
317
- return "Unsupported audio format."
318
-
319
  # Check audio duration and slice if necessary
320
  duration = get_audio_duration(audio_path)
321
  chunk_duration = 300 # 5 minutes per chunk
@@ -326,9 +364,10 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
326
  temp_files.extend(audio_chunks)
327
  else:
328
  audio_chunks = [audio_path]
329
-
330
  # Process each chunk
331
- all_transcriptions = []
 
332
  total_chunks = len(audio_chunks)
333
 
334
  for idx, chunk_path in enumerate(audio_chunks):
@@ -336,50 +375,51 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
336
 
337
  result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
338
 
 
 
339
  if return_timestamps and "chunks" in result:
340
  chunk_offset = idx * chunk_duration
341
- chunk_text = result["text"]
342
- timestamp_text = []
343
-
344
  for word_chunk in result["chunks"]:
345
  start = word_chunk["timestamp"][0]
346
  end = word_chunk["timestamp"][1]
347
  if start is not None and end is not None:
348
- timestamp_text.append({
349
  "start": start + chunk_offset,
350
  "end": end + chunk_offset,
351
  "text": word_chunk["text"]
352
  })
353
-
354
- all_transcriptions.append({
355
- "text": chunk_text,
356
- "timestamps": timestamp_text
357
- })
358
- else:
359
- all_transcriptions.append({
360
- "text": result["text"],
361
- "timestamps": []
362
- })
363
-
364
  # Combine all transcriptions
365
- full_text = " ".join([t["text"] for t in all_transcriptions])
366
 
367
  output = f"**Verbatim Transcription:**\n{full_text}\n"
368
-
369
- if return_timestamps:
370
- output += "\n**Word-level Timestamps:**\n"
371
- for trans in all_transcriptions:
372
- for ts in trans["timestamps"]:
373
- output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['text']}\n"
374
-
 
 
 
 
 
 
 
 
 
 
 
 
375
  if duration:
376
  output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"
377
-
378
- return output
379
 
380
  except Exception as e:
381
- return f"Error during transcription: {str(e)}"
382
-
383
  finally:
384
  # Clean up temporary files
385
  for temp_file in temp_files:
@@ -475,8 +515,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
475
  )
476
 
477
  timestamps_checkbox = gr.Checkbox(
478
- label="Show word-level timestamps",
479
- value=True,
480
  info="Display precise timing for each word"
481
  )
482
 
@@ -661,4 +701,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
661
 
662
  # Launch the app
663
  if __name__ == "__main__":
664
- demo.launch()
 
190
  except Exception as e:
191
  raise Exception(f"Failed to create subtitled video: {str(e)}")
192
 
193
def merge_subtitle_segments(segments, max_duration=5.0, max_words=15):
    """
    Merge small subtitle segments into larger, more readable ones.

    Word-level chunks coming from the ASR pipeline carry leading spaces in
    their text (the caller joins them with ``"".join(...).strip()``), so each
    segment's text is normalized with ``strip()`` here — otherwise the first
    segment of every merged run keeps a leading space in the subtitle line.

    Args:
        segments: list of dicts with "start"/"end" times (seconds) and "text".
        max_duration: maximum duration in seconds of a merged segment.
        max_words: maximum word count of a merged segment.

    Returns:
        A new list of segment dicts; the input segments are not modified.
    """
    if not segments:
        return []

    merged = []
    # Start with a copy of the first segment so the input is never mutated.
    current = segments[0].copy()
    current["text"] = current["text"].strip()

    for nxt in segments[1:]:
        candidate_text = current["text"] + " " + nxt["text"].strip()
        within_words = len(candidate_text.split()) <= max_words
        within_time = (nxt["end"] - current["start"]) <= max_duration

        if within_time and within_words:
            # Extend the current segment to absorb the next one.
            current["end"] = nxt["end"]
            current["text"] = candidate_text
        else:
            # Limits exceeded: flush the current segment and start fresh.
            merged.append(current)
            current = nxt.copy()
            current["text"] = current["text"].strip()

    # Don't forget the last open segment.
    merged.append(current)
    return merged
227
+
228
  @spaces.GPU
229
  def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
230
  """
 
234
  return None, "Please provide a video file.", None
235
 
236
  temp_files = []
237
+ srt_path = None # Initialize to prevent NameError in finally block
238
 
239
  try:
240
  # Extract audio from video
 
283
  merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)
284
 
285
  # Generate full text transcript
286
+ full_text = "".join([t["text"] for t in merged_transcriptions]).strip()
287
  transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
288
  transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"
289
 
 
318
  # Clean up temporary audio files (keep video and srt outputs)
319
  for temp_file in temp_files:
320
  try:
321
+ # srt_path could be None if an error occurs early
322
+ if srt_path and os.path.exists(temp_file) and temp_file != srt_path:
323
  os.unlink(temp_file)
324
+ elif os.path.exists(temp_file):
325
+ os.unlink(temp_file)
326
  except:
327
  pass
328
 
329
+ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, export_srt=False, progress=gr.Progress()):
330
  """
331
  Transcribe audio with VERY VERBATIM output using CrisperWhisper.
332
+ This model transcribes every spoken word exactly as it is, including fillers, stutters, and false starts.
 
 
 
 
 
333
  """
334
  if audio is None:
335
+ return "Please provide an audio file or recording.", None
336
+
337
+ # If SRT export is requested, we must generate timestamps.
338
+ if export_srt:
339
+ return_timestamps = True
340
+
341
  temp_files = []
342
 
343
  try:
 
352
  audio_path = temp_file.name
353
  temp_files.append(audio_path)
354
  else:
355
+ return "Unsupported audio format.", None
356
+
357
  # Check audio duration and slice if necessary
358
  duration = get_audio_duration(audio_path)
359
  chunk_duration = 300 # 5 minutes per chunk
 
364
  temp_files.extend(audio_chunks)
365
  else:
366
  audio_chunks = [audio_path]
367
+
368
  # Process each chunk
369
+ all_word_chunks = []
370
+ full_text_parts = []
371
  total_chunks = len(audio_chunks)
372
 
373
  for idx, chunk_path in enumerate(audio_chunks):
 
375
 
376
  result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
377
 
378
+ full_text_parts.append(result["text"])
379
+
380
  if return_timestamps and "chunks" in result:
381
  chunk_offset = idx * chunk_duration
 
 
 
382
  for word_chunk in result["chunks"]:
383
  start = word_chunk["timestamp"][0]
384
  end = word_chunk["timestamp"][1]
385
  if start is not None and end is not None:
386
+ all_word_chunks.append({
387
  "start": start + chunk_offset,
388
  "end": end + chunk_offset,
389
  "text": word_chunk["text"]
390
  })
391
+
 
 
 
 
 
 
 
 
 
 
392
  # Combine all transcriptions
393
+ full_text = "".join(full_text_parts).strip()
394
 
395
  output = f"**Verbatim Transcription:**\n{full_text}\n"
396
+ srt_file_path = None
397
+
398
+ if return_timestamps and all_word_chunks:
399
+ # If timestamps are requested but not for SRT, display them in the textbox
400
+ if not export_srt:
401
+ output += "\n**Word-level Timestamps:**\n"
402
+ for ts in all_word_chunks:
403
+ output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s]{ts['text']}\n"
404
+
405
+ # Generate SRT file if requested
406
+ if export_srt:
407
+ if all_word_chunks:
408
+ merged_transcriptions = merge_subtitle_segments(all_word_chunks, max_duration=5.0, max_words=15)
409
+ srt_file = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
410
+ create_srt_file(merged_transcriptions, srt_file)
411
+ srt_file_path = srt_file
412
+ else:
413
+ output += "\n**Warning:** Could not generate SRT file as word-level timestamps were not available."
414
+
415
  if duration:
416
  output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"
417
+
418
+ return output, srt_file_path
419
 
420
  except Exception as e:
421
+ return f"Error during transcription: {str(e)}", None
422
+
423
  finally:
424
  # Clean up temporary files
425
  for temp_file in temp_files:
 
515
  )
516
 
517
  timestamps_checkbox = gr.Checkbox(
518
+ label="Show word-level timestamps in text output",
519
+ value=False,
520
  info="Display precise timing for each word"
521
  )
522
 
 
701
 
702
  # Launch the app
703
  if __name__ == "__main__":
704
+ demo.launch()