rafaaa2105 committed on
Commit
dd41807
·
verified ·
1 Parent(s): 3507ce7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -7
app.py CHANGED
@@ -34,7 +34,6 @@ pipe = pipeline(
34
  feature_extractor=processor.feature_extractor,
35
  chunk_length_s=30,
36
  batch_size=8, # Reduced batch size for stability
37
- return_timestamps="word", # CrisperWhisper provides accurate word-level timestamps
38
  torch_dtype=torch_dtype,
39
  device=device,
40
  )
@@ -69,7 +68,7 @@ def slice_audio(audio_path, chunk_duration=300):
69
  return chunks
70
 
71
  @spaces.GPU
72
- def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
73
  """
74
  Transcribe a single audio chunk with CrisperWhisper.
75
  This model is specifically trained for verbatim transcription.
@@ -82,11 +81,27 @@ def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
82
  if language:
83
  generate_kwargs["language"] = language
84
 
85
- # CrisperWhisper automatically provides verbatim transcription
86
- result = pipe(audio_input, generate_kwargs=generate_kwargs)
87
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  except Exception as e:
89
- # Fallback: try without generate_kwargs if there's a tensor mismatch
90
  print(f"Error with generate_kwargs: {e}")
91
  try:
92
  result = pipe(audio_input)
@@ -141,7 +156,7 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
141
  for idx, chunk_path in enumerate(audio_chunks):
142
  progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
143
 
144
- result = transcribe_audio_chunk(chunk_path, task, language)
145
 
146
  if return_timestamps and "chunks" in result:
147
  chunk_offset = idx * chunk_duration
 
34
  feature_extractor=processor.feature_extractor,
35
  chunk_length_s=30,
36
  batch_size=8, # Reduced batch size for stability
 
37
  torch_dtype=torch_dtype,
38
  device=device,
39
  )
 
68
  return chunks
69
 
70
  @spaces.GPU
71
+ def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
72
  """
73
  Transcribe a single audio chunk with CrisperWhisper.
74
  This model is specifically trained for verbatim transcription.
 
81
  if language:
82
  generate_kwargs["language"] = language
83
 
84
+ # Only add timestamps if requested and handle the potential error
85
+ if return_timestamps:
86
+ try:
87
+ generate_kwargs["return_timestamps"] = "word"
88
+ result = pipe(audio_input, generate_kwargs=generate_kwargs)
89
+ return result
90
+ except RuntimeError as e:
91
+ if "size of tensor" in str(e):
92
+ # Fallback to chunk-level timestamps if word-level fails
93
+ print("Word-level timestamps failed, trying chunk-level...")
94
+ generate_kwargs["return_timestamps"] = True
95
+ result = pipe(audio_input, generate_kwargs=generate_kwargs)
96
+ return result
97
+ raise
98
+ else:
99
+ # No timestamps requested
100
+ result = pipe(audio_input, generate_kwargs=generate_kwargs)
101
+ return result
102
+
103
  except Exception as e:
104
+ # Last resort fallback: try with minimal parameters
105
  print(f"Error with generate_kwargs: {e}")
106
  try:
107
  result = pipe(audio_input)
 
156
  for idx, chunk_path in enumerate(audio_chunks):
157
  progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
158
 
159
+ result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
160
 
161
  if return_timestamps and "chunks" in result:
162
  chunk_offset = idx * chunk_duration