Hammedalmodel committed on
Commit
6014797
·
verified ·
1 Parent(s): 7897caf

Update app.py


Added support for video files as input
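
For reference, here is a minimal client-side sketch of how the new video input could be exercised through the `/align` FastAPI endpoint defined in `app.py`. It assumes the app is running locally with the defaults hard-coded in this revision (FastAPI on port 8000); the file names `clip.mp4` and `transcript.txt` and the use of the `requests` library are illustrative placeholders, not part of the commit.

```python
# Hedged usage sketch (not part of this commit): POST a video and its transcript
# to the /align endpoint added in app.py. Host and port follow the defaults in
# main(); the file names and the requests dependency are assumptions.
import requests

with open("clip.mp4", "rb") as media, open("transcript.txt", "rb") as text:
    resp = requests.post(
        "http://localhost:8000/align",
        files={
            "media_file": ("clip.mp4", media, "video/mp4"),
            "text_file": ("transcript.txt", text, "text/plain"),
        },
        data={"language": "en"},
    )

resp.raise_for_status()
payload = resp.json()
print(payload["status"])
for seg in payload["segments"]:
    # Each segment mirrors a row of the alignment results table.
    print(seg["Segment"], seg["Start (s)"], seg["End (s)"], seg["Text"])
```

On success the response carries the alignment summary and per-segment timings; the paths under `download_links` point at temporary files on the server, so they are mainly useful when the client runs on the same machine.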

Files changed (1)
  1. app.py +608 -513
app.py CHANGED
@@ -1,514 +1,609 @@
1
- import os
2
- import tempfile
3
- import json
4
- import pandas as pd
5
- import gradio as gr
6
- from aeneas.executetask import ExecuteTask
7
- from aeneas.task import Task
8
- import traceback
9
- import re
10
- import webvtt
11
- import threading
12
- import uvicorn
13
-
14
-
15
-
16
- def wrap_text(text, max_line_length=29):
17
- words = text.split()
18
- lines = []
19
- current_line = []
20
-
21
- for word in words:
22
- if len(' '.join(current_line + [word])) <= max_line_length:
23
- current_line.append(word)
24
- else:
25
- if current_line:
26
- lines.append(' '.join(current_line))
27
- current_line = [word]
28
-
29
- if current_line:
30
- lines.append(' '.join(current_line))
31
-
32
- return '\n'.join(lines)
33
-
34
-
35
- def segment_text_file(input_content, output_path,):
36
-
37
- words = re.findall(r'\S+', input_content)
38
- if not words:
39
- return ""
40
-
41
- result = []
42
- current_line = ""
43
-
44
- for word in words:
45
- remaining_line = ""
46
- if len(current_line) + len(word) + 1 <= 58:
47
- current_line += word + " "
48
- else:
49
- if current_line:
50
- if '.' in current_line[29:]:
51
- crr_line = current_line.split('.')
52
- remaining_line = crr_line[-1].strip()
53
- if len(crr_line) > 2:
54
- current_line = ''.join([cr + "." for cr in crr_line[:-1]])
55
- else:
56
- current_line = crr_line[0].strip() + '.'
57
-
58
- # Check wrapped lines and extract excess if any
59
- wrapped = wrap_text(current_line).split('\n')
60
- result1 = '\n'.join(wrapped[2:])
61
- if result1:
62
- moved_word = result1
63
- current_line = current_line.rstrip()
64
- if current_line.endswith(moved_word):
65
- current_line = current_line[:-(len(moved_word))].rstrip()
66
-
67
- result.append(current_line.strip())
68
- current_line = moved_word + " "
69
- else:
70
- result.append(current_line.strip())
71
- current_line = remaining_line + " " + word + " "
72
- else:
73
- current_line = remaining_line + " " + word + " "
74
-
75
- if current_line:
76
- result.append(current_line.strip())
77
-
78
- # Write segmented output
79
- with open(output_path, "w", encoding="utf-8") as f:
80
- for seg in result:
81
- f.write(seg.strip() + "\n")
82
-
83
-
84
- def convert_to_srt(fragments):
85
- def format_timestamp(seconds):
86
- h = int(seconds // 3600)
87
- m = int((seconds % 3600) // 60)
88
- s = int(seconds % 60)
89
- ms = int((seconds - int(seconds)) * 1000)
90
- return f"{h:02}:{m:02}:{s:02},{ms:03}"
91
-
92
- srt_output = []
93
- index = 1
94
- for f in fragments:
95
- start = float(f.begin)
96
- end = float(f.end)
97
- text = f.text.strip()
98
-
99
- if end <= start or not text:
100
- continue
101
-
102
-
103
- lines = wrap_text(text)
104
-
105
- srt_output.append(f"{index}")
106
- srt_output.append(f"{format_timestamp(start)} --> {format_timestamp(end)}")
107
- srt_output.append(lines)
108
- srt_output.append("") # Empty line
109
- index += 1
110
-
111
- return "\n".join(srt_output)
112
-
113
-
114
-
115
- def get_audio_file_path(audio_input):
116
- if audio_input is None:
117
- return None
118
-
119
- if isinstance(audio_input, str):
120
- return audio_input
121
- elif isinstance(audio_input, tuple) and len(audio_input) >= 2:
122
- return audio_input[1] if isinstance(audio_input[1], str) else audio_input[0]
123
- else:
124
- print(f"Debug: Unexpected audio input type: {type(audio_input)}")
125
- return str(audio_input)
126
-
127
- def get_text_file_path(text_input):
128
- if text_input is None:
129
- return None
130
-
131
- if isinstance(text_input, dict):
132
- return text_input['name']
133
- elif isinstance(text_input, str):
134
- return text_input
135
- else:
136
- print(f"Debug: Unexpected text input type: {type(text_input)}")
137
- return str(text_input)
138
-
139
- def process_alignment(audio_file, text_file, language, progress=gr.Progress()):
140
-
141
- if audio_file is None:
142
- return "❌ Please upload an audio file", None, None, ""
143
-
144
- if text_file is None:
145
- return "❌ Please upload a text file", None, None, ""
146
-
147
- # Initialize variables for cleanup
148
- temp_text_file_path = None
149
- output_file = None
150
-
151
- try:
152
- progress(0.1, desc="Initializing...")
153
-
154
- # Create temporary directory for better file handling
155
- temp_dir = tempfile.mkdtemp()
156
-
157
- # Get the text file path
158
- text_file_path = get_text_file_path(text_file)
159
- if not text_file_path:
160
- raise ValueError("Could not determine text file path")
161
-
162
- print(f"Debug: Text file path: {text_file_path}")
163
-
164
- # Verify text file exists and read content
165
- if not os.path.exists(text_file_path):
166
- raise FileNotFoundError(f"Text file not found: {text_file_path}")
167
-
168
- # Read and validate text content
169
- try:
170
- with open(text_file_path, 'r', encoding='utf-8') as f:
171
- text_content = f.read().strip()
172
- except UnicodeDecodeError:
173
- # Try with different encoding if UTF-8 fails
174
- with open(text_file_path, 'r', encoding='latin-1') as f:
175
- text_content = f.read().strip()
176
-
177
- if not text_content:
178
- raise ValueError("Text file is empty or contains only whitespace")
179
-
180
- temp_text_file_path = os.path.join(temp_dir, "input_text.txt")
181
- segment_text_file(text_content, temp_text_file_path)
182
- # Create a copy of the text file in our temp directory for Aeneas
183
-
184
- # with open(temp_text_file_path, 'w', encoding='utf-8') as f:
185
- # f.write(text_content)
186
-
187
- # Verify temp text file was created
188
- if not os.path.exists(temp_text_file_path):
189
- raise RuntimeError("Failed to create temporary text file")
190
-
191
- # Create output file path
192
- output_file = os.path.join(temp_dir, "alignment_output.json")
193
-
194
- progress(0.3, desc="Creating task configuration...")
195
-
196
- # Get the correct audio file path
197
- audio_file_path = get_audio_file_path(audio_file)
198
- if not audio_file_path:
199
- raise ValueError("Could not determine audio file path")
200
-
201
- # Verify audio file exists
202
- if not os.path.exists(audio_file_path):
203
- raise FileNotFoundError(f"Audio file not found: {audio_file_path}")
204
-
205
- # Create task configuration
206
- config_string = f"task_language={language}|is_text_type=plain|os_task_file_format=json"
207
-
208
- # Create and configure the task
209
- task = Task(config_string=config_string)
210
-
211
- # Set absolute paths
212
- task.audio_file_path_absolute = os.path.abspath(audio_file_path)
213
- task.text_file_path_absolute = os.path.abspath(temp_text_file_path)
214
- task.sync_map_file_path_absolute = os.path.abspath(output_file)
215
-
216
- progress(0.5, desc="Running alignment... This may take a while...")
217
-
218
- # Execute the alignment
219
- ExecuteTask(task).execute()
220
-
221
- progress(0.8, desc="Processing results...")
222
-
223
- # output sync map to file
224
- task.output_sync_map_file()
225
-
226
- # Check if output file was created
227
- if not os.path.exists(output_file):
228
- raise RuntimeError(f"Alignment output file was not created: {output_file}")
229
-
230
- # Read and process results
231
- with open(output_file, 'r', encoding='utf-8') as f:
232
- results = json.load(f)
233
-
234
-
235
- # Read output and convert to SRT
236
- fragments = task.sync_map.fragments
237
- srt_content = convert_to_srt(fragments)
238
-
239
-
240
- srt_path = os.path.join(temp_dir, "output.srt")
241
- vtt_path = os.path.join(temp_dir, "output.vtt")
242
- with open(srt_path, "w", encoding="utf-8") as f:
243
- f.write(srt_content)
244
-
245
- webvtt.from_srt(srt_path).save()
246
-
247
- if 'fragments' not in results or not results['fragments']:
248
- raise RuntimeError("No alignment fragments found in results")
249
-
250
- # Create DataFrame for display
251
- df_data = []
252
- for i, fragment in enumerate(results['fragments']):
253
- start_time = float(fragment['begin'])
254
- end_time = float(fragment['end'])
255
- duration = end_time - start_time
256
- text = fragment['lines'][0] if fragment['lines'] else ""
257
-
258
- df_data.append({
259
- 'Segment': i + 1,
260
- 'Start (s)': f"{start_time:.3f}",
261
- 'End (s)': f"{end_time:.3f}",
262
- 'Duration (s)': f"{duration:.3f}",
263
- 'Text': text
264
- })
265
-
266
- df = pd.DataFrame(df_data)
267
-
268
- # Create summary
269
- total_duration = float(results['fragments'][-1]['end']) if results['fragments'] else 0
270
- avg_segment_length = total_duration / len(results['fragments']) if results['fragments'] else 0
271
-
272
- summary = f"""
273
- 📊 **Alignment Summary**
274
- - **Total segments:** {len(results['fragments'])}
275
- - **Total duration:** {total_duration:.3f} seconds
276
- - **Average segment length:** {avg_segment_length:.3f} seconds
277
- - **Language:** {language}
278
- """
279
-
280
- progress(1.0, desc="Complete!")
281
-
282
- print(f"Debug: Alignment completed successfully with {len(results['fragments'])} fragments")
283
-
284
- return (
285
- "βœ… Alignment completed successfully!",
286
- df,
287
- output_file, # For download
288
- summary,
289
- srt_path,
290
- vtt_path
291
- )
292
-
293
- except Exception as e:
294
- print(f"Debug: Exception occurred: {str(e)}")
295
- print(f"Debug: Traceback: {traceback.format_exc()}")
296
-
297
- error_msg = f"❌ Error during alignment: {str(e)}\n\n"
298
- error_msg += "**Troubleshooting tips:**\n"
299
- error_msg += "- Ensure audio file is in WAV format\n"
300
- error_msg += "- Ensure text file contains the spoken content\n"
301
- error_msg += "- Check that text file is in UTF-8 or Latin-1 encoding\n"
302
- error_msg += "- Verify both audio and text files are not corrupted\n"
303
- error_msg += "- Try with a shorter audio/text pair first\n"
304
- error_msg += "- Make sure Aeneas dependencies are properly installed\n"
305
-
306
- if temp_text_file_path:
307
- error_msg += f"- Text file was processed from: {text_file_path}\n"
308
-
309
- error_msg += f"\n**Technical details:**\n```\n{traceback.format_exc()}\n```"
310
-
311
- return error_msg, None, None, "", None
312
-
313
- finally:
314
- # Clean up temporary files
315
- try:
316
- if temp_text_file_path and os.path.exists(temp_text_file_path):
317
- os.unlink(temp_text_file_path)
318
- print(f"Debug: Cleaned up temp text file: {temp_text_file_path}")
319
- except Exception as cleanup_error:
320
- print(f"Debug: Error cleaning up temp text file: {cleanup_error}")
321
-
322
-
323
- def create_interface():
324
-
325
- with gr.Blocks(title="Aeneas Forced Alignment Tool", theme=gr.themes.Soft()) as interface:
326
- gr.Markdown("""
327
- # 🎯 Aeneas Forced Alignment Tool
328
-
329
- Upload an audio file and provide the corresponding text to generate precise time alignments.
330
- Perfect for creating subtitles, analyzing speech patterns, or preparing training data.
331
- """)
332
-
333
- with gr.Row():
334
- with gr.Column(scale=1):
335
- gr.Markdown("### 📁 Input Files")
336
-
337
- audio_input = gr.Audio(
338
- label="Audio File",
339
- type="filepath",
340
- format="wav"
341
- )
342
-
343
- text_input = gr.File(
344
- label="Text File (.txt)",
345
- file_types=[".txt"],
346
- file_count="single"
347
- )
348
-
349
-
350
- gr.Markdown("### ⚙️ Configuration")
351
-
352
- language_input = gr.Dropdown(
353
- choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ar"],
354
- value="en",
355
- label="Language Code",
356
- info="ISO language code (en=English, es=Spanish, etc.)"
357
- )
358
-
359
-
360
- process_btn = gr.Button("🚀 Process Alignment", variant="primary", size="lg")
361
-
362
- with gr.Column(scale=2):
363
- gr.Markdown("### 📊 Results")
364
-
365
- status_output = gr.Markdown()
366
- summary_output = gr.Markdown()
367
-
368
- results_output = gr.Dataframe(
369
- label="Alignment Results",
370
- headers=["Segment", "Start (s)", "End (s)", "Duration (s)", "Text"],
371
- datatype=["number", "str", "str", "str", "str"],
372
- interactive=False
373
- )
374
-
375
- download_output = gr.File(
376
- label="Download JSON Results",
377
- visible=False
378
- )
379
-
380
- srt_file_output = gr.File(
381
- label="Download SRT File",
382
- visible=False
383
- )
384
-
385
- vtt_file_output = gr.File(
386
- label="Download VTT File",
387
- visible=False
388
- )
389
-
390
-
391
- # Event handlers
392
-
393
- process_btn.click(
394
- fn=process_alignment,
395
- inputs=[
396
- audio_input,
397
- text_input,
398
- language_input,
399
- ],
400
- outputs=[
401
- status_output,
402
- results_output,
403
- download_output,
404
- summary_output,
405
- srt_file_output,
406
- vtt_file_output
407
- ]
408
- ).then(
409
- fn=lambda x: gr.update(visible=x is not None),
410
- inputs=download_output,
411
- outputs=download_output
412
- ).then(
413
- fn=lambda x: gr.update(visible=x is not None),
414
- inputs=srt_file_output,
415
- outputs=srt_file_output
416
- ).then(
417
- fn=lambda x: gr.update(visible=x is not None),
418
- inputs=vtt_file_output,
419
- outputs=vtt_file_output
420
- )
421
-
422
-
423
-
424
- return interface
425
-
426
- def run_fastapi():
427
- uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)
428
-
429
- def main():
430
- try:
431
- threading.Thread(target=run_fastapi, daemon=True).start()
432
-
433
- interface = create_interface()
434
- print("🚀 Starting Gradio UI on http://localhost:7860")
435
- print("🧠 FastAPI JSON endpoint available at http://localhost:8000/align")
436
-
437
- interface.launch(
438
- server_name="0.0.0.0",
439
- server_port=7860,
440
- share=False,
441
- debug=False
442
- )
443
-
444
- except ImportError as e:
445
- print("❌ Missing dependency:", e)
446
- except Exception as e:
447
- print("❌ Error launching application:", e)
448
-
449
-
450
- from fastapi import FastAPI, UploadFile, File, Form
451
- from fastapi.responses import JSONResponse
452
- from fastapi.middleware.cors import CORSMiddleware
453
- import shutil
454
-
455
- fastapi_app = FastAPI()
456
-
457
- fastapi_app.add_middleware(
458
- CORSMiddleware,
459
- allow_origins=["*"],
460
- allow_credentials=True,
461
- allow_methods=["*"],
462
- allow_headers=["*"],
463
- )
464
-
465
- @fastapi_app.post("/align")
466
- async def align_api(
467
- audio_file: UploadFile = File(...),
468
- text_file: UploadFile = File(...),
469
- language: str = Form(default="en")
470
- ):
471
- try:
472
- if not text_file.filename.endswith(".txt"):
473
- return JSONResponse(
474
- status_code=400,
475
- content={"error": "Text file must be a .txt file"}
476
- )
477
-
478
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio_file.filename)[-1]) as temp_audio:
479
- shutil.copyfileobj(audio_file.file, temp_audio)
480
- audio_path = temp_audio.name
481
-
482
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode='w+', encoding='utf-8') as temp_text:
483
- content = (await text_file.read()).decode('utf-8', errors='ignore')
484
- temp_text.write(content)
485
- temp_text.flush()
486
- text_path = temp_text.name
487
-
488
- status, df, json_path, summary, srt_path, vtt_path = process_alignment(audio_path, text_path, language)
489
-
490
- if "Error" in status or status.startswith("❌"):
491
- return JSONResponse(status_code=500, content={"error": status})
492
-
493
- response = {
494
- "status": status,
495
- "summary": summary,
496
- "segments": df.to_dict(orient="records") if df is not None else [],
497
- "download_links": {
498
- "alignment_json": json_path,
499
- "srt": srt_path,
500
- "vtt": vtt_path
501
- }
502
- }
503
-
504
- return JSONResponse(status_code=200, content=response)
505
-
506
- except Exception as e:
507
- return JSONResponse(
508
- status_code=500,
509
- content={"error": f"Unexpected server error: {str(e)}"}
510
- )
511
-
512
-
513
- if __name__ == "__main__":
514
  main()
 
1
+ import os
2
+ import tempfile
3
+ import json
4
+ import pandas as pd
5
+ import gradio as gr
6
+ from aeneas.executetask import ExecuteTask
7
+ from aeneas.task import Task
8
+ import traceback
9
+ import re
10
+ import webvtt
11
+ import threading
12
+ import uvicorn
13
+ import subprocess
14
+ import shutil
15
+ from pathlib import Path
16
+
17
+
18
+
19
+ def wrap_text(text, max_line_length=29):
20
+ words = text.split()
21
+ lines = []
22
+ current_line = []
23
+
24
+ for word in words:
25
+ if len(' '.join(current_line + [word])) <= max_line_length:
26
+ current_line.append(word)
27
+ else:
28
+ if current_line:
29
+ lines.append(' '.join(current_line))
30
+ current_line = [word]
31
+
32
+ if current_line:
33
+ lines.append(' '.join(current_line))
34
+
35
+ return '\n'.join(lines)
36
+
37
+
38
+ def segment_text_file(input_content, output_path,):
39
+
40
+ words = re.findall(r'\S+', input_content)
41
+ if not words:
42
+ return ""
43
+
44
+ result = []
45
+ current_line = ""
46
+
47
+ for word in words:
48
+ remaining_line = ""
49
+ if len(current_line) + len(word) + 1 <= 58:
50
+ current_line += word + " "
51
+ else:
52
+ if current_line:
53
+ if '.' in current_line[29:]:
54
+ crr_line = current_line.split('.')
55
+ remaining_line = crr_line[-1].strip()
56
+ if len(crr_line) > 2:
57
+ current_line = ''.join([cr + "." for cr in crr_line[:-1]])
58
+ else:
59
+ current_line = crr_line[0].strip() + '.'
60
+
61
+ # Check wrapped lines and extract excess if any
62
+ wrapped = wrap_text(current_line).split('\n')
63
+ result1 = '\n'.join(wrapped[2:])
64
+ if result1:
65
+ moved_word = result1
66
+ current_line = current_line.rstrip()
67
+ if current_line.endswith(moved_word):
68
+ current_line = current_line[:-(len(moved_word))].rstrip()
69
+
70
+ result.append(current_line.strip())
71
+ current_line = moved_word + " "
72
+ else:
73
+ result.append(current_line.strip())
74
+ current_line = remaining_line + " " + word + " "
75
+ else:
76
+ current_line = remaining_line + " " + word + " "
77
+
78
+ if current_line:
79
+ result.append(current_line.strip())
80
+
81
+ # Write segmented output
82
+ with open(output_path, "w", encoding="utf-8") as f:
83
+ for seg in result:
84
+ f.write(seg.strip() + "\n")
85
+
86
+
87
+ def convert_to_srt(fragments):
88
+ def format_timestamp(seconds):
89
+ h = int(seconds // 3600)
90
+ m = int((seconds % 3600) // 60)
91
+ s = int(seconds % 60)
92
+ ms = int((seconds - int(seconds)) * 1000)
93
+ return f"{h:02}:{m:02}:{s:02},{ms:03}"
94
+
95
+ srt_output = []
96
+ index = 1
97
+ for f in fragments:
98
+ start = float(f.begin)
99
+ end = float(f.end)
100
+ text = f.text.strip()
101
+
102
+ if end <= start or not text:
103
+ continue
104
+
105
+ lines = wrap_text(text)
106
+
107
+ srt_output.append(f"{index}")
108
+ srt_output.append(f"{format_timestamp(start)} --> {format_timestamp(end)}")
109
+ srt_output.append(lines)
110
+ srt_output.append("") # Empty line
111
+ index += 1
112
+
113
+ return "\n".join(srt_output)
114
+
115
+
116
+ def check_ffmpeg():
117
+ """Check if FFmpeg is available on the system"""
118
+ try:
119
+ subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
120
+ return True
121
+ except (subprocess.CalledProcessError, FileNotFoundError):
122
+ return False
123
+
124
+
125
+ def is_video_file(file_path):
126
+ """Check if the file is a video file based on extension"""
127
+ video_extensions = {'.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.m4v', '.3gp', '.mpg', '.mpeg'}
128
+ return Path(file_path).suffix.lower() in video_extensions
129
+
130
+
131
+ def is_audio_file(file_path):
132
+ """Check if the file is an audio file based on extension"""
133
+ audio_extensions = {'.wav', '.mp3', '.flac', '.aac', '.ogg', '.wma', '.m4a', '.opus'}
134
+ return Path(file_path).suffix.lower() in audio_extensions
135
+
136
+
137
+ def convert_video_to_audio(video_path, output_path):
138
+ """Convert video file to audio using FFmpeg"""
139
+ try:
140
+ # Use FFmpeg to extract audio from video
141
+ cmd = [
142
+ 'ffmpeg', '-i', video_path,
143
+ '-vn', # No video
144
+ '-acodec', 'libmp3lame', # MP3 codec
145
+ '-ab', '192k', # Audio bitrate
146
+ '-ar', '44100', # Sample rate
147
+ '-y', # Overwrite output file
148
+ output_path
149
+ ]
150
+
151
+ result = subprocess.run(cmd, capture_output=True, text=True)
152
+
153
+ if result.returncode != 0:
154
+ raise RuntimeError(f"FFmpeg conversion failed: {result.stderr}")
155
+
156
+ return True
157
+ except Exception as e:
158
+ raise RuntimeError(f"Error converting video to audio: {str(e)}")
159
+
160
+
161
+ def get_media_file_path(media_input):
162
+ """Get file path from media input (audio or video)"""
163
+ if media_input is None:
164
+ return None
165
+
166
+ if isinstance(media_input, str):
167
+ return media_input
168
+ elif isinstance(media_input, tuple) and len(media_input) >= 2:
169
+ return media_input[1] if isinstance(media_input[1], str) else media_input[0]
170
+ else:
171
+ print(f"Debug: Unexpected media input type: {type(media_input)}")
172
+ return str(media_input)
173
+
174
+
175
+ def get_text_file_path(text_input):
176
+ if text_input is None:
177
+ return None
178
+
179
+ if isinstance(text_input, dict):
180
+ return text_input['name']
181
+ elif isinstance(text_input, str):
182
+ return text_input
183
+ else:
184
+ print(f"Debug: Unexpected text input type: {type(text_input)}")
185
+ return str(text_input)
186
+
187
+
188
+ def process_alignment(media_file, text_file, language, progress=gr.Progress()):
189
+
190
+ if media_file is None:
191
+ return "❌ Please upload an audio or video file", None, None, "", None, None
192
+
193
+ if text_file is None:
194
+ return "❌ Please upload a text file", None, None, "", None, None
195
+
196
+ # Check if FFmpeg is available
197
+ if not check_ffmpeg():
198
+ return "❌ FFmpeg not found. Please install FFmpeg to process video files.", None, None, "", None, None
199
+
200
+ # Initialize variables for cleanup
201
+ temp_text_file_path = None
202
+ temp_audio_file_path = None
203
+ output_file = None
204
+
205
+ try:
206
+ progress(0.1, desc="Initializing...")
207
+
208
+ # Create temporary directory for better file handling
209
+ temp_dir = tempfile.mkdtemp()
210
+
211
+ # Get the media file path
212
+ media_file_path = get_media_file_path(media_file)
213
+ if not media_file_path:
214
+ raise ValueError("Could not determine media file path")
215
+
216
+ # Verify media file exists
217
+ if not os.path.exists(media_file_path):
218
+ raise FileNotFoundError(f"Media file not found: {media_file_path}")
219
+
220
+ # Process media file - convert video to audio if needed
221
+ if is_video_file(media_file_path):
222
+ progress(0.2, desc="Converting video to audio...")
223
+ temp_audio_file_path = os.path.join(temp_dir, "extracted_audio.mp3")
224
+ convert_video_to_audio(media_file_path, temp_audio_file_path)
225
+ audio_file_path = temp_audio_file_path
226
+ print(f"Debug: Video converted to audio: {audio_file_path}")
227
+ elif is_audio_file(media_file_path):
228
+ audio_file_path = media_file_path
229
+ print(f"Debug: Using audio file directly: {audio_file_path}")
230
+ else:
231
+ raise ValueError("Unsupported file format. Please provide an audio or video file.")
232
+
233
+ # Get the text file path
234
+ text_file_path = get_text_file_path(text_file)
235
+ if not text_file_path:
236
+ raise ValueError("Could not determine text file path")
237
+
238
+ print(f"Debug: Text file path: {text_file_path}")
239
+
240
+ # Verify text file exists and read content
241
+ if not os.path.exists(text_file_path):
242
+ raise FileNotFoundError(f"Text file not found: {text_file_path}")
243
+
244
+ # Read and validate text content
245
+ try:
246
+ with open(text_file_path, 'r', encoding='utf-8') as f:
247
+ text_content = f.read().strip()
248
+ except UnicodeDecodeError:
249
+ # Try with different encoding if UTF-8 fails
250
+ with open(text_file_path, 'r', encoding='latin-1') as f:
251
+ text_content = f.read().strip()
252
+
253
+ if not text_content:
254
+ raise ValueError("Text file is empty or contains only whitespace")
255
+
256
+ progress(0.3, desc="Processing text file...")
257
+
258
+ temp_text_file_path = os.path.join(temp_dir, "input_text.txt")
259
+ segment_text_file(text_content, temp_text_file_path)
260
+
261
+ # Verify temp text file was created
262
+ if not os.path.exists(temp_text_file_path):
263
+ raise RuntimeError("Failed to create temporary text file")
264
+
265
+ # Create output file path
266
+ output_file = os.path.join(temp_dir, "alignment_output.json")
267
+
268
+ progress(0.4, desc="Creating task configuration...")
269
+
270
+ # Create task configuration
271
+ config_string = f"task_language={language}|is_text_type=plain|os_task_file_format=json"
272
+
273
+ # Create and configure the task
274
+ task = Task(config_string=config_string)
275
+
276
+ # Set absolute paths
277
+ task.audio_file_path_absolute = os.path.abspath(audio_file_path)
278
+ task.text_file_path_absolute = os.path.abspath(temp_text_file_path)
279
+ task.sync_map_file_path_absolute = os.path.abspath(output_file)
280
+
281
+ progress(0.5, desc="Running alignment... This may take a while...")
282
+
283
+ # Execute the alignment
284
+ ExecuteTask(task).execute()
285
+
286
+ progress(0.8, desc="Processing results...")
287
+
288
+ # output sync map to file
289
+ task.output_sync_map_file()
290
+
291
+ # Check if output file was created
292
+ if not os.path.exists(output_file):
293
+ raise RuntimeError(f"Alignment output file was not created: {output_file}")
294
+
295
+ # Read and process results
296
+ with open(output_file, 'r', encoding='utf-8') as f:
297
+ results = json.load(f)
298
+
299
+ # Read output and convert to SRT
300
+ fragments = task.sync_map.fragments
301
+ srt_content = convert_to_srt(fragments)
302
+
303
+ srt_path = os.path.join(temp_dir, "output.srt")
304
+ vtt_path = os.path.join(temp_dir, "output.vtt")
305
+ with open(srt_path, "w", encoding="utf-8") as f:
306
+ f.write(srt_content)
307
+
308
+ webvtt.from_srt(srt_path).save()
309
+
310
+ if 'fragments' not in results or not results['fragments']:
311
+ raise RuntimeError("No alignment fragments found in results")
312
+
313
+ # Create DataFrame for display
314
+ df_data = []
315
+ for i, fragment in enumerate(results['fragments']):
316
+ start_time = float(fragment['begin'])
317
+ end_time = float(fragment['end'])
318
+ duration = end_time - start_time
319
+ text = fragment['lines'][0] if fragment['lines'] else ""
320
+
321
+ df_data.append({
322
+ 'Segment': i + 1,
323
+ 'Start (s)': f"{start_time:.3f}",
324
+ 'End (s)': f"{end_time:.3f}",
325
+ 'Duration (s)': f"{duration:.3f}",
326
+ 'Text': text
327
+ })
328
+
329
+ df = pd.DataFrame(df_data)
330
+
331
+ # Create summary
332
+ total_duration = float(results['fragments'][-1]['end']) if results['fragments'] else 0
333
+ avg_segment_length = total_duration / len(results['fragments']) if results['fragments'] else 0
334
+
335
+ file_type = "video" if is_video_file(media_file_path) else "audio"
336
+
337
+ summary = f"""
338
+ 📊 **Alignment Summary**
339
+ - **Input type:** {file_type.title()} file
340
+ - **Total segments:** {len(results['fragments'])}
341
+ - **Total duration:** {total_duration:.3f} seconds
342
+ - **Average segment length:** {avg_segment_length:.3f} seconds
343
+ - **Language:** {language}
344
+ """
345
+
346
+ progress(1.0, desc="Complete!")
347
+
348
+ print(f"Debug: Alignment completed successfully with {len(results['fragments'])} fragments")
349
+
350
+ return (
351
+ "βœ… Alignment completed successfully!",
352
+ df,
353
+ output_file, # For download
354
+ summary,
355
+ srt_path,
356
+ vtt_path
357
+ )
358
+
359
+ except Exception as e:
360
+ print(f"Debug: Exception occurred: {str(e)}")
361
+ print(f"Debug: Traceback: {traceback.format_exc()}")
362
+
363
+ error_msg = f"❌ Error during alignment: {str(e)}\n\n"
364
+ error_msg += "**Troubleshooting tips:**\n"
365
+ error_msg += "- Ensure media file is in supported format (audio: WAV, MP3, FLAC, etc. | video: MP4, AVI, MKV, etc.)\n"
366
+ error_msg += "- Ensure text file contains the spoken content\n"
367
+ error_msg += "- Check that text file is in UTF-8 or Latin-1 encoding\n"
368
+ error_msg += "- Verify both media and text files are not corrupted\n"
369
+ error_msg += "- Try with a shorter audio/video/text pair first\n"
370
+ error_msg += "- Make sure FFmpeg and Aeneas dependencies are properly installed\n"
371
+ error_msg += "- For video files, ensure they contain audio tracks\n"
372
+
373
+ if temp_text_file_path:
374
+ error_msg += f"- Text file was processed from: {text_file_path}\n"
375
+
376
+ error_msg += f"\n**Technical details:**\n```\n{traceback.format_exc()}\n```"
377
+
378
+ return error_msg, None, None, "", None, None
379
+
380
+ finally:
381
+ # Clean up temporary files
382
+ try:
383
+ if temp_text_file_path and os.path.exists(temp_text_file_path):
384
+ os.unlink(temp_text_file_path)
385
+ if temp_audio_file_path and os.path.exists(temp_audio_file_path):
386
+ os.unlink(temp_audio_file_path)
387
+ print(f"Debug: Cleaned up temporary files")
388
+ except Exception as cleanup_error:
389
+ print(f"Debug: Error cleaning up temporary files: {cleanup_error}")
390
+
391
+
392
+ def create_interface():
393
+
394
+ with gr.Blocks(title="Aeneas Forced Alignment Tool", theme=gr.themes.Soft()) as interface:
395
+ gr.Markdown("""
396
+ # 🎯 Aeneas Forced Alignment Tool
397
+
398
+ Upload an audio or video file and provide the corresponding text to generate precise time alignments.
399
+ Perfect for creating subtitles, analyzing speech patterns, or preparing training data.
400
+
401
+ **Supported formats:**
402
+ - **Audio:** WAV, MP3, FLAC, AAC, OGG, WMA, M4A, OPUS
403
+ - **Video:** MP4, AVI, MKV, MOV, WMV, FLV, WebM, M4V, 3GP, MPG, MPEG
404
+ """)
405
+
406
+ with gr.Row():
407
+ with gr.Column(scale=1):
408
+ gr.Markdown("### 📁 Input Files")
409
+
410
+ media_input = gr.File(
411
+ label="Audio or Video File",
412
+ file_types=[
413
+ ".wav", ".mp3", ".flac", ".aac", ".ogg", ".wma", ".m4a", ".opus", # Audio
414
+ ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".m4v", ".3gp", ".mpg", ".mpeg" # Video
415
+ ],
416
+ file_count="single"
417
+ )
418
+
419
+ text_input = gr.File(
420
+ label="Text File (.txt)",
421
+ file_types=[".txt"],
422
+ file_count="single"
423
+ )
424
+
425
+ gr.Markdown("### ⚙️ Configuration")
426
+
427
+ language_input = gr.Dropdown(
428
+ choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ar"],
429
+ value="en",
430
+ label="Language Code",
431
+ info="ISO language code (en=English, es=Spanish, etc.)"
432
+ )
433
+
434
+ process_btn = gr.Button("🚀 Process Alignment", variant="primary", size="lg")
435
+
436
+ with gr.Column(scale=2):
437
+ gr.Markdown("### 📊 Results")
438
+
439
+ status_output = gr.Markdown()
440
+ summary_output = gr.Markdown()
441
+
442
+ results_output = gr.Dataframe(
443
+ label="Alignment Results",
444
+ headers=["Segment", "Start (s)", "End (s)", "Duration (s)", "Text"],
445
+ datatype=["number", "str", "str", "str", "str"],
446
+ interactive=False
447
+ )
448
+
449
+ download_output = gr.File(
450
+ label="Download JSON Results",
451
+ visible=False
452
+ )
453
+
454
+ srt_file_output = gr.File(
455
+ label="Download SRT File",
456
+ visible=False
457
+ )
458
+
459
+ vtt_file_output = gr.File(
460
+ label="Download VTT File",
461
+ visible=False
462
+ )
463
+
464
+ # Event handlers
465
+ process_btn.click(
466
+ fn=process_alignment,
467
+ inputs=[
468
+ media_input,
469
+ text_input,
470
+ language_input,
471
+ ],
472
+ outputs=[
473
+ status_output,
474
+ results_output,
475
+ download_output,
476
+ summary_output,
477
+ srt_file_output,
478
+ vtt_file_output
479
+ ]
480
+ ).then(
481
+ fn=lambda x: gr.update(visible=x is not None),
482
+ inputs=download_output,
483
+ outputs=download_output
484
+ ).then(
485
+ fn=lambda x: gr.update(visible=x is not None),
486
+ inputs=srt_file_output,
487
+ outputs=srt_file_output
488
+ ).then(
489
+ fn=lambda x: gr.update(visible=x is not None),
490
+ inputs=vtt_file_output,
491
+ outputs=vtt_file_output
492
+ )
493
+
494
+ return interface
495
+
496
+
497
+ def run_fastapi():
498
+ uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)
499
+
500
+
501
+ def main():
502
+ try:
503
+ threading.Thread(target=run_fastapi, daemon=True).start()
504
+
505
+ interface = create_interface()
506
+ print("🚀 Starting Gradio UI on http://localhost:7860")
507
+ print("🧠 FastAPI JSON endpoint available at http://localhost:8000/align")
508
+
509
+ interface.launch(
510
+ server_name="0.0.0.0",
511
+ server_port=7860,
512
+ share=False,
513
+ debug=False
514
+ )
515
+
516
+ except ImportError as e:
517
+ print("❌ Missing dependency:", e)
518
+ except Exception as e:
519
+ print("❌ Error launching application:", e)
520
+
521
+
522
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
523
+ from fastapi.responses import JSONResponse
524
+ from fastapi.middleware.cors import CORSMiddleware
525
+
526
+ fastapi_app = FastAPI()
527
+
528
+ fastapi_app.add_middleware(
529
+ CORSMiddleware,
530
+ allow_origins=["*"],
531
+ allow_credentials=True,
532
+ allow_methods=["*"],
533
+ allow_headers=["*"],
534
+ )
535
+
536
+ @fastapi_app.post("/align")
537
+ async def align_api(
538
+ media_file: UploadFile = File(...),
539
+ text_file: UploadFile = File(...),
540
+ language: str = Form(default="en")
541
+ ):
542
+ try:
543
+ # Validate text file
544
+ if not text_file.filename.endswith(".txt"):
545
+ raise HTTPException(
546
+ status_code=400,
547
+ detail="Text file must be a .txt file"
548
+ )
549
+
550
+ # Check if media file is supported
551
+ media_filename = media_file.filename.lower()
552
+ audio_extensions = {'.wav', '.mp3', '.flac', '.aac', '.ogg', '.wma', '.m4a', '.opus'}
553
+ video_extensions = {'.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.m4v', '.3gp', '.mpg', '.mpeg'}
554
+
555
+ file_ext = Path(media_filename).suffix.lower()
556
+ if file_ext not in audio_extensions and file_ext not in video_extensions:
557
+ raise HTTPException(
558
+ status_code=400,
559
+ detail=f"Unsupported media file format: {file_ext}. Supported formats: {', '.join(sorted(audio_extensions | video_extensions))}"
560
+ )
561
+
562
+ # Save uploaded files temporarily
563
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_media:
564
+ shutil.copyfileobj(media_file.file, temp_media)
565
+ media_path = temp_media.name
566
+
567
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode='w+', encoding='utf-8') as temp_text:
568
+ content = (await text_file.read()).decode('utf-8', errors='ignore')
569
+ temp_text.write(content)
570
+ temp_text.flush()
571
+ text_path = temp_text.name
572
+
573
+ # Process alignment
574
+ status, df, json_path, summary, srt_path, vtt_path = process_alignment(media_path, text_path, language)
575
+
576
+ # Clean up uploaded files
577
+ try:
578
+ os.unlink(media_path)
579
+ os.unlink(text_path)
580
+ except Exception as cleanup_error:
581
+ print(f"Warning: Error cleaning up uploaded files: {cleanup_error}")
582
+
583
+ if "Error" in status or status.startswith("❌"):
584
+ raise HTTPException(status_code=500, detail=status)
585
+
586
+ response = {
587
+ "status": status,
588
+ "summary": summary,
589
+ "segments": df.to_dict(orient="records") if df is not None else [],
590
+ "download_links": {
591
+ "alignment_json": json_path,
592
+ "srt": srt_path,
593
+ "vtt": vtt_path
594
+ }
595
+ }
596
+
597
+ return JSONResponse(status_code=200, content=response)
598
+
599
+ except HTTPException:
600
+ raise
601
+ except Exception as e:
602
+ raise HTTPException(
603
+ status_code=500,
604
+ detail=f"Unexpected server error: {str(e)}"
605
+ )
606
+
607
+
608
+ if __name__ == "__main__":
609
  main()