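# NOTE: the text "Spaces: Running" that preceded this file appears to be status
# text captured from the hosting page; it is not part of the program.
# app.py - Main Gradio application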
import logging
import os
import re
import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import torch
import whisper
import yt_dlp
from transformers import MarianMTModel, MarianTokenizer
class SubtitleTranslator:
    """Generate English SRT subtitles from a video file or a YouTube URL.

    Pipeline: obtain an audio file (yt-dlp download or ffmpeg extraction),
    transcribe it with Whisper, translate every segment with a MarianMT
    model, and write the result as an SRT file.
    """

    _log = logging.getLogger(__name__)

    # Multilingual -> English model used when the dedicated pair cannot load.
    _FALLBACK_MODEL = "Helsinki-NLP/opus-mt-mul-en"

    # Language code mapping for translation models (language name -> code).
    _LANGUAGE_CODES = {
        'spanish': 'es', 'french': 'fr', 'german': 'de', 'italian': 'it',
        'portuguese': 'pt', 'russian': 'ru', 'chinese': 'zh', 'japanese': 'ja',
        'korean': 'ko', 'arabic': 'ar', 'hindi': 'hi', 'dutch': 'nl',
        'swedish': 'sv', 'norwegian': 'no', 'danish': 'da', 'finnish': 'fi',
    }

    def __init__(self):
        # Use the smallest Whisper model for speed
        self.whisper_model = whisper.load_model("tiny")
        # Translation model cache, keyed by Hugging Face model name
        self.translation_models = {}
        self.tokenizers = {}

    def download_youtube_audio(self, url, work_dir=None):
        """Download audio from a YouTube video as MP3.

        Args:
            url: Video URL handed to yt-dlp.
            work_dir: Directory to download into. When omitted, a fresh
                temporary directory is created and the caller owns its cleanup.

        Returns:
            Path of the downloaded MP3, or None when the download failed.
        """
        if work_dir is None:
            work_dir = tempfile.mkdtemp(prefix="subtitle_yt_")
        ydl_opts = {
            'format': 'bestaudio/best',
            # Download into a private directory so parallel requests (or a
            # leftover file from an earlier run) can never be picked up.
            'outtmpl': os.path.join(work_dir, 'temp_audio.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            # Find the downloaded file
            for name in sorted(os.listdir(work_dir)):
                if name.startswith('temp_audio') and name.endswith('.mp3'):
                    return os.path.join(work_dir, name)
        except Exception:
            # Best-effort boundary: callers only distinguish success from None.
            self._log.exception("YouTube audio download failed for %s", url)
        return None

    def extract_audio_from_video(self, video_path, work_dir=None):
        """Extract mono 16 kHz PCM audio from a video file using ffmpeg.

        Args:
            video_path: Path of the input video.
            work_dir: Directory for the WAV output. When omitted, a fresh
                temporary directory is created and the caller owns its cleanup.

        Returns:
            Path of the WAV file, or None when ffmpeg failed or is unavailable.
        """
        if work_dir is None:
            work_dir = tempfile.mkdtemp(prefix="subtitle_audio_")
        audio_path = os.path.join(work_dir, "temp_extracted_audio.wav")
        cmd = [
            # '-y' goes before the input: the conventional place for global
            # options, instead of trailing after the output file.
            'ffmpeg', '-y', '-i', video_path,
            '-acodec', 'pcm_s16le',
            '-ac', '1',
            '-ar', '16000',
            audio_path,
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except subprocess.CalledProcessError as exc:
            detail = (exc.stderr or b"").decode("utf-8", "replace").strip()
            self._log.error("ffmpeg failed (exit %s): %s", exc.returncode, detail)
            return None
        except OSError:
            # e.g. the ffmpeg executable is not installed
            self._log.exception("Could not run ffmpeg")
            return None
        return audio_path

    def transcribe_audio(self, audio_path):
        """Transcribe audio using Whisper and return its raw result."""
        return self.whisper_model.transcribe(audio_path)

    def _load_pair(self, model_name):
        """Return the cached (model, tokenizer) for model_name, loading on a miss."""
        if (model_name not in self.translation_models
                or model_name not in self.tokenizers):
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)
            # Store both only after both loaded, so the caches stay in step.
            self.tokenizers[model_name] = tokenizer
            self.translation_models[model_name] = model
        return self.translation_models[model_name], self.tokenizers[model_name]

    def get_translation_model(self, source_lang, target_lang="en"):
        """Load the translation model for a language pair.

        Tries "Helsinki-NLP/opus-mt-<source>-<target>" first and falls back
        to the multilingual-to-English model when that cannot be loaded. The
        fallback always targets English, whatever target_lang is.

        Returns:
            A (model, tokenizer) tuple.

        Raises:
            Exception: whatever the loader raises if the fallback fails too.
        """
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
        try:
            return self._load_pair(model_name)
        except Exception:
            self._log.warning(
                "Could not load %s; falling back to %s",
                model_name, self._FALLBACK_MODEL,
            )
            model, tokenizer = self._load_pair(self._FALLBACK_MODEL)
            # Alias the fallback under the requested name so later segments
            # do not repeat the failing load attempt.
            self.translation_models[model_name] = model
            self.tokenizers[model_name] = tokenizer
            return model, tokenizer

    def translate_text(self, text, source_lang, target_lang="en"):
        """Translate text using MarianMT.

        Returns the input unchanged when source and target language are the
        same, when the text is blank, or when translation fails.
        """
        if source_lang == target_lang or not text or not text.strip():
            return text
        try:
            model, tokenizer = self.get_translation_model(source_lang, target_lang)
            inputs = tokenizer.encode(
                text, return_tensors="pt", truncation=True, max_length=512
            )
            translated = model.generate(
                inputs, max_length=512, num_beams=4, early_stopping=True
            )
            return tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception:
            # Deliberate best-effort: keep the original text for this segment.
            self._log.exception("Translation failed; keeping original text")
            return text

    def format_timestamp(self, seconds):
        """Convert seconds to the SRT timestamp format HH:MM:SS,mmm.

        The value is rounded to the nearest millisecond before splitting, so
        float representation error cannot drop a millisecond (1.001 would
        otherwise come out as ",000"). Negative input is clamped to zero.
        """
        total_ms = max(0, int(round(seconds * 1000)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millisecs = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"

    def create_srt(self, segments, source_lang):
        """Create SRT subtitle content, translated to English.

        Args:
            segments: Iterable of dicts with 'start', 'end' (seconds) and 'text'.
            source_lang: Language code of the segment text.

        Returns:
            The SRT document as one string (empty for no segments).
        """
        entries = []
        for index, segment in enumerate(segments, 1):
            start_time = self.format_timestamp(segment['start'])
            end_time = self.format_timestamp(segment['end'])
            original_text = segment['text'].strip()
            translated_text = self.translate_text(original_text, source_lang, "en")
            entries.append(
                f"{index}\n{start_time} --> {end_time}\n{translated_text}\n\n"
            )
        return "".join(entries)

    def process_video(self, video_input, youtube_url):
        """Main processing function.

        A non-blank youtube_url takes precedence over video_input.

        Returns:
            A (status_message, srt_path) tuple; srt_path is None on any error.
        """
        try:
            url = youtube_url.strip() if youtube_url else ""
            if not url and not video_input:
                return "Please provide either a video file or YouTube URL", None

            # All intermediate audio lives in a private directory that is
            # removed on every exit path, including failures.
            with tempfile.TemporaryDirectory(prefix="subtitle_work_") as work_dir:
                if url:
                    audio_path = self.download_youtube_audio(url, work_dir)
                    if not audio_path:
                        return "Error: Could not download YouTube video", None
                else:
                    audio_path = self.extract_audio_from_video(video_input, work_dir)
                    if not audio_path:
                        return "Error: Could not extract audio from video", None
                # Transcribe audio
                result = self.transcribe_audio(audio_path)

            # Detect language
            detected_lang = result.get('language', 'unknown')
            # NOTE(review): Whisper seems to report short codes already
            # (e.g. 'es'), in which case this lookup is a pass-through - confirm.
            source_lang_code = self._LANGUAGE_CODES.get(detected_lang, detected_lang)

            # Create SRT content
            segments = result['segments']
            srt_content = self.create_srt(segments, source_lang_code)

            # Save the SRT in its own directory so simultaneous requests do
            # not overwrite each other's download; the file name is unchanged.
            srt_dir = tempfile.mkdtemp(prefix="subtitles_")
            srt_filename = os.path.join(srt_dir, "translated_subtitles.srt")
            with open(srt_filename, 'w', encoding='utf-8') as f:
                f.write(srt_content)

            # NOTE(review): the leading glyphs below look like mis-decoded
            # emoji; they are kept exactly as found - restore from the
            # original file if available.
            status_msg = f"β Processing complete!\n"
            status_msg += f"π Detected language: {detected_lang}\n"
            status_msg += f"π Generated {len(segments)} subtitle segments\n"
            status_msg += f"π Translated to English"
            return status_msg, srt_filename
        except Exception as e:
            # Top-level boundary for the UI: report the failure, never raise.
            self._log.exception("Processing failed")
            return f"Error during processing: {str(e)}", None
# Initialize the translator once at import time: the constructor loads the
# Whisper model, and the UI callback below reuses this single instance.
translator = SubtitleTranslator()
# Create Gradio interface
def process_video_interface(video_file, youtube_url, progress=gr.Progress()):
    """Gradio callback: run the translator and report coarse progress.

    Args:
        video_file: Path of the uploaded video, if any.
        youtube_url: Text of the URL box, if any.
        progress: Gradio progress tracker.

    Returns:
        The (status_message, srt_path) tuple from translator.process_video.
    """
    before_work = (
        (0.1, "Starting processing..."),
        (0.3, "Extracting audio..."),
    )
    after_work = (
        (0.7, "Transcribing and translating..."),
        (1.0, "Complete!"),
    )

    for fraction, label in before_work:
        progress(fraction, desc=label)

    # process_video blocks until everything is done, so the remaining
    # updates are only emitted once the work has already finished.
    outcome = translator.process_video(video_file, youtube_url)

    for fraction, label in after_work:
        progress(fraction, desc=label)
    return outcome
# Custom CSS for better UI; passed to gr.Blocks(css=...) below. The class
# names match the class attributes used in the gr.HTML fragments.
css = """
.gradio-container {
    max-width: 900px !important;
}
.title {
    text-align: center;
    color: #2563eb;
    font-size: 2.5rem;
    font-weight: bold;
    margin-bottom: 1rem;
}
.subtitle {
    text-align: center;
    color: #64748b;
    font-size: 1.2rem;
    margin-bottom: 2rem;
}
.feature-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1rem;
    border-radius: 10px;
    margin: 1rem 0;
}
"""
# Create the Gradio app
# NOTE(review): the nesting of the `with` blocks was reconstructed because the
# source indentation was lost - confirm the layout against the original.
# NOTE(review): the glyphs at the start of labels and HTML text look like
# mis-decoded emoji; they are kept exactly as found - restore from the
# original file if available.
with gr.Blocks(css=css, title="Video Subtitle Translator") as app:
    # Page header
    gr.HTML("""
    <div class="title">π¬ Video Subtitle Translator</div>
    <div class="subtitle">Generate English subtitles from any language video using AI</div>
    """)

    # Feature overview
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="feature-box">
                <h3>π Features:</h3>
                <ul>
                    <li>πΉ Upload video files or paste YouTube links</li>
                    <li>π― Automatic speech recognition with Whisper AI</li>
                    <li>π Auto-detect source language</li>
                    <li>π Generate accurate English subtitles</li>
                    <li>β±οΈ Perfect timing synchronization</li>
                    <li>πΎ Download ready-to-use SRT files</li>
                </ul>
            </div>
            """)

    # Inputs on the left, results on the right
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(
                label="π Upload Video File",
                file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm", ".m4v"],
                type="filepath"
            )
            youtube_input = gr.Textbox(
                label="π Or paste YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=1
            )
            process_btn = gr.Button(
                "π Generate Subtitles",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            status_output = gr.Textbox(
                label="π Processing Status",
                lines=6,
                interactive=False
            )
            srt_output = gr.File(
                label="πΎ Download SRT File",
                interactive=False
            )

    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; color: #64748b;">
        <p>β‘ Powered by Whisper AI & MarianMT | π€ Running on Hugging Face Spaces</p>
        <p>π‘ Tip: For best results, use videos with clear audio and minimal background noise</p>
    </div>
    """)

    # Connect the processing function: both inputs go to the callback and its
    # (status, file path) result fills the two output components.
    process_btn.click(
        fn=process_video_interface,
        inputs=[video_input, youtube_input],
        outputs=[status_output, srt_output],
        show_progress=True
    )
# Start the web server only when this file is run as a script, not on import.
if __name__ == "__main__":
    app.launch()