# app.py - Main Gradio application import gradio as gr import whisper import torch from transformers import MarianMTModel, MarianTokenizer import yt_dlp import os import tempfile import subprocess from pathlib import Path import re class SubtitleTranslator: def __init__(self): # Use the smallest Whisper model for speed self.whisper_model = whisper.load_model("tiny") # Translation model cache self.translation_models = {} self.tokenizers = {} def download_youtube_audio(self, url): """Download audio from YouTube video""" try: ydl_opts = { 'format': 'bestaudio/best', 'outtmpl': 'temp_audio.%(ext)s', 'postprocessors': [{ 'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192', }], } with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url]) # Find the downloaded file for file in os.listdir('.'): if file.startswith('temp_audio') and file.endswith('.mp3'): return file return None except Exception as e: return None def extract_audio_from_video(self, video_path): """Extract audio from uploaded video file""" try: audio_path = "temp_extracted_audio.wav" cmd = [ 'ffmpeg', '-i', video_path, '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000', audio_path, '-y' ] subprocess.run(cmd, check=True, capture_output=True) return audio_path except Exception as e: return None def transcribe_audio(self, audio_path): """Transcribe audio using Whisper""" result = self.whisper_model.transcribe(audio_path) return result def get_translation_model(self, source_lang, target_lang="en"): """Load translation model for language pair""" model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}" try: if model_name not in self.translation_models: self.tokenizers[model_name] = MarianTokenizer.from_pretrained(model_name) self.translation_models[model_name] = MarianMTModel.from_pretrained(model_name) return self.translation_models[model_name], self.tokenizers[model_name] except: # Fallback to multilingual model fallback_model = "Helsinki-NLP/opus-mt-mul-en" if fallback_model not in self.translation_models: self.tokenizers[fallback_model] = MarianTokenizer.from_pretrained(fallback_model) self.translation_models[fallback_model] = MarianMTModel.from_pretrained(fallback_model) return self.translation_models[fallback_model], self.tokenizers[fallback_model] def translate_text(self, text, source_lang, target_lang="en"): """Translate text using MarianMT""" if source_lang == target_lang: return text try: model, tokenizer = self.get_translation_model(source_lang, target_lang) inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512) translated = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True) return tokenizer.decode(translated[0], skip_special_tokens=True) except: return text # Return original if translation fails def format_timestamp(self, seconds): """Convert seconds to SRT timestamp format""" hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) millisecs = int((seconds % 1) * 1000) return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}" def create_srt(self, segments, source_lang): """Create SRT subtitle content""" srt_content = "" for i, segment in enumerate(segments, 1): start_time = self.format_timestamp(segment['start']) end_time = self.format_timestamp(segment['end']) original_text = segment['text'].strip() translated_text = self.translate_text(original_text, source_lang, "en") srt_content += f"{i}\n" srt_content += f"{start_time} --> {end_time}\n" srt_content += f"{translated_text}\n\n" return srt_content def process_video(self, video_input, youtube_url): """Main processing function""" try: # Determine input source if youtube_url and youtube_url.strip(): audio_path = self.download_youtube_audio(youtube_url.strip()) if not audio_path: return "Error: Could not download YouTube video", None elif video_input: audio_path = self.extract_audio_from_video(video_input) if not audio_path: return "Error: Could not extract audio from video", None else: return "Please provide either a video file or YouTube URL", None # Transcribe audio result = self.transcribe_audio(audio_path) # Detect language detected_lang = result.get('language', 'unknown') # Language code mapping for translation models lang_mapping = { 'spanish': 'es', 'french': 'fr', 'german': 'de', 'italian': 'it', 'portuguese': 'pt', 'russian': 'ru', 'chinese': 'zh', 'japanese': 'ja', 'korean': 'ko', 'arabic': 'ar', 'hindi': 'hi', 'dutch': 'nl', 'swedish': 'sv', 'norwegian': 'no', 'danish': 'da', 'finnish': 'fi' } source_lang_code = lang_mapping.get(detected_lang, detected_lang) # Create SRT content srt_content = self.create_srt(result['segments'], source_lang_code) # Save SRT file srt_filename = "translated_subtitles.srt" with open(srt_filename, 'w', encoding='utf-8') as f: f.write(srt_content) # Clean up temporary files if os.path.exists(audio_path): os.remove(audio_path) status_msg = f"✅ Processing complete!\n" status_msg += f"🔍 Detected language: {detected_lang}\n" status_msg += f"📝 Generated {len(result['segments'])} subtitle segments\n" status_msg += f"🌍 Translated to English" return status_msg, srt_filename except Exception as e: return f"Error during processing: {str(e)}", None # Initialize the translator translator = SubtitleTranslator() # Create Gradio interface def process_video_interface(video_file, youtube_url, progress=gr.Progress()): progress(0.1, desc="Starting processing...") progress(0.3, desc="Extracting audio...") result = translator.process_video(video_file, youtube_url) progress(0.7, desc="Transcribing and translating...") progress(1.0, desc="Complete!") return result # Custom CSS for better UI css = """ .gradio-container { max-width: 900px !important; } .title { text-align: center; color: #2563eb; font-size: 2.5rem; font-weight: bold; margin-bottom: 1rem; } .subtitle { text-align: center; color: #64748b; font-size: 1.2rem; margin-bottom: 2rem; } .feature-box { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; margin: 1rem 0; } """ # Create the Gradio app with gr.Blocks(css=css, title="Video Subtitle Translator") as app: gr.HTML("""
⚡ Powered by Whisper AI & MarianMT | 🤗 Running on Hugging Face Spaces
💡 Tip: For best results, use videos with clear audio and minimal background noise