"""Gradio app: pull the audio track out of a video and transcribe it.

Audio is extracted with moviepy, sent to the ElevenLabs speech-to-text
endpoint, and the returned text is written to a ``.txt`` file that the UI
offers for download.
"""

import contextlib
import json
import mimetypes
import os
import time
import uuid
from urllib.parse import urlparse

import gradio as gr
import requests
from moviepy import VideoFileClip

ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY", None)

ELEVENLABS_STT_URL = "https://api.elevenlabs.io/v1/speech-to-text"

# (connect, read) timeouts in seconds. Transcribing long audio can take a
# while, so the read timeout is generous, but a hung connection must not
# block a worker forever.
API_TIMEOUT = (10, 600)
DOWNLOAD_TIMEOUT = (10, 120)


def _remove_quietly(path):
    """Delete *path* if it exists; cleanup is best-effort and never raises."""
    if not path:
        return
    with contextlib.suppress(OSError):
        os.remove(path)


def extract_audio(video_path, output_format="mp3"):
    """Write the audio track of *video_path* to a new uniquely named file.

    Args:
        video_path: Path of the video file to read.
        output_format: File extension of the audio file to produce.

    Returns:
        ``(output_path, status_message)``; ``output_path`` is ``None`` when
        extraction failed.
    """
    if not video_path:
        return None, "No video provided"

    output_path = f"extracted_audio_{uuid.uuid4().hex[:8]}.{output_format}"
    try:
        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                return None, "Error extracting audio: video has no audio track"
            video.audio.write_audiofile(output_path, logger=None)
        finally:
            # Release the reader even when writing fails.
            video.close()
    except Exception as e:
        # Do not leave a partially written file behind.
        _remove_quietly(output_path)
        return None, f"Error extracting audio: {str(e)}"
    return output_path, "Audio extracted successfully"


def save_transcription(transcription):
    """Write the transcription text to a uniquely named ``.txt`` file.

    Args:
        transcription: Result dict from :func:`transcribe_audio`; either it
            has an ``"error"`` key or a ``"text"`` key.

    Returns:
        ``(transcript_filename, status_message)``; the filename is ``None``
        when the transcription failed or the file could not be written.
    """
    if "error" in transcription:
        return None, transcription["error"]

    transcript_filename = f"transcription_{uuid.uuid4().hex[:8]}.txt"
    try:
        with open(transcript_filename, "w", encoding="utf-8") as f:
            f.write(transcription.get("text", "No text found"))
    except Exception as e:
        return None, f"Error saving transcription: {str(e)}"
    return transcript_filename, "Transcription saved as text file"


def _transcribe_and_save(audio_path, message, api_key, model_id):
    """Transcribe an extracted audio file and build the 4-tuple the UI expects."""
    if not (audio_path and os.path.exists(audio_path)):
        return None, message, None, "Audio extraction failed, cannot transcribe"

    transcription = transcribe_audio(audio_path, api_key, model_id)
    transcript_file, transcript_message = save_transcription(transcription)
    return audio_path, message, transcript_file, transcript_message


def process_video_file(video_file, output_format, api_key, model_id):
    """Handle an uploaded video: extract its audio, then transcribe it.

    Returns:
        ``(audio_path, audio_status, transcript_file, transcript_status)``.
    """
    if video_file is None:
        return None, "Please upload a video file", None, "No video provided"

    audio_path, message = extract_audio(video_file, output_format)
    return _transcribe_and_save(audio_path, message, api_key, model_id)


def download_video_from_url(video_url):
    """Download a directly linked video file to a local temporary path.

    Only plain http(s) links that point at a media file are handled; pages
    that merely embed a player are not resolved.

    Returns:
        ``(video_path, error)``; exactly one of the two is ``None``.
    """
    url = (video_url or "").strip()
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return None, "Invalid video URL: only http(s) links are supported"

    # Keep the extension so the decoder can sniff the container format, but
    # only when it looks like a normal short extension.
    suffix = os.path.splitext(parsed.path)[1]
    if not (1 < len(suffix) <= 6 and suffix[1:].isalnum()):
        suffix = ".mp4"
    video_path = f"downloaded_video_{uuid.uuid4().hex[:8]}{suffix}"

    try:
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()
            with open(video_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
    except (requests.exceptions.RequestException, OSError) as e:
        _remove_quietly(video_path)
        return None, f"Error downloading video: {str(e)}"
    return video_path, None


def process_video_url(video_url, output_format, api_key, model_id):
    """Handle a video URL: download it, extract its audio, then transcribe it.

    Returns:
        ``(audio_path, audio_status, transcript_file, transcript_status)``.
    """
    if not video_url or not video_url.strip():
        return None, "Please enter a video URL", None, "No URL provided"

    video_path, error = download_video_from_url(video_url)
    if error:
        return None, error, None, "Video download failed, cannot transcribe"

    try:
        audio_path, message = extract_audio(video_path, output_format)
    finally:
        # The downloaded video is only an intermediate artifact.
        _remove_quietly(video_path)

    return _transcribe_and_save(audio_path, message, api_key, model_id)


def _audio_duration_seconds(audio_path):
    """Return the audio duration in seconds, or 0 when it cannot be determined.

    soundfile and librosa are optional; each is imported lazily and any
    failure (missing package, unsupported codec) falls through to the next
    option.
    """
    try:
        import soundfile as sf

        # Header-only probe: avoids loading every sample into memory.
        info = sf.info(audio_path)
        return info.frames / info.samplerate
    except Exception:
        pass

    try:
        import librosa

        try:
            return librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the ``filename`` keyword.
            return librosa.get_duration(filename=audio_path)
    except Exception:
        return 0


def transcribe_audio(audio_path, api_key, model_id="scribe_v1"):
    """Send an audio file to the ElevenLabs speech-to-text endpoint.

    Args:
        audio_path: Path of the audio file to upload.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        model_id: Transcription model identifier.

    Returns:
        On failure a dict with an ``"error"`` key (plus ``"response_text"``
        for non-200 responses). On success a dict with the transcribed
        ``"text"``, timing and size statistics, and the ``"raw_response"``.
    """
    if not api_key:
        return {"error": "Please provide an API key"}

    # Do NOT set Content-Type here: requests must generate the multipart
    # header itself so that it carries the boundary parameter.
    headers = {"xi-api-key": api_key}
    mime_type = mimetypes.guess_type(audio_path)[0] or "audio/mpeg"

    start_time = time.perf_counter()
    try:
        with open(audio_path, "rb") as f:
            response = requests.post(
                ELEVENLABS_STT_URL,
                headers=headers,
                files={"file": (os.path.basename(audio_path), f, mime_type)},
                data={"model_id": model_id},
                timeout=API_TIMEOUT,
            )
    except requests.exceptions.RequestException as e:
        return {"error": f"API request failed: {str(e)}"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}
    processing_time = time.perf_counter() - start_time

    if response.status_code != 200:
        return {
            "error": f"API request failed with status {response.status_code}",
            "response_text": response.text,
        }

    # Parsed separately from the request: requests' JSON error is also a
    # RequestException, which would otherwise mask this specific message.
    try:
        result = response.json()
    except ValueError:
        return {"error": "Failed to parse API response"}
    if not isinstance(result, dict):
        return {"error": "Failed to parse API response"}

    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    audio_duration = _audio_duration_seconds(audio_path)

    return {
        "service": "ElevenLabs Scribe",
        "text": result.get("text", ""),
        "processing_time": processing_time,
        "file_size_mb": round(file_size, 2),
        "audio_duration": round(audio_duration, 2),
        "real_time_factor": (
            round(processing_time / audio_duration, 2)
            if audio_duration > 0
            else None
        ),
        "processing_speed": (
            round(audio_duration / processing_time, 2)
            if processing_time > 0
            else None
        ),
        "raw_response": result,
    }


# UI definition: one tab that uploads a video and shows the extracted audio
# together with the transcription file.
with gr.Blocks(title="Video to Audio to Transcription") as app:
    gr.Markdown("# Video => Audio => Transcription")

    # NOTE(review): pre-filling the field with the server-side key sends that
    # key to every browser that opens the page; confirm this is intended.
    api_key = gr.Textbox(
        placeholder="Enter your ElevenLabs API key",
        label="ElevenLabs API Key",
        type="password",
        value=ELEVENLABS_API_KEY,
    )

    model_id = gr.Dropdown(
        choices=["scribe_v1"],
        value="scribe_v1",
        label="Transcription Model",
    )

    with gr.Tabs():
        with gr.TabItem("Upload Video"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload Video")
                    format_choice_file = gr.Radio(
                        ["mp3", "wav"], value="mp3", label="Output Format"
                    )
                    extract_button_file = gr.Button("Extract Audio & Transcribe")

                with gr.Column():
                    audio_output_file = gr.Audio(
                        label="Extracted Audio", type="filepath"
                    )
                    status_output_file = gr.Textbox(
                        label="Audio Extraction Status"
                    )
                    transcript_file_output = gr.File(
                        label="Transcription Text File"
                    )
                    transcript_status_output = gr.Textbox(
                        label="Transcription Status"
                    )

            extract_button_file.click(
                fn=process_video_file,
                inputs=[video_input, format_choice_file, api_key, model_id],
                outputs=[
                    audio_output_file,
                    status_output_file,
                    transcript_file_output,
                    transcript_status_output,
                ],
            )

if __name__ == "__main__":
    app.launch()