I'll create an Audio-Driven Video Generation application that supports both single-person and multi-person conversational video generation, using a diffusion-based video model with audio-driven conditioning for lip synchronization.
```python
# app.py
import gradio as gr
import numpy as np
import spaces
import torch
from diffusers import DiffusionPipeline, DDIMScheduler
from diffusers.utils import export_to_video
import time
from typing import List, Tuple, Optional
import tempfile
import os
from PIL import Image

# Configuration
MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
MAX_DURATION = 120  # 2 minutes in seconds
AUDIO_SAMPLE_RATE = 16000
class VideoGenerator:
    def __init__(self):
        self.pipe = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._load_model()

    @spaces.GPU(duration=1500)
    def _load_model(self):
        """Load and compile the video generation model with AoT optimization"""
        print("Loading video generation model...")
        self.pipe = DiffusionPipeline.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            variant="fp16"
        )
        self.pipe.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
        self.pipe = self.pipe.to(self.device)

        # AoT Compilation for performance boost
        print("Applying AoT compilation...")
        with spaces.aoti_capture(self.pipe.unet) as call:
            # Create dummy inputs for compilation
            dummy_prompt = "person talking"
            dummy_image = Image.new('RGB', (512, 512), color='white')
            self.pipe(
                prompt=dummy_prompt,
                image=dummy_image,
                num_inference_steps=1,
                height=512,
                width=512,
                num_frames=4
            )

        # Export and compile the UNet
        exported = torch.export.export(
            self.pipe.unet,
            args=call.args,
            kwargs=call.kwargs,
        )
        compiled_unet = spaces.aoti_compile(exported)

        # Apply compiled model back to pipeline
        spaces.aoti_apply(compiled_unet, self.pipe.unet)
        print("Model loaded and compiled successfully!")

    def generate_video_segment(
        self,
        prompt: str,
        reference_image: Optional[np.ndarray],
        audio_features: dict,
        duration: int,
        fps: int = 24
    ) -> List[np.ndarray]:
        """Generate a video segment with audio-driven animation"""
        if self.pipe is None:
            raise gr.Error("Model not loaded. Please wait...")

        num_frames = int(duration * fps)

        # Prepare initial frame from reference image or create default
        if reference_image is not None:
            initial_frame = Image.fromarray(reference_image)
        else:
            initial_frame = Image.new('RGB', (512, 512), color='white')

        # Generate video frames with audio conditioning
        print(f"Generating {duration}s video with {num_frames} frames...")
        frames = []
        for i in range(0, num_frames, 8):  # Generate in chunks of 8 frames
            chunk_frames = min(8, num_frames - i)

            # Audio-driven conditioning (simplified - in production use actual audio features)
            audio_conditioning = {
                "tempo": audio_features.get("tempo", 120),
                "energy": audio_features.get("energy", 0.5),
                "pitch": audio_features.get("pitch", 0.5)
            }

            # Generate frames with diffusion pipeline
            output = self.pipe(
                prompt=f"{prompt}, {audio_conditioning['tempo']} BPM tempo, realistic face, lip sync",
                image=initial_frame,
                num_inference_steps=25,
                height=512,
                width=512,
                num_frames=chunk_frames,
                guidance_scale=7.5,
                generator=torch.Generator().manual_seed(42 + i)
            )

            # Extract frames
            for j in range(chunk_frames):
                frame = output.frames[0][j]
                frame_array = np.array(frame)
                frames.append(frame_array)

        return frames
# Initialize global generator
generator = VideoGenerator()


def extract_audio_features(audio_data: Tuple[int, np.ndarray]) -> dict:
    """Extract basic features from audio for conditioning"""
    sample_rate, audio = audio_data
    if audio.size == 0:
        return {"tempo": 120, "energy": 0.5, "pitch": 0.5}

    # Gradio delivers int16 PCM (possibly stereo); convert to mono float in [-1, 1]
    audio = audio.astype(np.float32)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if np.abs(audio).max() > 1.0:
        audio = audio / 32768.0

    # Calculate energy (RMS)
    energy = np.sqrt(np.mean(audio**2))
    energy_normalized = min(1.0, energy / 0.1)  # Normalize

    # Estimate tempo from the zero-crossing rate (simplified)
    zero_crossings = np.where(np.diff(np.sign(audio)))[0]
    estimated_freq = len(zero_crossings) / (len(audio) / sample_rate) * 60  # BPM
    tempo = np.clip(estimated_freq, 60, 200)

    # Simple spectral centroid for pitch estimation
    fft = np.fft.fft(audio)
    magnitude = np.abs(fft[:len(fft)//2])
    freqs = np.fft.fftfreq(len(fft), 1/sample_rate)[:len(fft)//2]
    spectral_centroid = np.sum(freqs * magnitude) / (np.sum(magnitude) + 1e-10)
    pitch_normalized = min(1.0, spectral_centroid / 2000)

    return {
        "tempo": tempo,
        "energy": energy_normalized,
        "pitch": pitch_normalized
    }
@spaces.GPU(duration=180)
def generate_conversational_video(
    audio_1: Tuple[int, np.ndarray],
    prompt_1: str,
    audio_2: Optional[Tuple[int, np.ndarray]] = None,
    prompt_2: Optional[str] = None,
    reference_image_1: Optional[np.ndarray] = None,
    reference_image_2: Optional[np.ndarray] = None,
    duration: int = 30,
    mode: str = "single",
    fps: int = 24,
    progress=gr.Progress()
) -> str:
    """Generate conversational video from audio inputs"""
    try:
        progress(0.1, desc="Processing audio inputs...")

        # Extract features from audio(s); fall back to neutral defaults when no audio is provided
        default_features = {"tempo": 120, "energy": 0.5, "pitch": 0.5}
        audio_features_1 = extract_audio_features(audio_1) if audio_1 is not None else default_features
        audio_features_2 = extract_audio_features(audio_2) if audio_2 is not None else None

        progress(0.2, desc="Initializing video generation...")

        # Generate video segments based on mode
        if mode == "single":
            progress(0.3, desc="Generating single-person video...")
            frames = generator.generate_video_segment(
                prompt=prompt_1,
                reference_image=reference_image_1,
                audio_features=audio_features_1,
                duration=duration,
                fps=fps
            )
        else:  # multi-person conversation
            progress(0.25, desc="Generating person 1 video...")
            frames_1 = generator.generate_video_segment(
                prompt=f"Person 1: {prompt_1}",
                reference_image=reference_image_1,
                audio_features=audio_features_1,
                duration=duration // 2,
                fps=fps
            )

            progress(0.5, desc="Generating person 2 video...")
            frames_2 = generator.generate_video_segment(
                prompt=f"Person 2: {prompt_2 or 'Responding'}",
                reference_image=reference_image_2,
                audio_features=audio_features_2 or default_features,
                duration=duration // 2,
                fps=fps
            )

            progress(0.7, desc="Combining conversation...")
            # Interleave frames for conversation effect
            frames = []
            for i in range(min(len(frames_1), len(frames_2))):
                frames.extend([frames_1[i], frames_2[i]])

        progress(0.9, desc="Rendering video...")
        # Create temporary file for video
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
            video_path = tmp_file.name

        # Export frames to video
        export_to_video(frames, video_path, fps=fps)

        progress(1.0, desc="Video generation complete!")
        return video_path

    except Exception as e:
        raise gr.Error(f"Video generation failed: {str(e)}")
def create_reference_image_from_prompt(prompt: str, seed: int = 42) -> np.ndarray:
    """Create a reference image from text prompt"""
    @spaces.GPU(duration=30)
    def generate_image():
        # Use a simple image generation for reference
        from diffusers import StableDiffusionPipeline
        img_pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            torch_dtype=torch.float16
        ).to("cuda")
        image = img_pipe(
            prompt=f"portrait of {prompt}, photorealistic, neutral expression",
            num_inference_steps=20,
            guidance_scale=7.5,
            generator=torch.Generator().manual_seed(seed)
        ).images[0]
        return np.array(image)

    return generate_image()
# Gradio Interface
# Note: the page description is rendered in the header HTML below (gr.Blocks has no description parameter)
with gr.Blocks(
    title="Audio-Driven Conversational Video Generator",
    theme=gr.themes.Soft(),
    css="""
    .header { text-align: center; margin-bottom: 2rem; }
    .mode-toggle { margin: 1rem 0; }
    .person-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 1rem; margin: 1rem 0; }
    .warning { background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; }
    .success { background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; }
    """
) as demo:
    gr.HTML("""
    <div class="header">
        <h1>🎬 Audio-Driven Conversational Video Generator</h1>
        <p>Generate realistic talking videos from audio with support for single and multi-person conversations, up to 2 minutes long</p>
        <p><strong>Built with anycoder</strong> - <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Advanced AI Video Generation</a></p>
    </div>
    """)
    with gr.Row():
        mode = gr.Radio(
            choices=["single", "multi-person"],
            value="single",
            label="Generation Mode",
            info="Choose between single person or conversational video"
        )
        duration = gr.Slider(
            minimum=5,
            maximum=MAX_DURATION,
            value=30,
            step=5,
            label="Duration (seconds)",
            info="Video length up to 2 minutes"
        )
        fps = gr.Slider(
            minimum=12,
            maximum=30,
            value=24,
            step=1,
            label="FPS",
            info="Frames per second for output video"
        )

    # Person 1 inputs
    with gr.Group(elem_classes="person-section"):
        gr.Markdown("### 🎤 Person 1")
        with gr.Row():
            audio_1 = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Audio Input 1",
                info="Upload audio file or record directly"
            )
            ref_img_1 = gr.Image(
                sources=["upload"],
                type="numpy",
                label="Reference Image 1 (Optional)",
                info="Upload a reference image for the first person"
            )
        prompt_1 = gr.Textbox(
            label="Prompt for Person 1",
            placeholder="Describe the first person (e.g., 'young woman, professional attire')",
            value="friendly person speaking naturally"
        )
        with gr.Row():
            generate_ref_1 = gr.Button("Generate Reference Image 1", size="sm")
            use_placeholder_1 = gr.Button("Use Default Avatar 1", size="sm")

    # Person 2 inputs (for multi-person mode)
    with gr.Group(elem_classes="person-section", visible=False) as person_2_section:
        gr.Markdown("### 👥 Person 2")
        with gr.Row():
            audio_2 = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Audio Input 2",
                info="Upload or record second person's audio"
            )
            ref_img_2 = gr.Image(
                sources=["upload"],
                type="numpy",
                label="Reference Image 2 (Optional)",
                info="Upload a reference image for the second person"
            )
        prompt_2 = gr.Textbox(
            label="Prompt for Person 2",
            placeholder="Describe the second person",
            value="friendly person responding"
        )
        with gr.Row():
            generate_ref_2 = gr.Button("Generate Reference Image 2", size="sm")
            use_placeholder_2 = gr.Button("Use Default Avatar 2", size="sm")
    # Generation controls
    with gr.Row():
        generate_btn = gr.Button(
            "🎥 Generate Video",
            variant="primary",
            size="lg"
        )
        stop_btn = gr.Button("⏹ Stop Generation", variant="stop", size="lg", visible=False)

    # Output
    video_output = gr.Video(
        label="Generated Conversational Video",
        autoplay=True,
        show_label=True,
        show_share_button=True,
        show_download_button=True
    )

    # Status and info
    status_info = gr.HTML(
        value='<div class="info">🔧 Model loading... This may take a few minutes for initial setup.</div>',
        label="Status"
    )

    # Example gallery
    gr.Examples(
        examples=[
            [
                "single",
                30,
                24,
                None,  # Will use default audio
                "professional presenter in business attire",
                None,
                None
            ],
            [
                "multi-person",
                60,
                24,
                None,
                "casual young woman",
                None,
                "casual young man"
            ]
        ],
        inputs=[mode, duration, fps, audio_1, prompt_1, audio_2, prompt_2],
        cache_examples=False
    )
    # Event handlers
    def toggle_mode(selected_mode):
        """Show/hide person 2 section based on mode"""
        if selected_mode == "multi-person":
            return gr.update(visible=True), gr.update(value="🎥 Generate Conversation")
        else:
            return gr.update(visible=False), gr.update(value="🎥 Generate Video")

    mode.change(
        toggle_mode,
        inputs=[mode],
        outputs=[person_2_section, generate_btn]
    )

    # Generate reference images
    generate_ref_1.click(
        create_reference_image_from_prompt,
        inputs=[prompt_1],
        outputs=[ref_img_1]
    ).then(
        lambda: gr.update(value='<div class="success">✅ Reference image generated for Person 1</div>'),
        outputs=[status_info]
    )

    generate_ref_2.click(
        create_reference_image_from_prompt,
        inputs=[prompt_2],
        outputs=[ref_img_2]
    ).then(
        lambda: gr.update(value='<div class="success">✅ Reference image generated for Person 2</div>'),
        outputs=[status_info]
    )

    # Use default avatars
    def create_default_avatar(person_id: int):
        """Create a simple default avatar"""
        color_map = {1: "#FFE4E1", 2: "#E1F4FF"}
        avatar = Image.new('RGB', (256, 256), color=color_map.get(person_id, "#FFFFFF"))

        # Add simple face features
        from PIL import ImageDraw
        draw = ImageDraw.Draw(avatar)
        # Simple face outline
        draw.ellipse([50, 50, 206, 206], outline="#000000", width=3)
        # Eyes
        draw.ellipse([80, 90, 110, 120], fill="#000000")
        draw.ellipse([146, 90, 176, 120], fill="#000000")
        # Smile
        draw.arc([100, 130, 156, 160], 0, 180, fill="#000000", width=2)
        return np.array(avatar)

    use_placeholder_1.click(
        lambda: create_default_avatar(1),
        outputs=[ref_img_1]
    )
    use_placeholder_2.click(
        lambda: create_default_avatar(2),
        outputs=[ref_img_2]
    )
    # Main generation function
    def start_generation(*args):
        """Start video generation with loading indicator"""
        return (
            gr.update(visible=False),  # Hide generate button
            gr.update(visible=True),   # Show stop button
            gr.update(value='<div class="warning">⚙️ Generating video... This may take several minutes depending on duration.</div>'),
            None  # Clear previous video
        )

    def stop_generation():
        """Handle stop generation"""
        return (
            gr.update(visible=True),   # Show generate button
            gr.update(visible=False),  # Hide stop button
            gr.update(value='<div class="info">🔧 Generation stopped. Ready for new video.</div>')
        )

    generate_btn.click(
        start_generation,
        outputs=[generate_btn, stop_btn, status_info, video_output]
    ).then(
        generate_conversational_video,
        inputs=[
            audio_1, prompt_1, audio_2, prompt_2,
            ref_img_1, ref_img_2, duration, mode, fps
        ],
        outputs=[video_output]
    ).then(
        lambda: gr.update(value='<div class="success">✅ Video generation complete! You can now download your video.</div>'),
        outputs=[status_info]
    ).then(
        lambda: [gr.update(visible=True), gr.update(visible=False)],
        outputs=[generate_btn, stop_btn]
    )

    stop_btn.click(
        stop_generation,
        outputs=[generate_btn, stop_btn, status_info]
    )

    # Update status on model load
    demo.load(
        lambda: gr.update(value='<div class="success">✅ Model loaded successfully! Ready to generate videos.</div>'),
        outputs=[status_info]
    )

if __name__ == "__main__":
    demo.queue(max_size=10, default_concurrency_limit=1)
    demo.launch(share=True)
```
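The `extract_audio_features` helper above is deliberately crude: it estimates tempo from zero crossings and "pitch" from an FFT centroid, as its own comment admits. If you want sturdier conditioning signals, here is a minimal drop-in sketch using `librosa` (an assumption on my part: `librosa` is not in the generated `requirements.txt` and would need to be added); it keeps the same function name and return keys the rest of the app expects:

```python
# audio_features.py -- optional drop-in for the simplified helper, assuming librosa is installed
import numpy as np
import librosa


def extract_audio_features(audio_data):
    """Estimate tempo, energy, and a pitch proxy; same keys the app consumes."""
    sample_rate, audio = audio_data
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:               # stereo -> mono
        audio = audio.mean(axis=1)
    if np.abs(audio).max() > 1.0:    # int16 PCM -> [-1, 1]
        audio /= 32768.0

    # Beat tracking gives a steadier tempo estimate than zero crossings
    tempo, _ = librosa.beat.beat_track(y=audio, sr=sample_rate)
    tempo = float(np.clip(np.atleast_1d(tempo)[0], 60, 200))

    # RMS energy, roughly normalized to the 0-1 range the app expects
    energy = min(1.0, float(librosa.feature.rms(y=audio).mean()) / 0.1)

    # Spectral centroid as a crude pitch/brightness proxy, normalized to 0-1
    centroid = float(librosa.feature.spectral_centroid(y=audio, sr=sample_rate).mean())
    pitch = min(1.0, centroid / 2000.0)

    return {"tempo": tempo, "energy": energy, "pitch": pitch}
```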
```python
# requirements.txt
gradio
torch
diffusers
transformers
accelerate
numpy
pillow
opencv-python
spaces
torchvision
```
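One caveat worth noting: `export_to_video` writes image frames only, so the clip returned by `generate_conversational_video` is silent. A possible post-processing step is to mux the uploaded audio back onto the video before returning the path. The sketch below is hypothetical and not part of the generated app; it assumes an `ffmpeg` binary is available on the host and adds `soundfile` as an extra dependency:

```python
# mux_audio.py -- hypothetical post-processing helper, not part of the generated app
import subprocess
import tempfile
from typing import Optional, Tuple

import numpy as np
import soundfile as sf  # assumed extra dependency for writing the WAV


def mux_audio(video_path: str, audio_data: Tuple[int, np.ndarray],
              out_path: Optional[str] = None) -> str:
    """Mux a Gradio-style (sample_rate, samples) audio tuple onto a silent video."""
    sample_rate, audio = audio_data
    out_path = out_path or video_path.replace(".mp4", "_with_audio.mp4")

    # Write the raw samples to a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    sf.write(wav_path, np.asarray(audio), sample_rate)

    # Copy the video stream, encode the audio as AAC, stop at the shorter stream
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-i", wav_path,
         "-c:v", "copy", "-c:a", "aac", "-shortest", out_path],
        check=True,
    )
    return out_path
```

If used, this would slot into `generate_conversational_video` right after the `export_to_video` call, e.g. `video_path = mux_audio(video_path, audio_1)`.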