Spaces:

Nymbo
/

Serverless-VideoGen-Hub

Running

File size: 17,196 Bytes

26ed6a6

import gradio as gr
from huggingface_hub import InferenceClient
import os
import json
import base64
from PIL import Image
import io
import time
import tempfile
import uuid

# Access token from environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

def generate_video(
    prompt,
    negative_prompt,
    num_frames,
    fps,
    width,
    height,
    num_inference_steps,
    guidance_scale,
    motion_bucket_id,
    seed,
    provider,
    custom_api_key,  
    custom_model,    
    model_search_term,
    selected_model
):
    """Generate a video based on the provided parameters"""
    print(f"Received prompt: {prompt}")
    print(f"Negative prompt: {negative_prompt}")
    print(f"Num frames: {num_frames}, FPS: {fps}")
    print(f"Width: {width}, Height: {height}")
    print(f"Steps: {num_inference_steps}, Guidance Scale: {guidance_scale}")
    print(f"Motion Bucket ID: {motion_bucket_id}, Seed: {seed}")
    print(f"Selected provider: {provider}")         
    print(f"Custom API Key provided: {bool(custom_api_key.strip())}")
    print(f"Selected model (custom_model): {custom_model}")  
    print(f"Model search term: {model_search_term}")
    print(f"Selected model from radio: {selected_model}")

    # Determine which token to use - custom API key if provided, otherwise the ACCESS_TOKEN
    token_to_use = custom_api_key if custom_api_key.strip() != "" else ACCESS_TOKEN
    
    # Log which token source we're using (without printing the actual token)
    if custom_api_key.strip() != "":
        print("USING CUSTOM API KEY: BYOK token provided by user is being used for authentication")
    else:
        print("USING DEFAULT API KEY: Environment variable HF_TOKEN is being used for authentication")
    
    # Initialize the Inference Client with the provider and appropriate token
    client = InferenceClient(token=token_to_use, provider=provider)
    print(f"Hugging Face Inference Client initialized with {provider} provider.")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None
    else:
        # Ensure seed is an integer
        seed = int(seed)

    # Determine which model to use, prioritizing custom_model if provided
    model_to_use = custom_model.strip() if custom_model.strip() != "" else selected_model
    print(f"Model selected for inference: {model_to_use}")

    # Create a unique ID for this generation
    generation_id = uuid.uuid4().hex[:8]
    print(f"Generation ID: {generation_id}")

    # Prepare parameters for the video generation request
    # Note: Different providers may have different parameter requirements
    parameters = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "num_frames": num_frames,
        "fps": fps,
        "width": width,
        "height": height,
        "num_inference_steps": num_inference_steps,
        "guidance_scale": guidance_scale,
    }
    
    # Add motion_bucket_id if applicable (depends on the model)
    if motion_bucket_id is not None:
        parameters["motion_bucket_id"] = motion_bucket_id
    
    # Add seed if specified
    if seed is not None:
        parameters["seed"] = seed

    # For FalAI provider - may need specific formatting
    if provider == "fal-ai":
        print("Using FalAI provider, adapting parameters...")
        # FalAI might use different parameter formats or additional settings
        parameters = {
            "prompt": prompt,
            "negative_prompt": negative_prompt,
            "num_frames": num_frames,
            "seed": seed if seed is not None else -1,
            "width": width,
            "height": height,
            "num_inference_steps": num_inference_steps,
            "guidance_scale": guidance_scale,
        }
    
    # For Novita provider - may need specific formatting
    if provider == "novita":
        print("Using Novita provider, adapting parameters...")
        # Based on documentation, Novita uses text_to_video method
        try:
            # For Novita, we use a different method from the InferenceClient
            video_data = client.text_to_video(
                prompt=prompt,
                model=model_to_use,
                negative_prompt=negative_prompt,
                num_frames=num_frames,
                fps=fps,
                width=width,
                height=height,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                seed=seed
            )
            
            # Save the video to a temporary file
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
            temp_file.write(video_data)
            video_path = temp_file.name
            temp_file.close()
            
            print(f"Video saved to temporary file: {video_path}")
            return video_path
            
        except Exception as e:
            print(f"Error during Novita video generation: {e}")
            return f"Error: {str(e)}"
    
    # For Replicate provider - may need specific formatting
    if provider == "replicate":
        print("Using Replicate provider, adapting parameters...")
        # Replicate might use different parameter formats
        try:
            # For Replicate, we use their specific method structure
            response = client.post(
                model=model_to_use,
                input={
                    "prompt": prompt,
                    "negative_prompt": negative_prompt,
                    "num_frames": num_frames,
                    "fps": fps,
                    "width": width,
                    "height": height,
                    "num_inference_steps": num_inference_steps,
                    "guidance_scale": guidance_scale,
                    "seed": seed if seed is not None else 0,
                },
            )
            
            # Replicate typically returns a URL to the generated video
            if isinstance(response, dict) and "output" in response:
                video_url = response["output"]
                print(f"Video generated, URL: {video_url}")
                return video_url
            else:
                return str(response)
                
        except Exception as e:
            print(f"Error during Replicate video generation: {e}")
            return f"Error: {str(e)}"
    
    # General approach for other providers
    try:
        print(f"Sending request to {provider} provider with model {model_to_use}.")
        print(f"Parameters: {parameters}")
        
        # Use the text_to_video method of the InferenceClient
        video_data = client.text_to_video(
            prompt=prompt,
            model=model_to_use,
            **parameters
        )
        
        # Save the video to a temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        temp_file.write(video_data)
        video_path = temp_file.name
        temp_file.close()
        
        print(f"Video saved to temporary file: {video_path}")
        return video_path
        
    except Exception as e:
        print(f"Error during video generation: {e}")
        return f"Error: {str(e)}"

# Function to validate provider selection based on BYOK
def validate_provider(api_key, provider):
    # If no custom API key is provided, only "hf-inference" can be used
    if not api_key.strip() and provider != "hf-inference":
        return gr.update(value="hf-inference")
    return gr.update(value=provider)

# Define the GRADIO UI
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Set a title for the application
    gr.Markdown("# 🎬 Serverless-VideoGen-Hub")
    gr.Markdown("Generate videos using Hugging Face Serverless Inference")
    
    with gr.Row():
        with gr.Column(scale=2):
            # Main video output area
            video_output = gr.Video(label="Generated Video", height=400)
            
            # Basic input components
            prompt_box = gr.Textbox(
                value="A beautiful sunset over a calm ocean", 
                placeholder="Enter a prompt for your video", 
                label="Prompt",
                lines=3
            )
            
            # Generate button
            generate_button = gr.Button("🎬 Generate Video", variant="primary")
            
        with gr.Column(scale=1):
            # Model selection components
            model_search_box = gr.Textbox(
                label="Filter Models",
                placeholder="Search for a model...",
                lines=1
            )
            
            models_list = [
                "stabilityai/stable-video-diffusion-img2vid-xt",
                "stabilityai/stable-video-diffusion-img2vid",
                "damo-vilab/text-to-video-ms-1.7b",
                "tencent/HunyuanVideo",
                "Wan-AI/Wan2.1-T2V-14B",
                "PixArt-alpha/PixArt-sigma-vid",
                "strangerbytesxyz/motion-animator-diffusion-video"
            ]
            
            featured_model_radio = gr.Radio(
                label="Select a model below",
                choices=models_list,
                value="stabilityai/stable-video-diffusion-img2vid",
                interactive=True
            )
            
            custom_model_box = gr.Textbox(
                value="",
                label="Custom Model",
                info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
                placeholder="damo-vilab/text-to-video-ms-1.7b"
            )
    
    # Advanced settings in an accordion
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            with gr.Column():
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    placeholder="What should NOT be in the video",
                    value="poor quality, distortion, blurry, low resolution, grainy",
                    lines=2
                )
                
                with gr.Row():
                    width = gr.Slider(
                        minimum=256,
                        maximum=1024,
                        value=512,
                        step=64,
                        label="Width"
                    )
                    
                    height = gr.Slider(
                        minimum=256,
                        maximum=1024,
                        value=512,
                        step=64,
                        label="Height"
                    )
                
                with gr.Row():
                    num_frames = gr.Slider(
                        minimum=8,
                        maximum=64,
                        value=16,
                        step=1,
                        label="Number of Frames"
                    )
                    
                    fps = gr.Slider(
                        minimum=1,
                        maximum=30,
                        value=8,
                        step=1,
                        label="Frames Per Second"
                    )
                    
            with gr.Column():
                with gr.Row():
                    num_inference_steps = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=25,
                        step=1,
                        label="Inference Steps"
                    )
                    
                    guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=20.0,
                        value=7.5,
                        step=0.5,
                        label="Guidance Scale"
                    )
                
                with gr.Row():
                    motion_bucket_id = gr.Slider(
                        minimum=1,
                        maximum=255,
                        value=127,
                        step=1,
                        label="Motion Bucket ID (for SVD models)"
                    )
                    
                    seed = gr.Slider(
                        minimum=-1,
                        maximum=2147483647,
                        value=-1,
                        step=1,
                        label="Seed (-1 for random)"
                    )
                
                # Provider selection
                providers_list = [
                    "hf-inference",  # Default Hugging Face Inference
                    "fal-ai",        # Fal AI provider
                    "novita",        # Novita provider
                    "replicate",     # Replicate provider
                ]
                
                provider_radio = gr.Radio(
                    choices=providers_list,
                    value="hf-inference",
                    label="Inference Provider",
                    info="Select an inference provider. Note: Requires provider-specific API key except for hf-inference"
                )
                
                # BYOK textbox
                byok_textbox = gr.Textbox(
                    value="",
                    label="BYOK (Bring Your Own Key)",
                    info="Enter a provider API key here. When empty, only 'hf-inference' provider can be used.",
                    placeholder="Enter your provider API token",
                    type="password"  # Hide the API key for security
                )
    
    # Set up the generation click event
    generate_button.click(
        fn=generate_video,
        inputs=[
            prompt_box,
            negative_prompt,
            num_frames,
            fps,
            width,
            height,
            num_inference_steps,
            guidance_scale,
            motion_bucket_id,
            seed,
            provider_radio,
            byok_textbox,
            custom_model_box,
            model_search_box,
            featured_model_radio
        ],
        outputs=video_output
    )
    
    # Connect the model filter to update the radio choices
    def filter_models(search_term):
        print(f"Filtering models with search term: {search_term}")
        filtered = [m for m in models_list if search_term.lower() in m.lower()]
        print(f"Filtered models: {filtered}")
        return gr.update(choices=filtered)
    
    model_search_box.change(
        fn=filter_models,
        inputs=model_search_box,
        outputs=featured_model_radio
    )
    
    # Connect the featured model radio to update the custom model box
    def set_custom_model_from_radio(selected):
        """
        This function will get triggered whenever someone picks a model from the 'Featured Models' radio.
        We will update the Custom Model text box with that selection automatically.
        """
        print(f"Featured model selected: {selected}")
        return selected
    
    featured_model_radio.change(
        fn=set_custom_model_from_radio,
        inputs=featured_model_radio,
        outputs=custom_model_box
    )
    
    # Connect the BYOK textbox to validate provider selection
    byok_textbox.change(
        fn=validate_provider,
        inputs=[byok_textbox, provider_radio],
        outputs=provider_radio
    )
    
    # Also validate provider when the radio changes to ensure consistency
    provider_radio.change(
        fn=validate_provider,
        inputs=[byok_textbox, provider_radio],
        outputs=provider_radio
    )
    
    # Information tab
    with gr.Accordion("Information & Help", open=False):
        gr.Markdown("""
        # 🎬 Serverless-VideoGen-Hub
        
        This application uses Hugging Face's Serverless Inference API to generate videos from text prompts.
        
        ## Supported Providers
        
        - **hf-inference**: Hugging Face's default inference API (free)
        - **fal-ai**: Fal AI provider (requires API key)
        - **novita**: Novita AI provider (requires API key)
        - **replicate**: Replicate provider (requires API key)
        
        ## Parameters Explained
        
        - **Prompt**: The text description of your desired video
        - **Negative Prompt**: What you DON'T want to see in the video
        - **Width/Height**: Dimensions of the generated video
        - **Number of Frames**: Total frames to generate
        - **FPS**: Frames per second for playback
        - **Inference Steps**: More steps = higher quality but slower generation
        - **Guidance Scale**: How closely to follow the prompt (higher values = more faithful)
        - **Motion Bucket ID**: Controls motion intensity (for Stable Video Diffusion models)
        - **Seed**: For reproducible results, -1 means random
        
        ## Models
        
        You can either select from the featured models or enter a custom model path.
        
        Check out [Hugging Face's models page](https://huggingface.co/models?pipeline_tag=text-to-video) for more video generation models.
        """)

# Launch the app
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch(show_api=True)