import spaces
import gradio as gr
import sys
import os
import subprocess
import uuid
import shutil
from tqdm import tqdm
from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
import importlib, site

# Re-discover all .pth/.egg-link files
for sitedir in site.getsitepackages():
    site.addsitedir(sitedir)

# Clear caches so importlib will pick up new modules
importlib.invalidate_caches()


def sh(cmd):
    subprocess.check_call(cmd, shell=True)


flash_attention_installed = False

try:
    print("Attempting to download and install FlashAttention wheel...")
    flash_attention_wheel = hf_hub_download(
        repo_id="alexnasa/flash-attn-3",
        repo_type="model",
        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
    )
    sh(f"pip install {flash_attention_wheel}")
    # Make the freshly installed package importable in this process
    site.addsitedir(site.getsitepackages()[0])
    importlib.invalidate_caches()
    flash_attention_installed = True
except Exception as e:
    print(f"⚠️ Could not install FlashAttention: {e}")
    print("Continuing without FlashAttention...")

try:
    print("Attempting to download and install Transformer Engine wheel...")
    te_wheel = hf_hub_download(
        repo_id="alexnasa/transformer_engine_wheels",
        repo_type="model",
        filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl",
    )
    sh(f"pip install {te_wheel}")
    # Make the freshly installed package importable in this process
    site.addsitedir(site.getsitepackages()[0])
    importlib.invalidate_caches()
except Exception as e:
    print(f"⚠️ Could not install Transformer Engine: {e}")
    print("Continuing without Transformer Engine...")

import torch

print(f"Torch version: {torch.__version__}")
print(f"FlashAttention available: {flash_attention_installed}")

import tempfile
from pathlib import Path
from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir
from huggingface_hub import HfApi

snapshot_download(repo_id="bytedance-research/HuMo", local_dir="./weights/HuMo")
snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./weights/Wan2.1-T2V-1.3B")
snapshot_download(repo_id="openai/whisper-large-v3", local_dir="./weights/whisper-large-v3")

os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"

path_to_insert = "humo"
if path_to_insert not in sys.path:
    sys.path.insert(0, path_to_insert)

from common.config import load_config, create_object

config = load_config(
    "./humo/configs/inference/generate.yaml",
    [
        "dit.sp_size=1",
        "generation.frames=97",
        "generation.scale_t=5.5",
        "generation.scale_a=5.0",
        "generation.mode=TIA",
        "generation.height=480",
        "generation.width=832",
    ],
)
runner = create_object(config)

os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space")  # or another writable path


def restore_inductor_cache_from_hub(repo_id: str,
                                    filename: str = "torch_compile_cache.zip",
                                    path_in_repo: str = "inductor_cache",
                                    repo_type: str = "model",
                                    hf_token: str | None = None):
    cache_root = Path(_inductor_cache_dir()).resolve()
    cache_root.mkdir(parents=True, exist_ok=True)
    zip_path = hf_hub_download(repo_id=repo_id,
                               filename=f"{path_in_repo}/{filename}",
                               repo_type=repo_type,
                               token=hf_token)
    shutil.unpack_archive(zip_path, extract_dir=str(cache_root))
    print(f"✓ Restored cache into {cache_root}")


# restore_inductor_cache_from_hub("alexnasa/humo-compiled")


# Duration hook passed to @spaces.GPU(duration=...) on run_pipeline below.
def get_duration(prompt_text, steps, image_file, audio_file_path, max_duration, session_id, progress):
    return calculate_required_time(steps, max_duration)


def calculate_required_time(steps, max_duration):
    warmup_s = 50

    max_duration_duration_mapping = {
        20: 3,
        45: 7,
        70: 13,
        95: 21,
    }

    # Humo 1.7
    # max_duration_duration_mapping = {
    #     20: 2,
    #     45: 2,
    #     70: 5,
    #     95: 6,
    # }

    each_step_s = max_duration_duration_mapping[max_duration]
    duration_s = (each_step_s * steps) + warmup_s

    print(f'estimated duration:{duration_s}')

    return int(duration_s)
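
# Illustrative check of the estimate above (numbers come straight from the mapping and
# warmup constant, not from new measurements):
#   calculate_required_time(steps=10, max_duration=45) -> 50 + 7 * 10  = 120 s  (~2.0 mins)
#   calculate_required_time(steps=50, max_duration=95) -> 50 + 21 * 50 = 1100 s (~18.3 mins)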

def get_required_time_string(steps, max_duration):
    duration_s = calculate_required_time(steps, max_duration)
    duration_m = duration_s / 60
    return f"⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)"


def update_required_time(steps, max_duration):
    return get_required_time_string(steps, max_duration)


def generate_scene(prompt_text, steps, image_paths, audio_file_path, max_duration=3, session_id=None, progress=gr.Progress()):
    prompt_text_check = (prompt_text or "").strip()
    if not prompt_text_check:
        raise gr.Error("Please enter a prompt.")

    if not audio_file_path and not image_paths:
        raise gr.Error("Please provide a reference image or a lipsync audio.")

    return run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration, session_id, progress)


def upload_inductor_cache_to_hub(
    repo_id: str,
    path_in_repo: str = "inductor_cache",
    repo_type: str = "model",  # or "dataset" if you prefer
    hf_token: str | None = None,
):
    """
    Zips the current TorchInductor cache and uploads it to the given repo path.
    Assumes the model was already run once with torch.compile() so the cache exists.
    """
    cache_dir = Path(_inductor_cache_dir()).resolve()
    if not cache_dir.exists():
        raise FileNotFoundError(f"TorchInductor cache not found at {cache_dir}. "
                                "Run a compiled model once to populate it.")

    # Create a zip archive of the entire cache directory
    with tempfile.TemporaryDirectory() as tmpdir:
        archive_base = Path(tmpdir) / "torch_compile_cache"
        archive_path = shutil.make_archive(str(archive_base), "zip", root_dir=str(cache_dir))
        archive_path = Path(archive_path)

        # Upload to Hub
        api = HfApi(token=hf_token)
        api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

        # Put each artifact under path_in_repo, including a tiny metadata stamp for traceability
        # Upload the zip
        dest_path = f"{path_in_repo}/{archive_path.name}"
        api.upload_file(
            path_or_fileobj=str(archive_path),
            path_in_repo=dest_path,
            repo_id=repo_id,
            repo_type=repo_type,
        )

        # Upload a small metadata file (optional but handy)
        meta_txt = (
            f"pytorch={torch.__version__}\n"
            f"inductor_cache_dir={cache_dir}\n"
            f"cuda_available={torch.cuda.is_available()}\n"
            f"cuda_device={torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}\n"
        )
        api.upload_file(
            path_or_fileobj=meta_txt.encode(),
            path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt",
            repo_id=repo_id,
            repo_type=repo_type,
        )

    print("✔ Uploaded TorchInductor cache to the Hub.")

@spaces.GPU(duration=get_duration)
def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration=3, session_id=None, progress=gr.Progress()):
    if session_id is None:
        session_id = uuid.uuid4().hex

    inference_mode = "TIA"

    # Validate inputs
    prompt_text = (prompt_text or "").strip()
    if not prompt_text:
        raise gr.Error("Please enter a prompt.")

    if not audio_file_path and not image_paths:
        raise gr.Error("Please provide a reference image or a lipsync audio.")

    if not audio_file_path:
        inference_mode = "TI"
        audio_path = None
        tmp_audio_path = None
    else:
        audio_path = audio_file_path if isinstance(audio_file_path, str) else getattr(audio_file_path, "name", str(audio_file_path))

    if not image_paths:
        inference_mode = "TA"
        img_paths = None
    else:
        img_paths = [image_data[0] for image_data in image_paths]

    print(f'{session_id} is using inference_mode:{inference_mode} with steps:{steps} with {max_duration} frames')

    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
    os.makedirs(output_dir, exist_ok=True)

    if audio_path:
        def add_silence_to_audio_ffmpeg(audio_path, tmp_audio_path, silence_duration_s=0.5):
            # Prepend silence_duration_s of silence by concatenating an anullsrc clip
            # (input 1) with the original audio (input 0).
            command = [
                'ffmpeg',
                '-i', audio_path,
                '-f', 'lavfi',
                '-t', str(silence_duration_s),
                '-i', 'anullsrc=r=16000:cl=stereo',
                '-filter_complex', '[1][0]concat=n=2:v=0:a=1[out]',
                '-map', '[out]',
                '-y', tmp_audio_path,
                '-loglevel', 'quiet'
            ]
            subprocess.run(command, check=True)

        tmp_audio_path = os.path.join(output_dir, "tmp_audio.wav")
        add_silence_to_audio_ffmpeg(audio_path, tmp_audio_path)

    # Random filename
    filename = f"gen_{uuid.uuid4().hex[:10]}"

    width, height = 832, 480

    runner.inference_loop(
        prompt_text,
        img_paths,
        tmp_audio_path,
        output_dir,
        filename,
        inference_mode,
        width,
        height,
        steps,
        frames=int(max_duration),
        tea_cache_l1_thresh=0.0,
        progress_bar_cmd=progress
    )

    # Return resulting video path
    video_path = os.path.join(output_dir, f"{filename}.mp4")
    if os.path.exists(video_path):
        # upload_inductor_cache_to_hub("alexnasa/humo-compiled")
        return video_path
    else:
        candidates = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp4")]
        if candidates:
            return max(candidates, key=lambda p: os.path.getmtime(p))
        return None


css = """
#col-container {
    margin: 0 auto;
    width: 100%;
    max-width: 720px;
}
"""


def cleanup(request: gr.Request):
    sid = request.session_hash
    if sid:
        d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
        shutil.rmtree(d1, ignore_errors=True)


def start_session(request: gr.Request):
    return request.session_hash


with gr.Blocks(css=css) as demo:

    session_state = gr.State()
    demo.load(start_session, outputs=[session_state])

    with gr.Sidebar(width=400):
        gr.HTML(
            """

HuMo – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning

[Github]
""" ) gr.Markdown("**REFERENCE IMAGES**") img_input = gr.Gallery( value=["./examples/ali.png"], show_label=False, label="", interactive=True, rows=1, columns=3, object_fit="contain", height="280", file_types=['image'] ) gr.Markdown("**LIPSYNC AUDIO**") audio_input = gr.Audio( value="./examples/life.wav", sources=["upload"], show_label=False, type="filepath", ) gr.Markdown("**SETTINGS**") default_steps = 10 default_max_duration = 45 max_duration = gr.Slider(minimum=45, maximum=95, value=default_max_duration, step=25, label="Frames") steps_input = gr.Slider(minimum=10, maximum=50, value=default_steps, step=5, label="Diffusion Steps") with gr.Column(elem_id="col-container"): gr.HTML( """
HF Space by: GitHub Repo
""" ) video_output = gr.Video(show_label=False) gr.Markdown("

PROMPT

") prompt_tb = gr.Textbox( value="A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.", show_label=False, lines=5, placeholder="Describe the scene and the person talking....", ) gr.Markdown("") time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration)) run_btn = gr.Button("🎬 Action", variant="primary") gr.Examples( examples=[ [ "A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. She is explaining something to someone standing opposite her", 10, ["./examples/naomi.png"], "./examples/science.wav", 70, ], [ "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.", 10, ["./examples/ella.png"], "./examples/dream.mp3", 45, ], [ "A reddish-brown haired woman sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and her thoughtful pose evoke a Post-Impressionist style in a studio-like setting.", 10, ["./examples/art.png"], "./examples/art.wav", 70, ], [ "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.", 10, ["./examples/ella.png"], "./examples/dream.mp3", 95, ], [ "A woman with long, wavy dark hair looking at a person sitting opposite her whilst holding a book, wearing a leather jacket, long-sleeved jacket with a semi purple color one seen on a photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.", 40, ["./examples/amber.png", "./examples/jacket.png"], "./examples/fictional.wav", 70, ], ], inputs=[prompt_tb, steps_input, img_input, audio_input, max_duration], outputs=[video_output], fn=run_pipeline, cache_examples=True, ) max_duration.change(update_required_time, [steps_input, max_duration], time_required) steps_input.change(update_required_time, [steps_input, max_duration], time_required) run_btn.click( fn=generate_scene, inputs=[prompt_tb, steps_input, img_input, audio_input, max_duration, session_state], outputs=[video_output], ) if __name__ == "__main__": demo.unload(cleanup) demo.queue() demo.launch(ssr_mode=False)