# Source: Hugging Face Space "mooki0/HunyuanWorld-Demo"
# Space status when captured: Build error
# File size: 11,973 Bytes
# Revision: 57276d4

import gradio as gr
import torch
import os
import numpy as np
import cv2
from PIL import Image
import open3d as o3d
import shutil

# --- Model Classes (adapted from demo scripts) ---

# Panorama Generation
from hy3dworld import Text2PanoramaPipelines, Image2PanoramaPipelines, Perspective

class Text2PanoramaDemo:
    """Text-to-panorama wrapper around ``Text2PanoramaPipelines``.

    Loads the "black-forest-labs/FLUX.1-dev" checkpoint in bfloat16 and
    attaches the "HunyuanWorld-PanoDiT-Text" LoRA from "tencent/HunyuanWorld-1".
    """

    def __init__(self):
        """Build the pipeline, load the LoRA and turn on CPU offload and VAE tiling."""
        dtype = torch.bfloat16
        base = Text2PanoramaPipelines.from_pretrained(
            "black-forest-labs/FLUX.1-dev", torch_dtype=dtype
        )
        self.pipe = base.to("cuda")

        lora_options = dict(
            subfolder="HunyuanWorld-PanoDiT-Text",
            weight_name="lora.safetensors",
            torch_dtype=dtype,
        )
        self.pipe.load_lora_weights("tencent/HunyuanWorld-1", **lora_options)

        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()

    def run(self, prompt, negative_prompt, seed, height, width, guidance_scale, steps):
        """Run the pipeline once and return the first image of its result.

        Args:
            prompt: Text prompt, passed positionally to the pipeline.
            negative_prompt: Negative text prompt.
            seed: Seed for the CUDA ``torch.Generator`` handed to the pipeline.
            height: Requested output height.
            width: Requested output width.
            guidance_scale: Forwarded as ``guidance_scale``.
            steps: Forwarded as ``num_inference_steps``.

        Returns:
            ``images[0]`` of the pipeline output.
        """
        rng = torch.Generator("cuda").manual_seed(seed)
        result = self.pipe(
            prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            generator=rng,
            blend_extend=6,
            true_cfg_scale=0.0,
        )
        return result.images[0]

class Image2PanoramaDemo:
    """Image-to-panorama wrapper around ``Image2PanoramaPipelines``.

    Loads the "black-forest-labs/FLUX.1-dev" checkpoint in bfloat16 and
    attaches the "HunyuanWorld-PanoDiT-Image" LoRA from "tencent/HunyuanWorld-1".
    """

    def __init__(self):
        """Build the pipeline, load the LoRA, enable offload/tiling, set stock prompts."""
        self.pipe = Image2PanoramaPipelines.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            torch_dtype=torch.bfloat16
        ).to("cuda")
        self.pipe.load_lora_weights(
            "tencent/HunyuanWorld-1",
            subfolder="HunyuanWorld-PanoDiT-Image",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()
        # Stock prompt fragments that run() always adds to the user's prompts.
        self.general_negative_prompt = "human, person, people, messy, low-quality, blur, noise, low-resolution"
        self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"

    def run(self, prompt, negative_prompt, image, seed, height, width, guidance_scale, steps, fov):
        """Project ``image`` onto a panorama canvas and let the pipeline complete it.

        Args:
            prompt: User prompt; the stock positive prompt is appended to it.
            negative_prompt: User negative prompt; the stock negative prompt is
                prepended to it.
            image: Input picture in RGB channel order (a PIL image, or anything
                ``np.array`` accepts).
            seed: Seed for the CUDA ``torch.Generator`` handed to the pipeline.
            height: Panorama height passed to ``GetEquirec`` and the pipeline.
            width: Panorama width passed to ``GetEquirec`` and the pipeline.
            guidance_scale: Forwarded as ``guidance_scale``.
            steps: Forwarded as ``num_inference_steps``.
            fov: Field of view of the input image; also sets how wide the image
                is resized relative to ``width`` (``fov / 360`` of it).

        Returns:
            ``images[0]`` of the pipeline output.
        """
        prompt = prompt + ", " + self.general_positive_prompt
        negative_prompt = self.general_negative_prompt + ", " + negative_prompt

        # Normalise PIL input to exactly three RGB channels so RGBA/greyscale
        # uploads do not reach the projection with an unexpected channel count.
        if isinstance(image, Image.Image):
            image = image.convert("RGB")
        perspective_img = np.array(image)

        # Resize so the image spans fov/360 of the panorama width, keeping its
        # aspect ratio.
        height_fov, width_fov = perspective_img.shape[:2]
        ratio = width_fov / height_fov
        w = int((fov / 360) * width)
        h = int(w / ratio)
        perspective_img = cv2.resize(perspective_img, (w, h), interpolation=cv2.INTER_AREA)

        # NOTE(review): the two zeros are presumably the viewing angles of the
        # projection -- confirm against hy3dworld.Perspective.
        equ = Perspective(perspective_img, fov, 0, 0, crop_bound=False)
        img, mask = equ.GetEquirec(height, width)
        # Shrink the valid region slightly, then blank everything outside it.
        mask = cv2.erode(mask.astype(np.uint8), np.ones((3, 3), np.uint8), iterations=5)
        img = img * mask
        # Invert: 255 where the projected image is absent, 0 where it is present.
        mask = 255 - (mask.astype(np.uint8) * 255)
        mask = Image.fromarray(mask[:, :, 0])
        # The pixels are already RGB (PIL/Gradio input, not cv2.imread), so no
        # BGR->RGB conversion is applied here; doing so would swap red and blue.
        img = Image.fromarray(img.astype(np.uint8))

        image = self.pipe(
            prompt=prompt, image=img, mask_image=mask, height=height, width=width,
            negative_prompt=negative_prompt, guidance_scale=guidance_scale, num_inference_steps=steps,
            generator=torch.Generator("cuda").manual_seed(seed), blend_extend=6, shifting_extend=0, true_cfg_scale=2.0,
        ).images[0]
        return image

# Scene Generation
from hy3dworld import LayerDecomposition, WorldComposer, process_file

class HYworldDemo:
    """Turns a panorama image into layered meshes written as ``.ply`` files."""

    def __init__(self, seed=42):
        """Create the layer decomposer and the world composer.

        Args:
            seed: Forwarded to ``WorldComposer``.
        """
        pano_width = 3840
        self.LayerDecomposer = LayerDecomposition()
        self.hy3d_world = WorldComposer(
            device=torch.device("cuda"),
            resolution=(pano_width, pano_width // 2),
            seed=seed,
            filter_mask=True,
            # One kernel unit per 1920 px of width, never below 1.
            kernel_scale=max(1, int(pano_width / 1920)),
        )

    def run(self, image_path, labels_fg1, labels_fg2, classes, output_dir):
        """Decompose the panorama into layers, build meshes and save them.

        Args:
            image_path: Path of the input panorama.
            labels_fg1: Labels used for the layer-0 decomposition request.
            labels_fg2: Labels used for the layer-1 and layer-2 requests.
            classes: Stored under the "class" key of every request.
            output_dir: Directory for intermediate files and the meshes;
                created if missing.

        Returns:
            List of written ``mesh_layer{i}.ply`` paths, in layer order.
        """
        os.makedirs(output_dir, exist_ok=True)

        def make_request(source_image, labels):
            # One-element list of the request dict the decomposer is called with.
            return [{
                "image_path": source_image,
                "output_path": output_dir,
                "labels": labels,
                "class": classes,
            }]

        first_pass = make_request(image_path, labels_fg1)
        # Later passes read 'remove_fg1_image.png' from the output directory.
        later_passes = make_request(os.path.join(output_dir, 'remove_fg1_image.png'), labels_fg2)

        self.LayerDecomposer(first_pass, layer=0)
        for layer_no in (1, 2):
            self.LayerDecomposer(later_passes, layer=layer_no)

        panos, boxes = self.hy3d_world._load_separate_pano_from_dir(output_dir, sr=True)
        world_layers = self.hy3d_world.generate_world(
            separate_pano=panos, fg_bboxes=boxes, world_type='mesh'
        )

        written = []
        for idx, layer in enumerate(world_layers):
            ply_path = os.path.join(output_dir, f"mesh_layer{idx}.ply")
            o3d.io.write_triangle_mesh(ply_path, layer['mesh'])
            written.append(ply_path)
        return written

# --- Gradio UI ---

# Instantiate models
# Built once at import time, in this order, and reused by the handlers below.
t2p_demo, i2p_demo, hy_demo = (
    Text2PanoramaDemo(),
    Image2PanoramaDemo(),
    HYworldDemo(),
)

def generate_text_to_pano(prompt, neg_prompt, seed, height, width, scale, steps):
    """Generate a panorama from text and save it for the scene-generation step.

    Args:
        prompt: Text prompt.
        neg_prompt: Negative text prompt.
        seed: Generation seed; also embedded in the saved file name.
        height: Requested panorama height.
        width: Requested panorama width.
        scale: Guidance scale.
        steps: Number of inference steps.

    Returns:
        Tuple ``(image, path)``: the generated image and the path of its PNG
        copy under ``temp_outputs``.
    """
    import tempfile  # local import keeps this function self-contained

    image = t2p_demo.run(prompt, neg_prompt, seed, height, width, scale, steps)
    # Save to a temporary file to pass to the next stage
    temp_dir = "temp_outputs"
    os.makedirs(temp_dir, exist_ok=True)
    # A name built from the seed alone is shared by every request that uses
    # that seed, so a path stored earlier could later point at a different
    # image. mkstemp guarantees a fresh file per call (files accumulate in
    # temp_outputs as a consequence).
    fd, temp_path = tempfile.mkstemp(prefix=f"pano_{seed}_", suffix=".png", dir=temp_dir)
    os.close(fd)
    image.save(temp_path)
    return image, temp_path

def generate_image_to_pano(prompt, neg_prompt, image, seed, height, width, scale, steps, fov):
    """Generate a panorama from an input image and save it for the scene step.

    Args:
        prompt: Text prompt.
        neg_prompt: Negative text prompt.
        image: Input picture as a NumPy array, or ``None`` when nothing was
            uploaded.
        seed: Generation seed; also embedded in the saved file name.
        height: Requested panorama height.
        width: Requested panorama width.
        scale: Guidance scale.
        steps: Number of inference steps.
        fov: Field of view of the input image.

    Returns:
        Tuple ``(image, path)``: the generated image and the path of its PNG
        copy under ``temp_outputs``.

    Raises:
        gr.Error: If no input image was provided.
    """
    import tempfile  # local import keeps this function self-contained

    # Without this guard Image.fromarray(None) fails with an opaque error.
    if image is None:
        raise gr.Error("Please upload an input image first.")

    pil_image = Image.fromarray(image)
    result_image = i2p_demo.run(prompt, neg_prompt, pil_image, seed, height, width, scale, steps, fov)
    temp_dir = "temp_outputs"
    os.makedirs(temp_dir, exist_ok=True)
    # A name built from the seed alone is shared by every request that uses
    # that seed, so a path stored earlier could later point at a different
    # image. mkstemp guarantees a fresh file per call (files accumulate in
    # temp_outputs as a consequence).
    fd, temp_path = tempfile.mkstemp(prefix=f"pano_i2p_{seed}_", suffix=".png", dir=temp_dir)
    os.close(fd)
    result_image.save(temp_path)
    return result_image, temp_path

def generate_scene(panorama_file_path, fg1, fg2, classes, seed):
    """Build the layered 3D scene for a panorama and return the first mesh file.

    Args:
        panorama_file_path: Path of the panorama image on disk.
        fg1: Comma-separated labels for the first foreground layer.
        fg2: Comma-separated labels for the second foreground layer.
        classes: Scene class passed through to ``hy_demo.run``.
        seed: Used only to name the output directory ``output_scene_{seed}``.

    Returns:
        Path of the first mesh file, or ``None`` if no mesh was produced.

    Raises:
        gr.Error: If the panorama path is missing or does not exist.
    """
    panorama_ready = panorama_file_path is not None and os.path.exists(panorama_file_path)
    if not panorama_ready:
        raise gr.Error("Please generate or upload a panorama image first.")

    # Start from an empty output directory; a missing one is not an error.
    output_dir = f"output_scene_{seed}"
    shutil.rmtree(output_dir, ignore_errors=True)

    def split_labels(raw):
        # "a, b,,c " -> ["a", "b", "c"]
        trimmed = (piece.strip() for piece in raw.split(','))
        return [piece for piece in trimmed if piece]

    mesh_files = hy_demo.run(
        panorama_file_path,
        split_labels(fg1),
        split_labels(fg2),
        classes,
        output_dir,
    )

    # Only the first layer is shown: Gradio's Model3D doesn't support multiple
    # files well. A better UI might zip them for download or show several viewers.
    if not mesh_files:
        return None
    return mesh_files[0]

# Extra CSS passed to gr.Blocks below: auto left/right margins for
# #col-container, and minimum heights for the #pano_output and #scene_output
# elements (the elem_ids of the panorama image and the 3D viewer).
css = """
#col-container {margin-left: auto; margin-right: auto;}
#pano_output {min-height: 320px;}
#scene_output {min-height: 480px;}
"""

# Two-step UI: tab 1 produces a panorama (from text or from an image), tab 2
# turns a panorama into a 3D scene. The "send" button copies the last generated
# panorama into tab 2 and switches to it.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("<h1>HunyuanWorld-1.0: A One-Stop Solution for Text-driven 3D Scene Generation</h1>")
    gr.Markdown("Official Repo: [Tencent-Hunyuan/HunyuanWorld-1.0](https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0)")

    # State to hold the path of the generated panorama
    panorama_path_state = gr.State(None)

    # The outer Tabs container is named and its items carry explicit ids so a
    # handler can select a tab by id instead of guessing at demo.children.
    with gr.Tabs() as main_tabs:
        with gr.TabItem("Step 1: Panorama Generation", id="pano_tab"):
            with gr.Row():
                with gr.Column():
                    with gr.Tabs():
                        with gr.TabItem("Text-to-Panorama") as t2p_tab:
                            t2p_prompt = gr.Textbox(label="Prompt", value="A beautiful sunset over a mountain range, fantasy style")
                            t2p_neg_prompt = gr.Textbox(label="Negative Prompt", value="blurry, low quality")
                            t2p_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=42)
                            with gr.Accordion("Advanced Settings", open=False):
                                t2p_height = gr.Slider(label="Height", minimum=512, maximum=1024, step=64, value=960)
                                t2p_width = gr.Slider(label="Width", minimum=1024, maximum=2048, step=128, value=1920)
                                t2p_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=50, step=1, value=30)
                                t2p_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=100, step=5, value=50)
                            t2p_button = gr.Button("Generate Panorama", variant="primary")

                        with gr.TabItem("Image-to-Panorama") as i2p_tab:
                            i2p_image = gr.Image(type="numpy", label="Input Image")
                            i2p_prompt = gr.Textbox(label="Prompt", value="A photo of a room, modern design")
                            i2p_neg_prompt = gr.Textbox(label="Negative Prompt", value="watermark, text")
                            i2p_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=100)
                            with gr.Accordion("Advanced Settings", open=False):
                                i2p_fov = gr.Slider(label="Field of View (FOV)", minimum=40, maximum=120, step=5, value=80)
                                i2p_height = gr.Slider(label="Height", minimum=512, maximum=1024, step=64, value=960)
                                i2p_width = gr.Slider(label="Width", minimum=1024, maximum=2048, step=128, value=1920)
                                i2p_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=50, step=1, value=30)
                                i2p_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=100, step=5, value=50)
                            i2p_button = gr.Button("Generate Panorama", variant="primary")

                with gr.Column():
                    pano_output = gr.Image(label="Panorama Output", elem_id="pano_output")
                    send_to_scene_btn = gr.Button("Step 2: Send to Scene Generation")

        with gr.TabItem("Step 2: Scene Generation", id="scene_tab") as scene_tab:
            with gr.Row():
                with gr.Column():
                    gr.Markdown("Load the panorama generated in Step 1, or upload your own.")
                    scene_input_image = gr.Image(type="filepath", label="Input Panorama")
                    scene_classes = gr.Radio(["outdoor", "indoor"], label="Scene Class", value="outdoor")
                    scene_fg1 = gr.Textbox(label="Foreground Labels (Layer 1)", placeholder="e.g., tree, car, person")
                    scene_fg2 = gr.Textbox(label="Foreground Labels (Layer 2)", placeholder="e.g., building, mountain")
                    scene_seed = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=2024)
                    scene_button = gr.Button("Generate 3D Scene", variant="primary")
                with gr.Column():
                    scene_output = gr.Model3D(label="3D Scene Output (.ply)", elem_id="scene_output")

    # Wire up components
    t2p_button.click(
        fn=generate_text_to_pano,
        inputs=[t2p_prompt, t2p_neg_prompt, t2p_seed, t2p_height, t2p_width, t2p_scale, t2p_steps],
        outputs=[pano_output, panorama_path_state]
    )
    i2p_button.click(
        fn=generate_image_to_pano,
        inputs=[i2p_prompt, i2p_neg_prompt, i2p_image, i2p_seed, i2p_height, i2p_width, i2p_scale, i2p_steps, i2p_fov],
        outputs=[pano_output, panorama_path_state]
    )

    def transfer_to_scene_gen(path):
        """Load the stored panorama into the scene tab and switch to that tab."""
        return {
            scene_input_image: gr.update(value=path),
            # Tabs are selected by the id given to the TabItem, not by the
            # TabItem object itself.
            main_tabs: gr.update(selected="scene_tab"),
        }

    send_to_scene_btn.click(
        fn=transfer_to_scene_gen,
        inputs=panorama_path_state,
        outputs=[scene_input_image, main_tabs]
    )

    scene_button.click(
        fn=generate_scene,
        inputs=[scene_input_image, scene_fg1, scene_fg2, scene_classes, scene_seed],
        outputs=scene_output
    )

# Enable the request queue, then start the app with debug mode on.
demo.queue()
demo.launch(debug=True)