Create app.py
app.py
ADDED
@@ -0,0 +1,334 @@
import json
import os
import random
import tempfile
from typing import Any, List, Union

import gradio as gr
import numpy as np
import spaces
import torch
import trimesh
from gradio_image_prompter import ImagePrompter
from gradio_litmodel3d import LitModel3D
from huggingface_hub import snapshot_download
from PIL import Image
from skimage import measure
from transformers import AutoModelForMaskGeneration, AutoProcessor

from midi.pipelines.pipeline_midi import MIDIPipeline
from midi.utils.smoothing import smooth_gpu
from scripts.grounding_sam import plot_segmentation, segment
from scripts.inference_midi import preprocess_image, split_rgb_mask

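# The midi.* and scripts.* imports are local modules from the MIDI-3D codebase,
# presumably bundled with this Space; the remaining imports are pip-installable.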
# Constants
MAX_SEED = np.iinfo(np.int32).max
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
DTYPE = torch.bfloat16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
REPO_ID = "VAST-AI/MIDI-3D"

MARKDOWN = """
## Image to 3D Scene with [MIDI-3D](https://huanngzh.github.io/MIDI-Page/)
<b>Important!</b> Please check out our [instruction video](https://github.com/user-attachments/assets/814c046e-f5c3-47cf-bb56-60154be8374c)!
1. Upload an image, and draw a bounding box for each instance by holding and dragging the mouse. Then click "Run Segmentation" to generate the segmentation result. <b>Ensure that instances are not too small and that the bounding boxes fit snugly around each instance.</b>
2. <b>Check "Do image padding" in "Generation Settings" if instances in your image are too close to the image border.</b> Then click "Run Generation" to generate a 3D scene from the image and segmentation result.
3. If you find the generated 3D scene satisfactory, download it by clicking the "Download GLB" button.
"""

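# Each EXAMPLES row matches the gr.Examples inputs declared in the UI below:
# (ImagePrompter payload, segmentation-map path, seed, randomize_seed, do_image_padding).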
EXAMPLES = [
    [
        {
            "image": "assets/example_data/Cartoon-Style/03_rgb.png",
        },
        "assets/example_data/Cartoon-Style/03_seg.png",
        42,
        False,
        False,
    ],
    [
        {
            "image": "assets/example_data/Cartoon-Style/01_rgb.png",
        },
        "assets/example_data/Cartoon-Style/01_seg.png",
        42,
        False,
        False,
    ],
    [
        {
            "image": "assets/example_data/Realistic-Style/02_rgb.png",
        },
        "assets/example_data/Realistic-Style/02_seg.png",
        42,
        False,
        False,
    ],
    [
        {
            "image": "assets/example_data/Cartoon-Style/00_rgb.png",
        },
        "assets/example_data/Cartoon-Style/00_seg.png",
        42,
        False,
        False,
    ],
    [
        {
            "image": "assets/example_data/Realistic-Style/00_rgb.png",
        },
        "assets/example_data/Realistic-Style/00_seg.png",
        42,
        False,
        True,
    ],
    [
        {
            "image": "assets/example_data/Realistic-Style/01_rgb.png",
        },
        "assets/example_data/Realistic-Style/01_seg.png",
        42,
        False,
        True,
    ],
    [
        {
            "image": "assets/example_data/Realistic-Style/05_rgb.png",
        },
        "assets/example_data/Realistic-Style/05_seg.png",
        42,
        False,
        False,
    ],
]

os.makedirs(TMP_DIR, exist_ok=True)

# Prepare models
## Grounding SAM
segmenter_id = "facebook/sam-vit-base"
sam_processor = AutoProcessor.from_pretrained(segmenter_id)
sam_segmentator = AutoModelForMaskGeneration.from_pretrained(segmenter_id).to(
    DEVICE, DTYPE
)
## MIDI-3D
local_dir = "pretrained_weights/MIDI-3D"
snapshot_download(repo_id=REPO_ID, local_dir=local_dir)
pipe: MIDIPipeline = MIDIPipeline.from_pretrained(local_dir).to(DEVICE, DTYPE)
pipe.init_custom_adapter(
    set_self_attn_module_names=[
        "blocks.8",
        "blocks.9",
        "blocks.10",
        "blocks.11",
        "blocks.12",
    ]
)

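# Note: pipe.init_custom_adapter above patches self-attention in DiT blocks 8-12;
# presumably this is where MIDI's multi-instance attention lets the per-instance
# latents attend to one another during denoising (cf. the
# attention_kwargs={"num_instances": ...} argument passed to the pipeline below).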
# Utils
def get_random_hex():
    random_bytes = os.urandom(8)
    random_hex = random_bytes.hex()
    return random_hex

@spaces.GPU()
@torch.no_grad()
@torch.autocast(device_type=DEVICE, dtype=torch.bfloat16)
def run_segmentation(image_prompts: Any, polygon_refinement: bool) -> Image.Image:
    rgb_image = image_prompts["image"].convert("RGB")

    # pre-process the layers and get the xyxy boxes of each layer
    if len(image_prompts["points"]) == 0:
        # raise (not just construct) the error so Gradio surfaces it in the UI
        raise gr.Error("Please draw bounding boxes for each instance on the image.")
    # indices 0/1 and 3/4 of each ImagePrompter point hold the two box corners
    boxes = [
        [int(box[0]), int(box[1]), int(box[3]), int(box[4])]
        for box in image_prompts["points"]
    ]

    # run the segmentation
    detections = segment(
        sam_processor,
        sam_segmentator,
        rgb_image,
        boxes=[boxes],  # one box list per image
        polygon_refinement=polygon_refinement,
    )
    seg_map_pil = plot_segmentation(rgb_image, detections)

    torch.cuda.empty_cache()

    return seg_map_pil

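# run_midi feeds the MIDI pipeline the per-instance crops produced by
# split_rgb_mask; with decode_progressive=True and return_dict=False it appears
# to return raw per-instance grids rather than finished meshes (see the
# marching-cubes step in run_generation).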
@torch.no_grad()
def run_midi(
    pipe: Any,
    rgb_image: Union[str, Image.Image],
    seg_image: Union[str, Image.Image],
    seed: int,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.0,
    do_image_padding: bool = False,
) -> Any:  # raw pipeline outputs, not yet a trimesh.Scene
    if do_image_padding:
        rgb_image, seg_image = preprocess_image(rgb_image, seg_image)
    instance_rgbs, instance_masks, scene_rgbs = split_rgb_mask(rgb_image, seg_image)

    num_instances = len(instance_rgbs)
    outputs = pipe(
        image=instance_rgbs,
        mask=instance_masks,
        image_scene=scene_rgbs,
        attention_kwargs={"num_instances": num_instances},
        generator=torch.Generator(device=pipe.device).manual_seed(seed),
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        decode_progressive=True,
        return_dict=False,
    )

    return outputs

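# The pipeline outputs unpack per instance into
# (grid logits, grid size, bbox size, bbox min, bbox max); run_generation below
# reconstructs each instance with marching cubes and rescales the vertices from
# grid-index coordinates back into scene coordinates.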
@spaces.GPU(duration=180)
@torch.no_grad()
@torch.autocast(device_type=DEVICE, dtype=torch.bfloat16)
def run_generation(
    rgb_image: Any,
    seg_image: Union[str, Image.Image],
    seed: int,
    randomize_seed: bool = False,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.0,
    do_image_padding: bool = False,
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    if not isinstance(rgb_image, Image.Image) and "image" in rgb_image:
        rgb_image = rgb_image["image"]

    outputs = run_midi(
        pipe,
        rgb_image,
        seg_image,
        seed,
        num_inference_steps,
        guidance_scale,
        do_image_padding,
    )

    # marching cubes
    trimeshes = []
    for logits_, grid_size, bbox_size, bbox_min, bbox_max in zip(*outputs):
        grid_logits = logits_.view(grid_size)
        grid_logits = smooth_gpu(grid_logits, method="gaussian", sigma=1)
        torch.cuda.empty_cache()
        vertices, faces, normals, _ = measure.marching_cubes(
            grid_logits.float().cpu().numpy(), 0, method="lewiner"
        )
        # map vertices from grid-index space into the instance's world-space bbox
        vertices = vertices / grid_size * bbox_size + bbox_min

        # Trimesh
        mesh = trimesh.Trimesh(vertices.astype(np.float32), np.ascontiguousarray(faces))
        trimeshes.append(mesh)

    # compose the output meshes
    scene = trimesh.Scene(trimeshes)

    tmp_path = os.path.join(TMP_DIR, f"midi3d_{get_random_hex()}.glb")
    scene.export(tmp_path)

    torch.cuda.empty_cache()

    return tmp_path, tmp_path, seed

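# run_generation returns tmp_path twice on purpose: the same GLB path feeds
# both the LitModel3D viewer and the Download button in the UI below.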
# Demo
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)

    with gr.Row():
        with gr.Column():
            with gr.Row():
                image_prompts = ImagePrompter(label="Input Image", type="pil")
                seg_image = gr.Image(
                    label="Segmentation Result", type="pil", format="png"
                )

            with gr.Accordion("Segmentation Settings", open=False):
                polygon_refinement = gr.Checkbox(
                    label="Polygon Refinement", value=False
                )
            seg_button = gr.Button("Run Segmentation")

            with gr.Accordion("Generation Settings", open=False):
                do_image_padding = gr.Checkbox(label="Do image padding", value=False)
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=50,
                )
                guidance_scale = gr.Slider(
                    label="CFG scale",
                    minimum=0.0,
                    maximum=10.0,
                    step=0.1,
                    value=7.0,
                )
            gen_button = gr.Button("Run Generation", variant="primary")

        with gr.Column():
            model_output = LitModel3D(label="Generated GLB", exposure=1.0, height=500)
            download_glb = gr.DownloadButton(label="Download GLB", interactive=False)

    with gr.Row():
        gr.Examples(
            examples=EXAMPLES,
            fn=run_generation,
            inputs=[image_prompts, seg_image, seed, randomize_seed, do_image_padding],
            outputs=[model_output, download_glb, seed],
            cache_examples=False,
        )

    seg_button.click(
        run_segmentation,
        inputs=[
            image_prompts,
            polygon_refinement,
        ],
        outputs=[seg_image],
    ).then(lambda: gr.Button(interactive=True), outputs=[gen_button])

    gen_button.click(
        run_generation,
        inputs=[
            image_prompts,
            seg_image,
            seed,
            randomize_seed,
            num_inference_steps,
            guidance_scale,
            do_image_padding,
        ],
        outputs=[model_output, download_glb, seed],
    ).then(lambda: gr.Button(interactive=True), outputs=[download_glb])


demo.launch()