Spaces:
Upload 8 files
- apps/app_sana.py +502 -0
- apps/app_sana_4bit.py +409 -0
- apps/app_sana_4bit_compare_bf16.py +313 -0
- apps/app_sana_controlnet_hed.py +306 -0
- apps/app_sana_multithread.py +565 -0
- apps/safety_check.py +72 -0
- apps/sana_controlnet_pipeline.py +353 -0
- apps/sana_pipeline.py +304 -0
apps/app_sana.py
ADDED
@@ -0,0 +1,502 @@
#!/usr/bin/env python
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import argparse
import os
import random
import socket
import sqlite3
import time
import uuid
from datetime import datetime

import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image
from torchvision.utils import make_grid, save_image
from transformers import AutoModelForCausalLM, AutoTokenizer

from app import safety_check
from app.sana_pipeline import SanaPipeline

MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
DEMO_PORT = int(os.getenv("DEMO_PORT", "15432"))
os.environ["GRADIO_EXAMPLES_CACHE"] = "./.gradio/cache"
COUNTER_DB = os.getenv("COUNTER_DB", ".count.db")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

style_list = [
    {
        "name": "(No style)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, "
        "cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "Photographic",
        "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Anime",
        "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
        "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Manga",
        "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
    {
        "name": "Digital Art",
        "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "photo, photorealistic, realism, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, "
        "majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, "
        "glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, "
        "disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, "
        "detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, "
        "ultra detailed, intricate, professional",
        "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
    },
]

styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"
SCHEDULE_NAME = ["Flow_DPM_Solver"]
DEFAULT_SCHEDULE_NAME = "Flow_DPM_Solver"
NUM_IMAGES_PER_PROMPT = 1
INFER_SPEED = 0


def norm_ip(img, low, high):
    img.clamp_(min=low, max=high)
    img.sub_(low).div_(max(high - low, 1e-5))
    return img


def open_db():
    db = sqlite3.connect(COUNTER_DB)
    db.execute("CREATE TABLE IF NOT EXISTS counter(app CHARS PRIMARY KEY UNIQUE, value INTEGER)")
    db.execute('INSERT OR IGNORE INTO counter(app, value) VALUES("Sana", 0)')
    return db


def read_inference_count():
    with open_db() as db:
        cur = db.execute('SELECT value FROM counter WHERE app="Sana"')
        db.commit()
    return cur.fetchone()[0]


def write_inference_count(count):
    count = max(0, int(count))
    with open_db() as db:
        db.execute(f'UPDATE counter SET value=value+{count} WHERE app="Sana"')
        db.commit()


def run_inference(num_imgs=1):
    write_inference_count(num_imgs)
    count = read_inference_count()

    return (
        f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
        f"16px; color:red; font-weight: bold;'>{count}</span>"
    )


def update_inference_count():
    count = read_inference_count()
    return (
        f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
        f"16px; color:red; font-weight: bold;'>{count}</span>"
    )


def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    if not negative:
        negative = ""
    return p.replace("{prompt}", positive), n + negative


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="config")
    parser.add_argument(
        "--model_path",
        nargs="?",
        default="hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth",
        type=str,
        help="Path to the model file (positional)",
    )
    parser.add_argument("--output", default="./", type=str)
    parser.add_argument("--bs", default=1, type=int)
    parser.add_argument("--image_size", default=1024, type=int)
    parser.add_argument("--cfg_scale", default=5.0, type=float)
    parser.add_argument("--pag_scale", default=2.0, type=float)
    parser.add_argument("--seed", default=42, type=int)
    parser.add_argument("--step", default=-1, type=int)
    parser.add_argument("--custom_image_size", default=None, type=int)
    parser.add_argument("--share", action="store_true")
    parser.add_argument(
        "--shield_model_path",
        type=str,
        help="The path to shield model, we employ ShieldGemma-2B by default.",
        default="google/shieldgemma-2b",
    )

    return parser.parse_known_args()[0]


args = get_args()

if torch.cuda.is_available():
    model_path = args.model_path
    pipe = SanaPipeline(args.config)
    pipe.from_pretrained(model_path)
    pipe.register_progress_bar(gr.Progress())

    # safety checker
    safety_checker_tokenizer = AutoTokenizer.from_pretrained(args.shield_model_path)
    safety_checker_model = AutoModelForCausalLM.from_pretrained(
        args.shield_model_path,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    ).to(device)


def save_image_sana(img, seed="", save_img=False):
    unique_name = f"{str(uuid.uuid4())}_{seed}.png"
    save_path = os.path.join(f"output/online_demo_img/{datetime.now().date()}")
    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(save_path, exist_ok=True)
    unique_name = os.path.join(save_path, unique_name)
    if save_img:
        save_image(img, unique_name, nrow=1, normalize=True, value_range=(-1, 1))

    return unique_name


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


@torch.no_grad()
@torch.inference_mode()
@spaces.GPU(enable_queue=True)
def generate(
    prompt: str = None,
    negative_prompt: str = "",
    style: str = DEFAULT_STYLE_NAME,
    use_negative_prompt: bool = False,
    num_imgs: int = 1,
    seed: int = 0,
    height: int = 1024,
    width: int = 1024,
    flow_dpms_guidance_scale: float = 5.0,
    flow_dpms_pag_guidance_scale: float = 2.0,
    flow_dpms_inference_steps: int = 20,
    randomize_seed: bool = False,
):
    global INFER_SPEED
    # seed = 823753551
    box = run_inference(num_imgs)
    seed = int(randomize_seed_fn(seed, randomize_seed))
    generator = torch.Generator(device=device).manual_seed(seed)
    print(f"PORT: {DEMO_PORT}, model_path: {model_path}")
    if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt, threshold=0.2):
        prompt = "A red heart."

    print(prompt)

    num_inference_steps = flow_dpms_inference_steps
    guidance_scale = flow_dpms_guidance_scale
    pag_guidance_scale = flow_dpms_pag_guidance_scale

    if not use_negative_prompt:
        negative_prompt = None  # type: ignore
    prompt, negative_prompt = apply_style(style, prompt, negative_prompt)

    pipe.progress_fn(0, desc="Sana Start")

    time_start = time.time()
    images = pipe(
        prompt=prompt,
        height=height,
        width=width,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        pag_guidance_scale=pag_guidance_scale,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=num_imgs,
        generator=generator,
    )

    pipe.progress_fn(1.0, desc="Sana End")
    INFER_SPEED = (time.time() - time_start) / num_imgs

    save_img = False
    if save_img:
        img = [save_image_sana(img, seed, save_img=save_img) for img in images]
        print(img)
    else:
        img = [
            Image.fromarray(
                norm_ip(img, -1, 1)
                .mul(255)
                .add_(0.5)
                .clamp_(0, 255)
                .permute(1, 2, 0)
                .to("cpu", torch.uint8)
                .numpy()
                .astype(np.uint8)
            )
            for img in images
        ]

    torch.cuda.empty_cache()

    return (
        img,
        seed,
        f"<span style='font-size: 16px; font-weight: bold;'>Inference Speed: {INFER_SPEED:.3f} s/Img</span>",
        box,
    )


model_size = "1.6" if "1600M" in args.model_path else "0.6"
title = f"""
    <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
        <img src="https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/logo.png" width="50%" alt="logo"/>
    </div>
"""
DESCRIPTION = f"""
        <p><span style="font-size: 36px; font-weight: bold;">Sana-{model_size}B</span><span style="font-size: 20px; font-weight: bold;">{args.image_size}px</span></p>
        <p style="font-size: 16px; font-weight: bold;">Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer</p>
        <p><span style="font-size: 16px;"><a href="https://arxiv.org/abs/2410.10629">[Paper]</a></span> <span style="font-size: 16px;"><a href="https://github.com/NVlabs/Sana">[Github]</a></span> <span style="font-size: 16px;"><a href="https://nvlabs.github.io/Sana">[Project]</a></span></p>
        <p style="font-size: 16px; font-weight: bold;">Powered by <a href="https://hanlab.mit.edu/projects/dc-ae">DC-AE</a> with 32x latent space, </p>running on node {socket.gethostname()}.
        <p style="font-size: 16px; font-weight: bold;">Unsafe word will give you a 'Red Heart' in the image instead.</p>
"""
if model_size == "0.6":
    DESCRIPTION += "\n<p>0.6B model's text rendering ability is limited.</p>"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

examples = [
    'a cyberpunk cat with a neon sign that says "Sana"',
    "A very detailed and realistic full body photo set of a tall, slim, and athletic Shiba Inu in a white oversized straight t-shirt, white shorts, and short white shoes.",
    "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
    "portrait photo of a girl, photograph, highly detailed face, depth of field",
    'make me a logo that says "So Fast" with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language',
    "🐶 Wearing 🕶 flying on the 🌈",
    "👧 with 🌹 in the ❄️",
    "an old rusted robot wearing pants and a jacket riding skis in a supermarket.",
    "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
    "Astronaut in a jungle, cold color palette, muted colors, detailed",
    "a stunning and luxurious bedroom carved into a rocky mountainside seamlessly blending nature with modern design with a plush earth-toned bed textured stone walls circular fireplace massive uniquely shaped window framing snow-capped mountains dense forests",
]

css = """
.gradio-container{max-width: 640px !important}
h1{text-align:center}
"""
with gr.Blocks(css=css, title="Sana") as demo:
    gr.Markdown(title)
    gr.HTML(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    info_box = gr.Markdown(
        value=f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: 16px; color:red; font-weight: bold;'>{read_inference_count()}</span>"
    )
    demo.load(fn=update_inference_count, outputs=info_box)  # update the value when re-loading the page
    # with gr.Row(equal_height=False):
    with gr.Group():
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)
        result = gr.Gallery(label="Result", show_label=False, columns=NUM_IMAGES_PER_PROMPT, format="png")
        speed_box = gr.Markdown(
            value=f"<span style='font-size: 16px; font-weight: bold;'>Inference speed: {INFER_SPEED} s/Img</span>"
        )
        with gr.Accordion("Advanced options", open=False):
            with gr.Group():
                with gr.Row(visible=True):
                    height = gr.Slider(
                        label="Height",
                        minimum=256,
                        maximum=MAX_IMAGE_SIZE,
                        step=32,
                        value=args.image_size,
                    )
                    width = gr.Slider(
                        label="Width",
                        minimum=256,
                        maximum=MAX_IMAGE_SIZE,
                        step=32,
                        value=args.image_size,
                    )
                with gr.Row():
                    flow_dpms_inference_steps = gr.Slider(
                        label="Sampling steps",
                        minimum=5,
                        maximum=40,
                        step=1,
                        value=20,
                    )
                    flow_dpms_guidance_scale = gr.Slider(
                        label="CFG Guidance scale",
                        minimum=1,
                        maximum=10,
                        step=0.1,
                        value=4.5,
                    )
                    flow_dpms_pag_guidance_scale = gr.Slider(
                        label="PAG Guidance scale",
                        minimum=1,
                        maximum=4,
                        step=0.5,
                        value=1.0,
                    )
            with gr.Row():
                use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
                negative_prompt = gr.Text(
                    label="Negative prompt",
                    max_lines=1,
                    placeholder="Enter a negative prompt",
                    visible=True,
                )
            style_selection = gr.Radio(
                show_label=True,
                container=True,
                interactive=True,
                choices=STYLE_NAMES,
                value=DEFAULT_STYLE_NAME,
                label="Image Style",
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            with gr.Row(visible=True):
                schedule = gr.Radio(
                    show_label=True,
                    container=True,
                    interactive=True,
                    choices=SCHEDULE_NAME,
                    value=DEFAULT_SCHEDULE_NAME,
                    label="Sampler Schedule",
                    visible=True,
                )
                num_imgs = gr.Slider(
                    label="Num Images",
                    minimum=1,
                    maximum=6,
                    step=1,
                    value=1,
                )

    gr.Examples(
        examples=examples,
        inputs=prompt,
        outputs=[result, seed],
        fn=generate,
        cache_examples=CACHE_EXAMPLES,
    )

    use_negative_prompt.change(
        fn=lambda x: gr.update(visible=x),
        inputs=use_negative_prompt,
        outputs=negative_prompt,
        api_name=False,
    )

    gr.on(
        triggers=[
            prompt.submit,
            negative_prompt.submit,
            run_button.click,
        ],
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            style_selection,
            use_negative_prompt,
            num_imgs,
            seed,
            height,
            width,
            flow_dpms_guidance_scale,
            flow_dpms_pag_guidance_scale,
            flow_dpms_inference_steps,
            randomize_seed,
        ],
        outputs=[result, seed, speed_box, info_box],
        api_name="run",
    )

if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=DEMO_PORT, debug=False, share=args.share)
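Usage sketch (not part of the uploaded files): the same SanaPipeline call chain used by app_sana.py can be driven headlessly. The config path below is a placeholder and must point at a Sana config matching the checkpoint; the call arguments mirror those in the app.

    # Minimal headless sketch, assuming the repo's SanaPipeline and a valid config/checkpoint pair.
    import torch
    from app.sana_pipeline import SanaPipeline
    from torchvision.utils import save_image

    device = torch.device("cuda:0")
    pipe = SanaPipeline("configs/sana_config/1024ms/Sana_1600M_img1024.yaml")  # placeholder config path
    pipe.from_pretrained("hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth")

    images = pipe(
        prompt='a cyberpunk cat with a neon sign that says "Sana"',
        height=1024,
        width=1024,
        guidance_scale=5.0,
        pag_guidance_scale=2.0,
        num_inference_steps=20,
        generator=torch.Generator(device=device).manual_seed(42),
    )
    # The pipeline returns tensors in [-1, 1]; save_image re-normalizes them to [0, 1] on disk.
    save_image(images, "sana_sample.png", nrow=1, normalize=True, value_range=(-1, 1))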
apps/app_sana_4bit.py
ADDED
@@ -0,0 +1,409 @@
#!/usr/bin/env python
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import argparse
import os
import random
import time
import uuid
from datetime import datetime

import gradio as gr
import numpy as np
import spaces
import torch
from diffusers import SanaPipeline
from nunchaku.models.transformer_sana import NunchakuSanaTransformer2DModel
from torchvision.utils import save_image

MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
DEMO_PORT = int(os.getenv("DEMO_PORT", "15432"))
os.environ["GRADIO_EXAMPLES_CACHE"] = "./.gradio/cache"
COUNTER_DB = os.getenv("COUNTER_DB", ".count.db")
INFER_SPEED = 0

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

style_list = [
    {
        "name": "(No style)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, "
        "cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "Photographic",
        "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Anime",
        "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
        "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Manga",
        "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
    {
        "name": "Digital Art",
        "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "photo, photorealistic, realism, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, "
        "majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, "
        "glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, "
        "disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, "
        "detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, "
        "ultra detailed, intricate, professional",
        "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
    },
]

styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"
SCHEDULE_NAME = ["Flow_DPM_Solver"]
DEFAULT_SCHEDULE_NAME = "Flow_DPM_Solver"
NUM_IMAGES_PER_PROMPT = 1


def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    if not negative:
        negative = ""
    return p.replace("{prompt}", positive), n + negative


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path",
        nargs="?",
        default="Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers",
        type=str,
        help="Path to the model file (positional)",
    )
    parser.add_argument("--share", action="store_true")

    return parser.parse_known_args()[0]


args = get_args()

if torch.cuda.is_available():

    transformer = NunchakuSanaTransformer2DModel.from_pretrained("mit-han-lab/svdq-int4-sana-1600m")
    pipe = SanaPipeline.from_pretrained(
        "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers",
        transformer=transformer,
        variant="bf16",
        torch_dtype=torch.bfloat16,
    ).to(device)

    pipe.text_encoder.to(torch.bfloat16)
    pipe.vae.to(torch.bfloat16)


def save_image_sana(img, seed="", save_img=False):
    unique_name = f"{str(uuid.uuid4())}_{seed}.png"
    save_path = os.path.join(f"output/online_demo_img/{datetime.now().date()}")
    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(save_path, exist_ok=True)
    unique_name = os.path.join(save_path, unique_name)
    if save_img:
        save_image(img, unique_name, nrow=1, normalize=True, value_range=(-1, 1))

    return unique_name


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


@torch.no_grad()
@torch.inference_mode()
@spaces.GPU(enable_queue=True)
def generate(
    prompt: str = None,
    negative_prompt: str = "",
    style: str = DEFAULT_STYLE_NAME,
    use_negative_prompt: bool = False,
    num_imgs: int = 1,
    seed: int = 0,
    height: int = 1024,
    width: int = 1024,
    flow_dpms_guidance_scale: float = 5.0,
    flow_dpms_inference_steps: int = 20,
    randomize_seed: bool = False,
):
    global INFER_SPEED
    # seed = 823753551
    seed = int(randomize_seed_fn(seed, randomize_seed))
    generator = torch.Generator(device=device).manual_seed(seed)
    print(f"PORT: {DEMO_PORT}, model_path: {args.model_path}")

    print(prompt)

    num_inference_steps = flow_dpms_inference_steps
    guidance_scale = flow_dpms_guidance_scale

    if not use_negative_prompt:
        negative_prompt = None  # type: ignore
    prompt, negative_prompt = apply_style(style, prompt, negative_prompt)

    time_start = time.time()
    images = pipe(
        prompt=prompt,
        height=height,
        width=width,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=num_imgs,
        generator=generator,
    ).images
    INFER_SPEED = (time.time() - time_start) / num_imgs

    save_img = False
    if save_img:
        img = [save_image_sana(img, seed, save_img=save_img) for img in images]
        print(img)
    else:
        img = images

    torch.cuda.empty_cache()

    return (
        img,
        seed,
        f"<span style='font-size: 16px; font-weight: bold;'>Inference Speed: {INFER_SPEED:.3f} s/Img</span>",
    )


model_size = "1.6" if "1600M" in args.model_path else "0.6"
title = f"""
    <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
        <img src="https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/logo.png" width="30%" alt="logo"/>
    </div>
"""
DESCRIPTION = f"""
        <p style="font-size: 30px; font-weight: bold; text-align: center;">Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer (4bit version)</p>
"""
if model_size == "0.6":
    DESCRIPTION += "\n<p>0.6B model's text rendering ability is limited.</p>"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

examples = [
    'a cyberpunk cat with a neon sign that says "Sana"',
    "A very detailed and realistic full body photo set of a tall, slim, and athletic Shiba Inu in a white oversized straight t-shirt, white shorts, and short white shoes.",
    "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
    "portrait photo of a girl, photograph, highly detailed face, depth of field",
    'make me a logo that says "So Fast" with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language',
    "🐶 Wearing 🕶 flying on the 🌈",
    "👧 with 🌹 in the ❄️",
    "an old rusted robot wearing pants and a jacket riding skis in a supermarket.",
    "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
    "Astronaut in a jungle, cold color palette, muted colors, detailed",
    "a stunning and luxurious bedroom carved into a rocky mountainside seamlessly blending nature with modern design with a plush earth-toned bed textured stone walls circular fireplace massive uniquely shaped window framing snow-capped mountains dense forests",
]

css = """
.gradio-container {max-width: 850px !important; height: auto !important;}
h1 {text-align: center;}
"""
theme = gr.themes.Base()
with gr.Blocks(css=css, theme=theme, title="Sana") as demo:
    gr.Markdown(title)
    gr.HTML(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    # with gr.Row(equal_height=False):
    with gr.Group():
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)
        result = gr.Gallery(
            label="Result",
            show_label=False,
            height=750,
            columns=NUM_IMAGES_PER_PROMPT,
            format="jpeg",
        )

        speed_box = gr.Markdown(
            value=f"<span style='font-size: 16px; font-weight: bold;'>Inference speed: {INFER_SPEED} s/Img</span>"
        )
        with gr.Accordion("Advanced options", open=False):
            with gr.Group():
                with gr.Row(visible=True):
                    height = gr.Slider(
                        label="Height",
                        minimum=256,
                        maximum=MAX_IMAGE_SIZE,
                        step=32,
                        value=1024,
                    )
                    width = gr.Slider(
                        label="Width",
                        minimum=256,
                        maximum=MAX_IMAGE_SIZE,
                        step=32,
                        value=1024,
                    )
                with gr.Row():
                    flow_dpms_inference_steps = gr.Slider(
                        label="Sampling steps",
                        minimum=5,
                        maximum=40,
                        step=1,
                        value=20,
                    )
                    flow_dpms_guidance_scale = gr.Slider(
                        label="CFG Guidance scale",
                        minimum=1,
                        maximum=10,
                        step=0.1,
                        value=4.5,
                    )
                with gr.Row():
                    use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
                    negative_prompt = gr.Text(
                        label="Negative prompt",
                        max_lines=1,
                        placeholder="Enter a negative prompt",
                        visible=True,
                    )
            style_selection = gr.Radio(
                show_label=True,
                container=True,
                interactive=True,
                choices=STYLE_NAMES,
                value=DEFAULT_STYLE_NAME,
                label="Image Style",
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            with gr.Row(visible=True):
                schedule = gr.Radio(
                    show_label=True,
                    container=True,
                    interactive=True,
                    choices=SCHEDULE_NAME,
                    value=DEFAULT_SCHEDULE_NAME,
                    label="Sampler Schedule",
                    visible=True,
                )
                num_imgs = gr.Slider(
                    label="Num Images",
                    minimum=1,
                    maximum=6,
                    step=1,
                    value=1,
                )

    gr.Examples(
        examples=examples,
        inputs=prompt,
        outputs=[result, seed],
        fn=generate,
        cache_examples=CACHE_EXAMPLES,
    )

    use_negative_prompt.change(
        fn=lambda x: gr.update(visible=x),
        inputs=use_negative_prompt,
        outputs=negative_prompt,
        api_name=False,
    )

    gr.on(
        triggers=[
            prompt.submit,
            negative_prompt.submit,
            run_button.click,
        ],
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            style_selection,
            use_negative_prompt,
            num_imgs,
            seed,
            height,
            width,
            flow_dpms_guidance_scale,
            flow_dpms_inference_steps,
            randomize_seed,
        ],
        outputs=[result, seed, speed_box],
        api_name="run",
    )

if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=DEMO_PORT, debug=False, share=args.share)
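Usage sketch (not part of the uploaded files): the 4-bit path above boils down to loading the SVDQuant int4 transformer from Nunchaku and dropping it into the diffusers SanaPipeline, which is reproduced below with the same model IDs and call arguments as the app; a CUDA GPU is required.

    # Minimal sketch of the int4 + diffusers path used by app_sana_4bit.py.
    import torch
    from diffusers import SanaPipeline
    from nunchaku.models.transformer_sana import NunchakuSanaTransformer2DModel

    transformer = NunchakuSanaTransformer2DModel.from_pretrained("mit-han-lab/svdq-int4-sana-1600m")
    pipe = SanaPipeline.from_pretrained(
        "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers",
        transformer=transformer,
        variant="bf16",
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    pipe.text_encoder.to(torch.bfloat16)
    pipe.vae.to(torch.bfloat16)

    image = pipe(
        prompt="Astronaut in a jungle, cold color palette, muted colors, detailed",
        height=1024,
        width=1024,
        guidance_scale=4.5,
        num_inference_steps=20,
        generator=torch.Generator(device="cuda").manual_seed(0),
    ).images[0]
    image.save("sana_int4_sample.png")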
apps/app_sana_4bit_compare_bf16.py
ADDED
@@ -0,0 +1,313 @@
# Changed from https://huggingface.co/spaces/playgroundai/playground-v2.5/blob/main/app.py
import argparse
import os
import random
import time
from datetime import datetime

import GPUtil

# import gradio last to avoid conflicts with other imports
import gradio as gr
import safety_check
import spaces
import torch
from diffusers import SanaPipeline
from nunchaku.models.transformer_sana import NunchakuSanaTransformer2DModel
from transformers import AutoModelForCausalLM, AutoTokenizer

MAX_IMAGE_SIZE = 2048
MAX_SEED = 1000000000

DEFAULT_HEIGHT = 1024
DEFAULT_WIDTH = 1024

# prompt, height, width, num_inference_steps, guidance_scale, seed
EXAMPLES = [
    [
        "🐶 Wearing 🕶 flying on the 🌈",
        1024,
        1024,
        20,
        5,
        2,
    ],
    [
        "大漠孤烟直, 长河落日圆",
        1024,
        1024,
        20,
        5,
        23,
    ],
    [
        "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, "
        "volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, "
        "art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
        1024,
        1024,
        20,
        5,
        233,
    ],
    [
        "A photo of a Eurasian lynx in a sunlit forest, with tufted ears and a spotted coat. The lynx should be "
        "sharply focused, gazing into the distance, while the background is softly blurred for depth. Use cinematic "
        "lighting with soft rays filtering through the trees, and capture the scene with a shallow depth of field "
        "for a natural, peaceful atmosphere. 8K resolution, highly detailed, photorealistic, "
        "cinematic lighting, ultra-HD.",
        1024,
        1024,
        20,
        5,
        2333,
    ],
    [
        "A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. "
        "She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. "
        "She wears sunglasses and red lipstick. She walks confidently and casually. "
        "The street is damp and reflective, creating a mirror effect of the colorful lights. "
        "Many pedestrians walk about.",
        1024,
        1024,
        20,
        5,
        23333,
    ],
    [
        "Cozy bedroom with vintage wooden furniture and a large circular window covered in lush green vines, "
        "opening to a misty forest. Soft, ambient lighting highlights the bed with crumpled blankets, a bookshelf, "
        "and a desk. The atmosphere is serene and natural. 8K resolution, highly detailed, photorealistic, "
        "cinematic lighting, ultra-HD.",
        1024,
        1024,
        20,
        5,
        233333,
    ],
]


def hash_str_to_int(s: str) -> int:
    """Hash a string to an integer."""
    modulus = 10**9 + 7  # Large prime modulus
    hash_int = 0
    for char in s:
        hash_int = (hash_int * 31 + ord(char)) % modulus
    return hash_int


def get_pipeline(
    precision: str, use_qencoder: bool = False, device: str | torch.device = "cuda", pipeline_init_kwargs: dict = {}
) -> SanaPipeline:
    if precision == "int4":
        assert torch.device(device).type == "cuda", "int4 only supported on CUDA devices"
        transformer = NunchakuSanaTransformer2DModel.from_pretrained("mit-han-lab/svdq-int4-sana-1600m")

        pipeline_init_kwargs["transformer"] = transformer
        if use_qencoder:
            raise NotImplementedError("Quantized encoder not supported for Sana for now")
    else:
        assert precision == "bf16"
    pipeline = SanaPipeline.from_pretrained(
        "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers",
        variant="bf16",
        torch_dtype=torch.bfloat16,
        **pipeline_init_kwargs,
    )

    pipeline = pipeline.to(device)
    return pipeline


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p",
        "--precisions",
        type=str,
        default=["int4"],
        nargs="*",
        choices=["int4", "bf16"],
        help="Which precisions to use",
    )
    parser.add_argument("--use-qencoder", action="store_true", help="Whether to use 4-bit text encoder")
    parser.add_argument("--no-safety-checker", action="store_true", help="Disable safety checker")
    parser.add_argument("--count-use", action="store_true", help="Whether to count the number of uses")
    parser.add_argument(
        "--shield_model_path",
        type=str,
        help="The path to shield model, we employ ShieldGemma-2B by default.",
        default="google/shieldgemma-2b",
    )
    return parser.parse_args()


args = get_args()


pipelines = []
pipeline_init_kwargs = {}
for i, precision in enumerate(args.precisions):

    pipeline = get_pipeline(
        precision=precision,
        use_qencoder=args.use_qencoder,
        device="cuda",
        pipeline_init_kwargs={**pipeline_init_kwargs},
    )
    pipelines.append(pipeline)
    if i == 0:
        pipeline_init_kwargs["vae"] = pipeline.vae
        pipeline_init_kwargs["text_encoder"] = pipeline.text_encoder

# safety checker
safety_checker_tokenizer = AutoTokenizer.from_pretrained(args.shield_model_path)
safety_checker_model = AutoModelForCausalLM.from_pretrained(
    args.shield_model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).to(pipeline.device)


@spaces.GPU(enable_queue=True)
def generate(
    prompt: str = None,
    height: int = 1024,
    width: int = 1024,
    num_inference_steps: int = 4,
    guidance_scale: float = 0,
    seed: int = 0,
):
    print(f"Prompt: {prompt}")
    is_unsafe_prompt = False
    if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt, threshold=0.2):
        is_unsafe_prompt = True
        prompt = "A peaceful world."
    images, latency_strs = [], []
    for i, pipeline in enumerate(pipelines):
        progress = gr.Progress(track_tqdm=True)
        start_time = time.time()
        image = pipeline(
            prompt=prompt,
            height=height,
            width=width,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            generator=torch.Generator().manual_seed(seed),
        ).images[0]
        end_time = time.time()
        latency = end_time - start_time
        if latency < 1:
            latency = latency * 1000
            latency_str = f"{latency:.2f}ms"
        else:
            latency_str = f"{latency:.2f}s"
        images.append(image)
        latency_strs.append(latency_str)
    if is_unsafe_prompt:
        for i in range(len(latency_strs)):
            latency_strs[i] += " (Unsafe prompt detected)"
    torch.cuda.empty_cache()

    if args.count_use:
        if os.path.exists("use_count.txt"):
            with open("use_count.txt") as f:
                count = int(f.read())
        else:
            count = 0
        count += 1
        current_time = datetime.now()
        print(f"{current_time}: {count}")
        with open("use_count.txt", "w") as f:
            f.write(str(count))
        with open("use_record.txt", "a") as f:
            f.write(f"{current_time}: {count}\n")

    return *images, *latency_strs


with open("./assets/description.html") as f:
    DESCRIPTION = f.read()
gpus = GPUtil.getGPUs()
if len(gpus) > 0:
    gpu = gpus[0]
    memory = gpu.memoryTotal / 1024
    device_info = f"Running on {gpu.name} with {memory:.0f} GiB memory."
else:
    device_info = "Running on CPU 🥶 This demo does not work on CPU."
notice = '<strong>Notice:</strong> We will replace unsafe prompts with a default prompt: "A peaceful world."'

with gr.Blocks(
    css_paths=[f"assets/frame{len(args.precisions)}.css", "assets/common.css"],
    title="SVDQuant SANA-1600M Demo",
) as demo:

    def get_header_str():

        if args.count_use:
            if os.path.exists("use_count.txt"):
                with open("use_count.txt") as f:
                    count = int(f.read())
            else:
                count = 0
            count_info = (
                f"<div style='display: flex; justify-content: center; align-items: center; text-align: center;'>"
                f"<span style='font-size: 18px; font-weight: bold;'>Total inference runs: </span>"
                f"<span style='font-size: 18px; color:red; font-weight: bold;'> {count}</span></div>"
            )
        else:
            count_info = ""
        header_str = DESCRIPTION.format(device_info=device_info, notice=notice, count_info=count_info)
        return header_str

    header = gr.HTML(get_header_str())
    demo.load(fn=get_header_str, outputs=header)

    with gr.Row():
        image_results, latency_results = [], []
        for i, precision in enumerate(args.precisions):
            with gr.Column():
                gr.Markdown(f"# {precision.upper()}", elem_id="image_header")
                with gr.Group():
                    image_result = gr.Image(
                        format="png",
                        image_mode="RGB",
                        label="Result",
                        show_label=False,
                        show_download_button=True,
                        interactive=False,
                    )
                    latency_result = gr.Text(label="Inference Latency", show_label=True)
                image_results.append(image_result)
                latency_results.append(latency_result)
    with gr.Row():
        prompt = gr.Text(
            label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False, scale=4
        )
        run_button = gr.Button("Run", scale=1)

    with gr.Row():
        seed = gr.Slider(label="Seed", show_label=True, minimum=0, maximum=MAX_SEED, value=233, step=1, scale=4)
        randomize_seed = gr.Button("Random Seed", scale=1, min_width=50, elem_id="random_seed")
    with gr.Accordion("Advanced options", open=False):
        with gr.Group():
            height = gr.Slider(label="Height", minimum=256, maximum=4096, step=32, value=1024)
            width = gr.Slider(label="Width", minimum=256, maximum=4096, step=32, value=1024)
        with gr.Group():
            num_inference_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, step=1, value=20)
            guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, step=0.1, value=5)

    input_args = [prompt, height, width, num_inference_steps, guidance_scale, seed]

    gr.Examples(examples=EXAMPLES, inputs=input_args, outputs=[*image_results, *latency_results], fn=generate)

    gr.on(
        triggers=[prompt.submit, run_button.click],
        fn=generate,
        inputs=input_args,
        outputs=[*image_results, *latency_results],
        api_name="run",
    )
    randomize_seed.click(
        lambda: random.randint(0, MAX_SEED), inputs=[], outputs=seed, api_name=False, queue=False
    ).then(fn=generate, inputs=input_args, outputs=[*image_results, *latency_results], api_name=False, queue=False)

    gr.Markdown("MIT Accessibility: https://accessibility.mit.edu/", elem_id="accessibility")


if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", debug=True, share=True)
apps/app_sana_controlnet_hed.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changed from https://github.com/GaParmar/img2img-turbo/blob/main/gradio_sketch2image.py
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import random
|
| 5 |
+
import socket
|
| 6 |
+
import tempfile
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 14 |
+
|
| 15 |
+
from app import safety_check
|
| 16 |
+
from app.sana_controlnet_pipeline import SanaControlNetPipeline
|
| 17 |
+
|
| 18 |
+
STYLES = {
|
| 19 |
+
"None": "{prompt}",
|
| 20 |
+
"Cinematic": "cinematic still {prompt}. emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
|
| 21 |
+
"3D Model": "professional 3d model {prompt}. octane render, highly detailed, volumetric, dramatic lighting",
|
| 22 |
+
"Anime": "anime artwork {prompt}. anime style, key visual, vibrant, studio anime, highly detailed",
|
| 23 |
+
"Digital Art": "concept art {prompt}. digital artwork, illustrative, painterly, matte painting, highly detailed",
|
| 24 |
+
"Photographic": "cinematic photo {prompt}. 35mm photograph, film, bokeh, professional, 4k, highly detailed",
|
| 25 |
+
"Pixel art": "pixel-art {prompt}. low-res, blocky, pixel art style, 8-bit graphics",
|
| 26 |
+
"Fantasy art": "ethereal fantasy concept art of {prompt}. magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
|
| 27 |
+
"Neonpunk": "neonpunk style {prompt}. cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
|
| 28 |
+
"Manga": "manga style {prompt}. vibrant, high-energy, detailed, iconic, Japanese comic style",
|
| 29 |
+
}
|
| 30 |
+
DEFAULT_STYLE_NAME = "None"
|
| 31 |
+
STYLE_NAMES = list(STYLES.keys())
|
| 32 |
+
|
| 33 |
+
MAX_SEED = 1000000000
|
| 34 |
+
DEFAULT_SKETCH_GUIDANCE = 0.28
|
| 35 |
+
DEMO_PORT = int(os.getenv("DEMO_PORT", "15432"))
|
| 36 |
+
|
| 37 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 38 |
+
|
| 39 |
+
blank_image = Image.new("RGB", (1024, 1024), (255, 255, 255))
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_args():
|
| 43 |
+
parser = argparse.ArgumentParser()
|
| 44 |
+
parser.add_argument("--config", type=str, help="config")
|
| 45 |
+
parser.add_argument(
|
| 46 |
+
"--model_path",
|
| 47 |
+
nargs="?",
|
| 48 |
+
default="hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth",
|
| 49 |
+
type=str,
|
| 50 |
+
help="Path to the model file (positional)",
|
| 51 |
+
)
|
| 52 |
+
parser.add_argument("--output", default="./", type=str)
|
| 53 |
+
parser.add_argument("--bs", default=1, type=int)
|
| 54 |
+
parser.add_argument("--image_size", default=1024, type=int)
|
| 55 |
+
parser.add_argument("--cfg_scale", default=5.0, type=float)
|
| 56 |
+
parser.add_argument("--pag_scale", default=2.0, type=float)
|
| 57 |
+
parser.add_argument("--seed", default=42, type=int)
|
| 58 |
+
parser.add_argument("--step", default=-1, type=int)
|
| 59 |
+
parser.add_argument("--custom_image_size", default=None, type=int)
|
| 60 |
+
parser.add_argument("--share", action="store_true")
|
| 61 |
+
parser.add_argument(
|
| 62 |
+
"--shield_model_path",
|
| 63 |
+
type=str,
|
| 64 |
+
help="The path to shield model, we employ ShieldGemma-2B by default.",
|
| 65 |
+
default="google/shieldgemma-2b",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
return parser.parse_known_args()[0]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
args = get_args()
|
| 72 |
+
|
| 73 |
+
if torch.cuda.is_available():
|
| 74 |
+
model_path = args.model_path
|
| 75 |
+
pipe = SanaControlNetPipeline(args.config)
|
| 76 |
+
pipe.from_pretrained(model_path)
|
| 77 |
+
pipe.register_progress_bar(gr.Progress())
|
| 78 |
+
|
| 79 |
+
# safety checker
|
| 80 |
+
safety_checker_tokenizer = AutoTokenizer.from_pretrained(args.shield_model_path)
|
| 81 |
+
safety_checker_model = AutoModelForCausalLM.from_pretrained(
|
| 82 |
+
args.shield_model_path,
|
| 83 |
+
device_map="auto",
|
| 84 |
+
torch_dtype=torch.bfloat16,
|
| 85 |
+
).to(device)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def save_image(img):
|
| 89 |
+
if isinstance(img, dict):
|
| 90 |
+
img = img["composite"]
|
| 91 |
+
temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
| 92 |
+
img.save(temp_file.name)
|
| 93 |
+
return temp_file.name
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def norm_ip(img, low, high):
|
| 97 |
+
img.clamp_(min=low, max=high)
|
| 98 |
+
img.sub_(low).div_(max(high - low, 1e-5))
|
| 99 |
+
return img
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@torch.no_grad()
|
| 103 |
+
@torch.inference_mode()
|
| 104 |
+
def run(
|
| 105 |
+
image,
|
| 106 |
+
prompt: str,
|
| 107 |
+
prompt_template: str,
|
| 108 |
+
sketch_thickness: int,
|
| 109 |
+
guidance_scale: float,
|
| 110 |
+
inference_steps: int,
|
| 111 |
+
seed: int,
|
| 112 |
+
blend_alpha: float,
|
| 113 |
+
) -> tuple[Image, str]:
|
| 114 |
+
|
| 115 |
+
print(f"Prompt: {prompt}")
|
| 116 |
+
image_numpy = np.array(image["composite"].convert("RGB"))
|
| 117 |
+
|
| 118 |
+
if prompt.strip() == "" and (np.sum(image_numpy == 255) >= 3145628 or np.sum(image_numpy == 0) >= 3145628):
|
| 119 |
+
return blank_image, "Please input the prompt or draw something."
|
| 120 |
+
|
| 121 |
+
if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt, threshold=0.2):
|
| 122 |
+
prompt = "A red heart."
|
| 123 |
+
|
| 124 |
+
prompt = prompt_template.format(prompt=prompt)
|
| 125 |
+
pipe.set_blend_alpha(blend_alpha)
|
| 126 |
+
start_time = time.time()
|
| 127 |
+
images = pipe(
|
| 128 |
+
prompt=prompt,
|
| 129 |
+
ref_image=image["composite"],
|
| 130 |
+
guidance_scale=guidance_scale,
|
| 131 |
+
num_inference_steps=inference_steps,
|
| 132 |
+
num_images_per_prompt=1,
|
| 133 |
+
sketch_thickness=sketch_thickness,
|
| 134 |
+
generator=torch.Generator(device=device).manual_seed(seed),
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
latency = time.time() - start_time
|
| 138 |
+
|
| 139 |
+
if latency < 1:
|
| 140 |
+
latency = latency * 1000
|
| 141 |
+
latency_str = f"{latency:.2f}ms"
|
| 142 |
+
else:
|
| 143 |
+
latency_str = f"{latency:.2f}s"
|
| 144 |
+
torch.cuda.empty_cache()
|
| 145 |
+
|
| 146 |
+
img = [
|
| 147 |
+
Image.fromarray(
|
| 148 |
+
norm_ip(img, -1, 1)
|
| 149 |
+
.mul(255)
|
| 150 |
+
.add_(0.5)
|
| 151 |
+
.clamp_(0, 255)
|
| 152 |
+
.permute(1, 2, 0)
|
| 153 |
+
.to("cpu", torch.uint8)
|
| 154 |
+
.numpy()
|
| 155 |
+
.astype(np.uint8)
|
| 156 |
+
)
|
| 157 |
+
for img in images
|
| 158 |
+
]
|
| 159 |
+
img = img[0]
|
| 160 |
+
return img, latency_str
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
model_size = "1.6" if "1600M" in args.model_path else "0.6"
|
| 164 |
+
title = f"""
|
| 165 |
+
<div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
|
| 166 |
+
<img src="https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/logo.png" width="50%" alt="logo"/>
|
| 167 |
+
</div>
|
| 168 |
+
"""
|
| 169 |
+
DESCRIPTION = f"""
|
| 170 |
+
<p><span style="font-size: 36px; font-weight: bold;">Sana-ControlNet-{model_size}B</span><span style="font-size: 20px; font-weight: bold;">{args.image_size}px</span></p>
|
| 171 |
+
<p style="font-size: 18px; font-weight: bold;">Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer</p>
|
| 172 |
+
<p><span style="font-size: 16px;"><a href="https://arxiv.org/abs/2410.10629">[Paper]</a></span> <span style="font-size: 16px;"><a href="https://github.com/NVlabs/Sana">[Github]</a></span> <span style="font-size: 16px;"><a href="https://nvlabs.github.io/Sana">[Project]</a></span</p>
|
| 173 |
+
<p style="font-size: 18px; font-weight: bold;">Powered by <a href="https://hanlab.mit.edu/projects/dc-ae">DC-AE</a> with 32x latent space, </p>running on node {socket.gethostname()}.
|
| 174 |
+
<p style="font-size: 16px; font-weight: bold;">Unsafe word will give you a 'Red Heart' in the image instead.</p>
|
| 175 |
+
"""
|
| 176 |
+
if model_size == "0.6":
|
| 177 |
+
DESCRIPTION += "\n<p>0.6B model's text rendering ability is limited.</p>"
|
| 178 |
+
if not torch.cuda.is_available():
|
| 179 |
+
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
with gr.Blocks(css_paths="asset/app_styles/controlnet_app_style.css", title=f"Sana Sketch-to-Image Demo") as demo:
|
| 183 |
+
gr.Markdown(title)
|
| 184 |
+
gr.HTML(DESCRIPTION)
|
| 185 |
+
|
| 186 |
+
with gr.Row(elem_id="main_row"):
|
| 187 |
+
with gr.Column(elem_id="column_input"):
|
| 188 |
+
gr.Markdown("## INPUT", elem_id="input_header")
|
| 189 |
+
with gr.Group():
|
| 190 |
+
canvas = gr.Sketchpad(
|
| 191 |
+
value=blank_image,
|
| 192 |
+
height=640,
|
| 193 |
+
image_mode="RGB",
|
| 194 |
+
sources=["upload", "clipboard"],
|
| 195 |
+
type="pil",
|
| 196 |
+
label="Sketch",
|
| 197 |
+
show_label=False,
|
| 198 |
+
show_download_button=True,
|
| 199 |
+
interactive=True,
|
| 200 |
+
transforms=[],
|
| 201 |
+
canvas_size=(1024, 1024),
|
| 202 |
+
scale=1,
|
| 203 |
+
brush=gr.Brush(default_size=3, colors=["#000000"], color_mode="fixed"),
|
| 204 |
+
format="png",
|
| 205 |
+
layers=False,
|
| 206 |
+
)
|
| 207 |
+
with gr.Row():
|
| 208 |
+
prompt = gr.Text(label="Prompt", placeholder="Enter your prompt", scale=6)
|
| 209 |
+
run_button = gr.Button("Run", scale=1, elem_id="run_button")
|
| 210 |
+
download_sketch = gr.DownloadButton("Download Sketch", scale=1, elem_id="download_sketch")
|
| 211 |
+
with gr.Row():
|
| 212 |
+
style = gr.Dropdown(label="Style", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, scale=1)
|
| 213 |
+
prompt_template = gr.Textbox(
|
| 214 |
+
label="Prompt Style Template", value=STYLES[DEFAULT_STYLE_NAME], scale=2, max_lines=1
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
with gr.Row():
|
| 218 |
+
sketch_thickness = gr.Slider(
|
| 219 |
+
label="Sketch Thickness",
|
| 220 |
+
minimum=1,
|
| 221 |
+
maximum=4,
|
| 222 |
+
step=1,
|
| 223 |
+
value=2,
|
| 224 |
+
)
|
| 225 |
+
with gr.Row():
|
| 226 |
+
inference_steps = gr.Slider(
|
| 227 |
+
label="Sampling steps",
|
| 228 |
+
minimum=5,
|
| 229 |
+
maximum=40,
|
| 230 |
+
step=1,
|
| 231 |
+
value=20,
|
| 232 |
+
)
|
| 233 |
+
guidance_scale = gr.Slider(
|
| 234 |
+
label="CFG Guidance scale",
|
| 235 |
+
minimum=1,
|
| 236 |
+
maximum=10,
|
| 237 |
+
step=0.1,
|
| 238 |
+
value=4.5,
|
| 239 |
+
)
|
| 240 |
+
blend_alpha = gr.Slider(
|
| 241 |
+
label="Blend Alpha",
|
| 242 |
+
minimum=0,
|
| 243 |
+
maximum=1,
|
| 244 |
+
step=0.1,
|
| 245 |
+
value=0,
|
| 246 |
+
)
|
| 247 |
+
with gr.Row():
|
| 248 |
+
seed = gr.Slider(label="Seed", show_label=True, minimum=0, maximum=MAX_SEED, value=233, step=1, scale=4)
|
| 249 |
+
randomize_seed = gr.Button("Random Seed", scale=1, min_width=50, elem_id="random_seed")
|
| 250 |
+
|
| 251 |
+
with gr.Column(elem_id="column_output"):
|
| 252 |
+
gr.Markdown("## OUTPUT", elem_id="output_header")
|
| 253 |
+
with gr.Group():
|
| 254 |
+
result = gr.Image(
|
| 255 |
+
format="png",
|
| 256 |
+
height=640,
|
| 257 |
+
image_mode="RGB",
|
| 258 |
+
type="pil",
|
| 259 |
+
label="Result",
|
| 260 |
+
show_label=False,
|
| 261 |
+
show_download_button=True,
|
| 262 |
+
interactive=False,
|
| 263 |
+
elem_id="output_image",
|
| 264 |
+
)
|
| 265 |
+
latency_result = gr.Text(label="Inference Latency", show_label=True)
|
| 266 |
+
|
| 267 |
+
download_result = gr.DownloadButton("Download Result", elem_id="download_result")
|
| 268 |
+
gr.Markdown("### Instructions")
|
| 269 |
+
gr.Markdown("**1**. Enter a text prompt (e.g. a cat)")
|
| 270 |
+
gr.Markdown("**2**. Start sketching or upload a reference image")
|
| 271 |
+
gr.Markdown("**3**. Change the image style using a style template")
|
| 272 |
+
gr.Markdown("**4**. Try different seeds to generate different results")
|
| 273 |
+
|
| 274 |
+
run_inputs = [canvas, prompt, prompt_template, sketch_thickness, guidance_scale, inference_steps, seed, blend_alpha]
|
| 275 |
+
run_outputs = [result, latency_result]
|
| 276 |
+
|
| 277 |
+
randomize_seed.click(
|
| 278 |
+
lambda: random.randint(0, MAX_SEED),
|
| 279 |
+
inputs=[],
|
| 280 |
+
outputs=seed,
|
| 281 |
+
api_name=False,
|
| 282 |
+
queue=False,
|
| 283 |
+
).then(run, inputs=run_inputs, outputs=run_outputs, api_name=False)
|
| 284 |
+
|
| 285 |
+
style.change(
|
| 286 |
+
lambda x: STYLES[x],
|
| 287 |
+
inputs=[style],
|
| 288 |
+
outputs=[prompt_template],
|
| 289 |
+
api_name=False,
|
| 290 |
+
queue=False,
|
| 291 |
+
).then(fn=run, inputs=run_inputs, outputs=run_outputs, api_name=False)
|
| 292 |
+
gr.on(
|
| 293 |
+
triggers=[prompt.submit, run_button.click, canvas.change],
|
| 294 |
+
fn=run,
|
| 295 |
+
inputs=run_inputs,
|
| 296 |
+
outputs=run_outputs,
|
| 297 |
+
api_name=False,
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
download_sketch.click(fn=save_image, inputs=canvas, outputs=download_sketch)
|
| 301 |
+
download_result.click(fn=save_image, inputs=result, outputs=download_result)
|
| 302 |
+
gr.Markdown("MIT Accessibility: https://accessibility.mit.edu/", elem_id="accessibility")
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
if __name__ == "__main__":
|
| 306 |
+
demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=DEMO_PORT, debug=False, share=args.share)
|
apps/app_sana_multithread.py
ADDED
|
@@ -0,0 +1,565 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
#
|
| 16 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import os
|
| 21 |
+
import random
|
| 22 |
+
import uuid
|
| 23 |
+
from datetime import datetime
|
| 24 |
+
|
| 25 |
+
import gradio as gr
|
| 26 |
+
import numpy as np
|
| 27 |
+
import spaces
|
| 28 |
+
import torch
|
| 29 |
+
from diffusers import FluxPipeline
|
| 30 |
+
from PIL import Image
|
| 31 |
+
from torchvision.utils import make_grid, save_image
|
| 32 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 33 |
+
|
| 34 |
+
from app import safety_check
|
| 35 |
+
from app.sana_pipeline import SanaPipeline
|
| 36 |
+
|
| 37 |
+
MAX_SEED = np.iinfo(np.int32).max
|
| 38 |
+
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
|
| 39 |
+
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
|
| 40 |
+
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
|
| 41 |
+
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
|
| 42 |
+
DEMO_PORT = int(os.getenv("DEMO_PORT", "15432"))
|
| 43 |
+
os.environ["GRADIO_EXAMPLES_CACHE"] = "./.gradio/cache"
|
| 44 |
+
|
| 45 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 46 |
+
|
| 47 |
+
style_list = [
|
| 48 |
+
{
|
| 49 |
+
"name": "(No style)",
|
| 50 |
+
"prompt": "{prompt}",
|
| 51 |
+
"negative_prompt": "",
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"name": "Cinematic",
|
| 55 |
+
"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, "
|
| 56 |
+
"cinemascope, moody, epic, gorgeous, film grain, grainy",
|
| 57 |
+
"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "Photographic",
|
| 61 |
+
"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
|
| 62 |
+
"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"name": "Anime",
|
| 66 |
+
"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
|
| 67 |
+
"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "Manga",
|
| 71 |
+
"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
|
| 72 |
+
"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "Digital Art",
|
| 76 |
+
"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
|
| 77 |
+
"negative_prompt": "photo, photorealistic, realism, ugly",
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"name": "Pixel art",
|
| 81 |
+
"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
|
| 82 |
+
"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"name": "Fantasy art",
|
| 86 |
+
"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, "
|
| 87 |
+
"majestic, magical, fantasy art, cover art, dreamy",
|
| 88 |
+
"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, "
|
| 89 |
+
"glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, "
|
| 90 |
+
"disfigured, sloppy, duplicate, mutated, black and white",
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "Neonpunk",
|
| 94 |
+
"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, "
|
| 95 |
+
"detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, "
|
| 96 |
+
"ultra detailed, intricate, professional",
|
| 97 |
+
"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"name": "3D Model",
|
| 101 |
+
"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
|
| 102 |
+
"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
|
| 103 |
+
},
|
| 104 |
+
]
|
| 105 |
+
|
| 106 |
+
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
|
| 107 |
+
STYLE_NAMES = list(styles.keys())
|
| 108 |
+
DEFAULT_STYLE_NAME = "(No style)"
|
| 109 |
+
SCHEDULE_NAME = ["Flow_DPM_Solver"]
|
| 110 |
+
DEFAULT_SCHEDULE_NAME = "Flow_DPM_Solver"
|
| 111 |
+
NUM_IMAGES_PER_PROMPT = 1
|
| 112 |
+
TEST_TIMES = 0
|
| 113 |
+
FILENAME = f"output/port{DEMO_PORT}_inference_count.txt"
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def set_env(seed=0):
|
| 117 |
+
torch.manual_seed(seed)
|
| 118 |
+
torch.set_grad_enabled(False)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def read_inference_count():
|
| 122 |
+
global TEST_TIMES
|
| 123 |
+
try:
|
| 124 |
+
with open(FILENAME) as f:
|
| 125 |
+
count = int(f.read().strip())
|
| 126 |
+
except FileNotFoundError:
|
| 127 |
+
count = 0
|
| 128 |
+
TEST_TIMES = count
|
| 129 |
+
|
| 130 |
+
return count
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def write_inference_count(count):
|
| 134 |
+
with open(FILENAME, "w") as f:
|
| 135 |
+
f.write(str(count))
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def run_inference(num_imgs=1):
|
| 139 |
+
TEST_TIMES = read_inference_count()
|
| 140 |
+
TEST_TIMES += int(num_imgs)
|
| 141 |
+
write_inference_count(TEST_TIMES)
|
| 142 |
+
|
| 143 |
+
return (
|
| 144 |
+
f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
|
| 145 |
+
f"16px; color:red; font-weight: bold;'>{TEST_TIMES}</span>"
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def update_inference_count():
|
| 150 |
+
count = read_inference_count()
|
| 151 |
+
return (
|
| 152 |
+
f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
|
| 153 |
+
f"16px; color:red; font-weight: bold;'>{count}</span>"
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
|
| 158 |
+
p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
|
| 159 |
+
if not negative:
|
| 160 |
+
negative = ""
|
| 161 |
+
return p.replace("{prompt}", positive), n + negative
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def get_args():
|
| 165 |
+
parser = argparse.ArgumentParser()
|
| 166 |
+
parser.add_argument("--config", type=str, help="config")
|
| 167 |
+
parser.add_argument(
|
| 168 |
+
"--model_path",
|
| 169 |
+
nargs="?",
|
| 170 |
+
default="output/Sana_D20/SANA.pth",
|
| 171 |
+
type=str,
|
| 172 |
+
help="Path to the model file (positional)",
|
| 173 |
+
)
|
| 174 |
+
parser.add_argument("--output", default="./", type=str)
|
| 175 |
+
parser.add_argument("--bs", default=1, type=int)
|
| 176 |
+
parser.add_argument("--image_size", default=1024, type=int)
|
| 177 |
+
parser.add_argument("--cfg_scale", default=5.0, type=float)
|
| 178 |
+
parser.add_argument("--pag_scale", default=2.0, type=float)
|
| 179 |
+
parser.add_argument("--seed", default=42, type=int)
|
| 180 |
+
parser.add_argument("--step", default=-1, type=int)
|
| 181 |
+
parser.add_argument("--custom_image_size", default=None, type=int)
|
| 182 |
+
parser.add_argument(
|
| 183 |
+
"--shield_model_path",
|
| 184 |
+
type=str,
|
| 185 |
+
help="The path to shield model, we employ ShieldGemma-2B by default.",
|
| 186 |
+
default="google/shieldgemma-2b",
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
return parser.parse_args()
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
args = get_args()
|
| 193 |
+
|
| 194 |
+
if torch.cuda.is_available():
|
| 195 |
+
weight_dtype = torch.float16
|
| 196 |
+
model_path = args.model_path
|
| 197 |
+
pipe = SanaPipeline(args.config)
|
| 198 |
+
pipe.from_pretrained(model_path)
|
| 199 |
+
pipe.register_progress_bar(gr.Progress())
|
| 200 |
+
|
| 201 |
+
repo_name = "black-forest-labs/FLUX.1-dev"
|
| 202 |
+
pipe2 = FluxPipeline.from_pretrained(repo_name, torch_dtype=torch.float16).to("cuda")
|
| 203 |
+
|
| 204 |
+
# safety checker
|
| 205 |
+
safety_checker_tokenizer = AutoTokenizer.from_pretrained(args.shield_model_path)
|
| 206 |
+
safety_checker_model = AutoModelForCausalLM.from_pretrained(
|
| 207 |
+
args.shield_model_path,
|
| 208 |
+
device_map="auto",
|
| 209 |
+
torch_dtype=torch.bfloat16,
|
| 210 |
+
).to(device)
|
| 211 |
+
|
| 212 |
+
set_env(42)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def save_image_sana(img, seed="", save_img=False):
|
| 216 |
+
unique_name = f"{str(uuid.uuid4())}_{seed}.png"
|
| 217 |
+
save_path = os.path.join(f"output/online_demo_img/{datetime.now().date()}")
|
| 218 |
+
os.umask(0o000) # file permission: 666; dir permission: 777
|
| 219 |
+
os.makedirs(save_path, exist_ok=True)
|
| 220 |
+
unique_name = os.path.join(save_path, unique_name)
|
| 221 |
+
if save_img:
|
| 222 |
+
save_image(img, unique_name, nrow=1, normalize=True, value_range=(-1, 1))
|
| 223 |
+
|
| 224 |
+
return unique_name
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
|
| 228 |
+
if randomize_seed:
|
| 229 |
+
seed = random.randint(0, MAX_SEED)
|
| 230 |
+
return seed
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
@spaces.GPU(enable_queue=True)
|
| 234 |
+
async def generate_2(
|
| 235 |
+
prompt: str = None,
|
| 236 |
+
negative_prompt: str = "",
|
| 237 |
+
style: str = DEFAULT_STYLE_NAME,
|
| 238 |
+
use_negative_prompt: bool = False,
|
| 239 |
+
num_imgs: int = 1,
|
| 240 |
+
seed: int = 0,
|
| 241 |
+
height: int = 1024,
|
| 242 |
+
width: int = 1024,
|
| 243 |
+
flow_dpms_guidance_scale: float = 5.0,
|
| 244 |
+
flow_dpms_pag_guidance_scale: float = 2.0,
|
| 245 |
+
flow_dpms_inference_steps: int = 20,
|
| 246 |
+
randomize_seed: bool = False,
|
| 247 |
+
):
|
| 248 |
+
seed = int(randomize_seed_fn(seed, randomize_seed))
|
| 249 |
+
generator = torch.Generator(device=device).manual_seed(seed)
|
| 250 |
+
print(f"PORT: {DEMO_PORT}, model_path: {model_path}")
|
| 251 |
+
if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt):
|
| 252 |
+
prompt = "A red heart."
|
| 253 |
+
|
| 254 |
+
print(prompt)
|
| 255 |
+
|
| 256 |
+
if not use_negative_prompt:
|
| 257 |
+
negative_prompt = None # type: ignore
|
| 258 |
+
prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
|
| 259 |
+
|
| 260 |
+
with torch.no_grad():
|
| 261 |
+
images = pipe2(
|
| 262 |
+
prompt=prompt,
|
| 263 |
+
height=height,
|
| 264 |
+
width=width,
|
| 265 |
+
guidance_scale=3.5,
|
| 266 |
+
num_inference_steps=50,
|
| 267 |
+
num_images_per_prompt=num_imgs,
|
| 268 |
+
max_sequence_length=256,
|
| 269 |
+
generator=generator,
|
| 270 |
+
).images
|
| 271 |
+
|
| 272 |
+
save_img = False
|
| 273 |
+
img = images
|
| 274 |
+
if save_img:
|
| 275 |
+
img = [save_image_sana(img, seed, save_img=save_image) for img in images]
|
| 276 |
+
print(img)
|
| 277 |
+
torch.cuda.empty_cache()
|
| 278 |
+
|
| 279 |
+
return img
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
@spaces.GPU(enable_queue=True)
|
| 283 |
+
async def generate(
|
| 284 |
+
prompt: str = None,
|
| 285 |
+
negative_prompt: str = "",
|
| 286 |
+
style: str = DEFAULT_STYLE_NAME,
|
| 287 |
+
use_negative_prompt: bool = False,
|
| 288 |
+
num_imgs: int = 1,
|
| 289 |
+
seed: int = 0,
|
| 290 |
+
height: int = 1024,
|
| 291 |
+
width: int = 1024,
|
| 292 |
+
flow_dpms_guidance_scale: float = 5.0,
|
| 293 |
+
flow_dpms_pag_guidance_scale: float = 2.0,
|
| 294 |
+
flow_dpms_inference_steps: int = 20,
|
| 295 |
+
randomize_seed: bool = False,
|
| 296 |
+
):
|
| 297 |
+
global TEST_TIMES
|
| 298 |
+
# seed = 823753551
|
| 299 |
+
seed = int(randomize_seed_fn(seed, randomize_seed))
|
| 300 |
+
generator = torch.Generator(device=device).manual_seed(seed)
|
| 301 |
+
print(f"PORT: {DEMO_PORT}, model_path: {model_path}, time_times: {TEST_TIMES}")
|
| 302 |
+
if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt):
|
| 303 |
+
prompt = "A red heart."
|
| 304 |
+
|
| 305 |
+
print(prompt)
|
| 306 |
+
|
| 307 |
+
num_inference_steps = flow_dpms_inference_steps
|
| 308 |
+
guidance_scale = flow_dpms_guidance_scale
|
| 309 |
+
pag_guidance_scale = flow_dpms_pag_guidance_scale
|
| 310 |
+
|
| 311 |
+
if not use_negative_prompt:
|
| 312 |
+
negative_prompt = None # type: ignore
|
| 313 |
+
prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
|
| 314 |
+
|
| 315 |
+
pipe.progress_fn(0, desc="Sana Start")
|
| 316 |
+
|
| 317 |
+
with torch.no_grad():
|
| 318 |
+
images = pipe(
|
| 319 |
+
prompt=prompt,
|
| 320 |
+
height=height,
|
| 321 |
+
width=width,
|
| 322 |
+
negative_prompt=negative_prompt,
|
| 323 |
+
guidance_scale=guidance_scale,
|
| 324 |
+
pag_guidance_scale=pag_guidance_scale,
|
| 325 |
+
num_inference_steps=num_inference_steps,
|
| 326 |
+
num_images_per_prompt=num_imgs,
|
| 327 |
+
generator=generator,
|
| 328 |
+
)
|
| 329 |
+
|
| 330 |
+
pipe.progress_fn(1.0, desc="Sana End")
|
| 331 |
+
|
| 332 |
+
save_img = False
|
| 333 |
+
if save_img:
|
| 334 |
+
img = [save_image_sana(img, seed, save_img=save_image) for img in images]
|
| 335 |
+
print(img)
|
| 336 |
+
else:
|
| 337 |
+
if num_imgs > 1:
|
| 338 |
+
nrow = 2
|
| 339 |
+
else:
|
| 340 |
+
nrow = 1
|
| 341 |
+
img = make_grid(images, nrow=nrow, normalize=True, value_range=(-1, 1))
|
| 342 |
+
img = img.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
|
| 343 |
+
img = [Image.fromarray(img.astype(np.uint8))]
|
| 344 |
+
|
| 345 |
+
torch.cuda.empty_cache()
|
| 346 |
+
|
| 347 |
+
return img
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
TEST_TIMES = read_inference_count()
|
| 351 |
+
model_size = "1.6" if "D20" in args.model_path else "0.6"
|
| 352 |
+
title = f"""
|
| 353 |
+
<div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
|
| 354 |
+
<img src="https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/logo.png" width="50%" alt="logo"/>
|
| 355 |
+
</div>
|
| 356 |
+
"""
|
| 357 |
+
DESCRIPTION = f"""
|
| 358 |
+
<p><span style="font-size: 36px; font-weight: bold;">Sana-{model_size}B</span><span style="font-size: 20px; font-weight: bold;">{args.image_size}px</span></p>
|
| 359 |
+
<p style="font-size: 16px; font-weight: bold;">Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer</p>
|
| 360 |
+
<p><span style="font-size: 16px;"><a href="https://arxiv.org/abs/2410.10629">[Paper]</a></span> <span style="font-size: 16px;"><a href="https://github.com/NVlabs/Sana">[Github]</a></span> <span style="font-size: 16px;"><a href="https://nvlabs.github.io/Sana">[Project]</a></span</p>
|
| 361 |
+
<p style="font-size: 16px; font-weight: bold;">Powered by <a href="https://hanlab.mit.edu/projects/dc-ae">DC-AE</a> with 32x latent space</p>
|
| 362 |
+
<p style="font-size: 16px; font-weight: bold;">Unsafe word will give you a 'Red Heart' in the image instead.</p>
|
| 363 |
+
"""
|
| 364 |
+
if model_size == "0.6":
|
| 365 |
+
DESCRIPTION += "\n<p>0.6B model's text rendering ability is limited.</p>"
|
| 366 |
+
if not torch.cuda.is_available():
|
| 367 |
+
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
|
| 368 |
+
|
| 369 |
+
examples = [
|
| 370 |
+
'a cyberpunk cat with a neon sign that says "Sana"',
|
| 371 |
+
"A very detailed and realistic full body photo set of a tall, slim, and athletic Shiba Inu in a white oversized straight t-shirt, white shorts, and short white shoes.",
|
| 372 |
+
"Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
|
| 373 |
+
"portrait photo of a girl, photograph, highly detailed face, depth of field",
|
| 374 |
+
'make me a logo that says "So Fast" with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language',
|
| 375 |
+
"🐶 Wearing 🕶 flying on the 🌈",
|
| 376 |
+
# "👧 with 🌹 in the ❄️",
|
| 377 |
+
# "an old rusted robot wearing pants and a jacket riding skis in a supermarket.",
|
| 378 |
+
# "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
|
| 379 |
+
# "Astronaut in a jungle, cold color palette, muted colors, detailed",
|
| 380 |
+
# "a stunning and luxurious bedroom carved into a rocky mountainside seamlessly blending nature with modern design with a plush earth-toned bed textured stone walls circular fireplace massive uniquely shaped window framing snow-capped mountains dense forests",
|
| 381 |
+
]
|
| 382 |
+
|
| 383 |
+
css = """
|
| 384 |
+
.gradio-container{max-width: 1024px !important}
|
| 385 |
+
h1{text-align:center}
|
| 386 |
+
"""
|
| 387 |
+
with gr.Blocks(css=css) as demo:
|
| 388 |
+
gr.Markdown(title)
|
| 389 |
+
gr.Markdown(DESCRIPTION)
|
| 390 |
+
gr.DuplicateButton(
|
| 391 |
+
value="Duplicate Space for private use",
|
| 392 |
+
elem_id="duplicate-button",
|
| 393 |
+
visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
|
| 394 |
+
)
|
| 395 |
+
info_box = gr.Markdown(
|
| 396 |
+
value=f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: 16px; color:red; font-weight: bold;'>{read_inference_count()}</span>"
|
| 397 |
+
)
|
| 398 |
+
demo.load(fn=update_inference_count, outputs=info_box) # update the value when re-loading the page
|
| 399 |
+
# with gr.Row(equal_height=False):
|
| 400 |
+
with gr.Group():
|
| 401 |
+
with gr.Row():
|
| 402 |
+
prompt = gr.Text(
|
| 403 |
+
label="Prompt",
|
| 404 |
+
show_label=False,
|
| 405 |
+
max_lines=1,
|
| 406 |
+
placeholder="Enter your prompt",
|
| 407 |
+
container=False,
|
| 408 |
+
)
|
| 409 |
+
run_button = gr.Button("Run-sana", scale=0)
|
| 410 |
+
run_button2 = gr.Button("Run-flux", scale=0)
|
| 411 |
+
|
| 412 |
+
with gr.Row():
|
| 413 |
+
result = gr.Gallery(label="Result from Sana", show_label=True, columns=NUM_IMAGES_PER_PROMPT, format="webp")
|
| 414 |
+
result_2 = gr.Gallery(
|
| 415 |
+
label="Result from FLUX", show_label=True, columns=NUM_IMAGES_PER_PROMPT, format="webp"
|
| 416 |
+
)
|
| 417 |
+
|
| 418 |
+
with gr.Accordion("Advanced options", open=False):
|
| 419 |
+
with gr.Group():
|
| 420 |
+
with gr.Row(visible=True):
|
| 421 |
+
height = gr.Slider(
|
| 422 |
+
label="Height",
|
| 423 |
+
minimum=256,
|
| 424 |
+
maximum=MAX_IMAGE_SIZE,
|
| 425 |
+
step=32,
|
| 426 |
+
value=1024,
|
| 427 |
+
)
|
| 428 |
+
width = gr.Slider(
|
| 429 |
+
label="Width",
|
| 430 |
+
minimum=256,
|
| 431 |
+
maximum=MAX_IMAGE_SIZE,
|
| 432 |
+
step=32,
|
| 433 |
+
value=1024,
|
| 434 |
+
)
|
| 435 |
+
with gr.Row():
|
| 436 |
+
flow_dpms_inference_steps = gr.Slider(
|
| 437 |
+
label="Sampling steps",
|
| 438 |
+
minimum=5,
|
| 439 |
+
maximum=40,
|
| 440 |
+
step=1,
|
| 441 |
+
value=18,
|
| 442 |
+
)
|
| 443 |
+
flow_dpms_guidance_scale = gr.Slider(
|
| 444 |
+
label="CFG Guidance scale",
|
| 445 |
+
minimum=1,
|
| 446 |
+
maximum=10,
|
| 447 |
+
step=0.1,
|
| 448 |
+
value=5.0,
|
| 449 |
+
)
|
| 450 |
+
flow_dpms_pag_guidance_scale = gr.Slider(
|
| 451 |
+
label="PAG Guidance scale",
|
| 452 |
+
minimum=1,
|
| 453 |
+
maximum=4,
|
| 454 |
+
step=0.5,
|
| 455 |
+
value=2.0,
|
| 456 |
+
)
|
| 457 |
+
with gr.Row():
|
| 458 |
+
use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
|
| 459 |
+
negative_prompt = gr.Text(
|
| 460 |
+
label="Negative prompt",
|
| 461 |
+
max_lines=1,
|
| 462 |
+
placeholder="Enter a negative prompt",
|
| 463 |
+
visible=True,
|
| 464 |
+
)
|
| 465 |
+
style_selection = gr.Radio(
|
| 466 |
+
show_label=True,
|
| 467 |
+
container=True,
|
| 468 |
+
interactive=True,
|
| 469 |
+
choices=STYLE_NAMES,
|
| 470 |
+
value=DEFAULT_STYLE_NAME,
|
| 471 |
+
label="Image Style",
|
| 472 |
+
)
|
| 473 |
+
seed = gr.Slider(
|
| 474 |
+
label="Seed",
|
| 475 |
+
minimum=0,
|
| 476 |
+
maximum=MAX_SEED,
|
| 477 |
+
step=1,
|
| 478 |
+
value=0,
|
| 479 |
+
)
|
| 480 |
+
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
|
| 481 |
+
with gr.Row(visible=True):
|
| 482 |
+
schedule = gr.Radio(
|
| 483 |
+
show_label=True,
|
| 484 |
+
container=True,
|
| 485 |
+
interactive=True,
|
| 486 |
+
choices=SCHEDULE_NAME,
|
| 487 |
+
value=DEFAULT_SCHEDULE_NAME,
|
| 488 |
+
label="Sampler Schedule",
|
| 489 |
+
visible=True,
|
| 490 |
+
)
|
| 491 |
+
num_imgs = gr.Slider(
|
| 492 |
+
label="Num Images",
|
| 493 |
+
minimum=1,
|
| 494 |
+
maximum=6,
|
| 495 |
+
step=1,
|
| 496 |
+
value=1,
|
| 497 |
+
)
|
| 498 |
+
|
| 499 |
+
run_button.click(fn=run_inference, inputs=num_imgs, outputs=info_box)
|
| 500 |
+
|
| 501 |
+
gr.Examples(
|
| 502 |
+
examples=examples,
|
| 503 |
+
inputs=prompt,
|
| 504 |
+
outputs=[result],
|
| 505 |
+
fn=generate,
|
| 506 |
+
cache_examples=CACHE_EXAMPLES,
|
| 507 |
+
)
|
| 508 |
+
gr.Examples(
|
| 509 |
+
examples=examples,
|
| 510 |
+
inputs=prompt,
|
| 511 |
+
outputs=[result_2],
|
| 512 |
+
fn=generate_2,
|
| 513 |
+
cache_examples=CACHE_EXAMPLES,
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
use_negative_prompt.change(
|
| 517 |
+
fn=lambda x: gr.update(visible=x),
|
| 518 |
+
inputs=use_negative_prompt,
|
| 519 |
+
outputs=negative_prompt,
|
| 520 |
+
api_name=False,
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
run_button.click(
|
| 524 |
+
fn=generate,
|
| 525 |
+
inputs=[
|
| 526 |
+
prompt,
|
| 527 |
+
negative_prompt,
|
| 528 |
+
style_selection,
|
| 529 |
+
use_negative_prompt,
|
| 530 |
+
num_imgs,
|
| 531 |
+
seed,
|
| 532 |
+
height,
|
| 533 |
+
width,
|
| 534 |
+
flow_dpms_guidance_scale,
|
| 535 |
+
flow_dpms_pag_guidance_scale,
|
| 536 |
+
flow_dpms_inference_steps,
|
| 537 |
+
randomize_seed,
|
| 538 |
+
],
|
| 539 |
+
outputs=[result],
|
| 540 |
+
queue=True,
|
| 541 |
+
)
|
| 542 |
+
|
| 543 |
+
run_button2.click(
|
| 544 |
+
fn=generate_2,
|
| 545 |
+
inputs=[
|
| 546 |
+
prompt,
|
| 547 |
+
negative_prompt,
|
| 548 |
+
style_selection,
|
| 549 |
+
use_negative_prompt,
|
| 550 |
+
num_imgs,
|
| 551 |
+
seed,
|
| 552 |
+
height,
|
| 553 |
+
width,
|
| 554 |
+
flow_dpms_guidance_scale,
|
| 555 |
+
flow_dpms_pag_guidance_scale,
|
| 556 |
+
flow_dpms_inference_steps,
|
| 557 |
+
randomize_seed,
|
| 558 |
+
],
|
| 559 |
+
outputs=[result_2],
|
| 560 |
+
queue=True,
|
| 561 |
+
)
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
if __name__ == "__main__":
|
| 565 |
+
demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=DEMO_PORT, debug=True, share=True)
|
apps/safety_check.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
#
|
| 15 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 16 |
+
# This file is copy from https://github.com/mit-han-lab/hart/blob/main/hart/utils/safety_check.py
|
| 17 |
+
import json
|
| 18 |
+
from functools import lru_cache
|
| 19 |
+
from typing import Dict, Sequence
|
| 20 |
+
|
| 21 |
+
import torch
|
| 22 |
+
import transformers
|
| 23 |
+
from torch.nn.functional import softmax
|
| 24 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 25 |
+
|
| 26 |
+
__all__ = ["is_dangerous"]
|
| 27 |
+
|
| 28 |
+
__rank, __local_rank, __world_size, __device = (
|
| 29 |
+
0,
|
| 30 |
+
0,
|
| 31 |
+
1,
|
| 32 |
+
"cuda" if torch.cuda.is_available() else "cpu",
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
safety_policy = """
|
| 37 |
+
The prompt must not seek to generate harmful, abusive, hateful, sexually explicit, or otherwise inappropriate content targeting individuals or protected groups.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def is_dangerous(tokenizer, model, user_prompt, threshold=0.5):
|
| 42 |
+
prompt = f"""You are a policy expert trying to help determine whether a user
|
| 43 |
+
prompt is in violation of the defined safety policies.
|
| 44 |
+
|
| 45 |
+
<start_of_turn>
|
| 46 |
+
Human Question: {user_prompt.strip()}
|
| 47 |
+
<end_of_turn>
|
| 48 |
+
|
| 49 |
+
Our safety principle is defined in the below:
|
| 50 |
+
|
| 51 |
+
{safety_policy.strip()}
|
| 52 |
+
|
| 53 |
+
Does the human question violate the above principle? Your answer must start
|
| 54 |
+
with 'Yes' or 'No'. And then walk through step by step to be sure we answer
|
| 55 |
+
correctly.
|
| 56 |
+
"""
|
| 57 |
+
|
| 58 |
+
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
| 59 |
+
with torch.no_grad():
|
| 60 |
+
logits = model(**inputs).logits
|
| 61 |
+
|
| 62 |
+
# Extract the logits for the Yes and No tokens
|
| 63 |
+
vocab = tokenizer.get_vocab()
|
| 64 |
+
selected_logits = logits[0, -1, [vocab["Yes"], vocab["No"]]]
|
| 65 |
+
|
| 66 |
+
# Convert these logits to a probability with softmax
|
| 67 |
+
probabilities = softmax(selected_logits, dim=0)
|
| 68 |
+
|
| 69 |
+
# Return probability of 'Yes'
|
| 70 |
+
score = probabilities[0].item()
|
| 71 |
+
|
| 72 |
+
return score > threshold
|
apps/sana_controlnet_pipeline.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
#
|
| 15 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 16 |
+
import warnings
|
| 17 |
+
from dataclasses import dataclass, field
|
| 18 |
+
from typing import Optional, Tuple
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
import pyrallis
|
| 22 |
+
import torch
|
| 23 |
+
import torch.nn as nn
|
| 24 |
+
from PIL import Image
|
| 25 |
+
|
| 26 |
+
warnings.filterwarnings("ignore") # ignore warning
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
from diffusion import DPMS, FlowEuler
|
| 30 |
+
from diffusion.data.datasets.utils import (
|
| 31 |
+
ASPECT_RATIO_512_TEST,
|
| 32 |
+
ASPECT_RATIO_1024_TEST,
|
| 33 |
+
ASPECT_RATIO_2048_TEST,
|
| 34 |
+
ASPECT_RATIO_4096_TEST,
|
| 35 |
+
)
|
| 36 |
+
from diffusion.model.builder import build_model, get_tokenizer_and_text_encoder, get_vae, vae_decode, vae_encode
|
| 37 |
+
from diffusion.model.utils import get_weight_dtype, prepare_prompt_ar, resize_and_crop_tensor
|
| 38 |
+
from diffusion.utils.config import SanaConfig, model_init_config
|
| 39 |
+
from diffusion.utils.logger import get_root_logger
|
| 40 |
+
from tools.controlnet.utils import get_scribble_map, transform_control_signal
|
| 41 |
+
from tools.download import find_model
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def guidance_type_select(default_guidance_type, pag_scale, attn_type):
|
| 45 |
+
guidance_type = default_guidance_type
|
| 46 |
+
if not (pag_scale > 1.0 and attn_type == "linear"):
|
| 47 |
+
guidance_type = "classifier-free"
|
| 48 |
+
elif pag_scale > 1.0 and attn_type == "linear":
|
| 49 |
+
guidance_type = "classifier-free_PAG"
|
| 50 |
+
return guidance_type
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
|
| 54 |
+
"""Returns binned height and width."""
|
| 55 |
+
ar = float(height / width)
|
| 56 |
+
closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
|
| 57 |
+
default_hw = ratios[closest_ratio]
|
| 58 |
+
return int(default_hw[0]), int(default_hw[1])
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def get_ar_from_ref_image(ref_image):
|
| 62 |
+
def reduce_ratio(h, w):
|
| 63 |
+
def gcd(a, b):
|
| 64 |
+
while b:
|
| 65 |
+
                a, b = b, a % b
            return a

        divisor = gcd(h, w)
        return f"{h // divisor}:{w // divisor}"

    if isinstance(ref_image, str):
        ref_image = Image.open(ref_image)
    w, h = ref_image.size
    return reduce_ratio(h, w)


@dataclass
class SanaControlNetInference(SanaConfig):
    config: Optional[str] = "configs/sana_config/1024ms/Sana_1600M_img1024.yaml"  # config
    model_path: str = field(
        default="output/Sana_D20/SANA.pth", metadata={"help": "Path to the model file (positional)"}
    )
    output: str = "./output"
    bs: int = 1
    image_size: int = 1024
    cfg_scale: float = 5.0
    pag_scale: float = 2.0
    seed: int = 42
    step: int = -1
    custom_image_size: Optional[int] = None
    shield_model_path: str = field(
        default="google/shieldgemma-2b",
        metadata={"help": "The path to shield model, we employ ShieldGemma-2B by default."},
    )


class SanaControlNetPipeline(nn.Module):
    def __init__(
        self,
        config: Optional[str] = "configs/sana_config/1024ms/Sana_1600M_img1024.yaml",
    ):
        super().__init__()
        config = pyrallis.load(SanaControlNetInference, open(config))
        self.args = self.config = config

        # set some hyper-parameters
        self.image_size = self.config.model.image_size

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        logger = get_root_logger()
        self.logger = logger
        self.progress_fn = lambda progress, desc: None
        self.thickness = 2
        self.blend_alpha = 0.0

        self.latent_size = self.image_size // config.vae.vae_downsample_rate
        self.max_sequence_length = config.text_encoder.model_max_length
        self.flow_shift = config.scheduler.flow_shift
        guidance_type = "classifier-free_PAG"

        weight_dtype = get_weight_dtype(config.model.mixed_precision)
        self.weight_dtype = weight_dtype
        self.vae_dtype = get_weight_dtype(config.vae.weight_dtype)

        self.base_ratios = eval(f"ASPECT_RATIO_{self.image_size}_TEST")
        self.vis_sampler = self.config.scheduler.vis_sampler
        logger.info(f"Sampler {self.vis_sampler}, flow_shift: {self.flow_shift}")
        self.guidance_type = guidance_type_select(guidance_type, self.args.pag_scale, config.model.attn_type)
        logger.info(f"Inference with {self.weight_dtype}, PAG guidance layer: {self.config.model.pag_applied_layers}")

        # 1. build vae and text encoder
        self.vae = self.build_vae(config.vae)
        self.tokenizer, self.text_encoder = self.build_text_encoder(config.text_encoder)

        # 2. build Sana model
        self.model = self.build_sana_model(config).to(self.device)

        # 3. pre-compute null embedding
        with torch.no_grad():
            null_caption_token = self.tokenizer(
                "", max_length=self.max_sequence_length, padding="max_length", truncation=True, return_tensors="pt"
            ).to(self.device)
            self.null_caption_embs = self.text_encoder(null_caption_token.input_ids, null_caption_token.attention_mask)[
                0
            ]

    def build_vae(self, config):
        vae = get_vae(config.vae_type, config.vae_pretrained, self.device).to(self.vae_dtype)
        return vae

    def build_text_encoder(self, config):
        tokenizer, text_encoder = get_tokenizer_and_text_encoder(name=config.text_encoder_name, device=self.device)
        return tokenizer, text_encoder

    def build_sana_model(self, config):
        # model setting
        model_kwargs = model_init_config(config, latent_size=self.latent_size)
        model = build_model(
            config.model.model,
            use_fp32_attention=config.model.get("fp32_attention", False) and config.model.mixed_precision != "bf16",
            **model_kwargs,
        )
        self.logger.info(f"use_fp32_attention: {model.fp32_attention}")
        self.logger.info(
            f"{model.__class__.__name__}:{config.model.model},"
            f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}"
        )
        return model

    def from_pretrained(self, model_path):
        state_dict = find_model(model_path)
        state_dict = state_dict.get("state_dict", state_dict)
        if "pos_embed" in state_dict:
            del state_dict["pos_embed"]
        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
        self.model.eval().to(self.weight_dtype)

        self.logger.info("Generating sample from ckpt: %s" % model_path)
        self.logger.warning(f"Missing keys: {missing}")
        self.logger.warning(f"Unexpected keys: {unexpected}")

    def register_progress_bar(self, progress_fn=None):
        self.progress_fn = progress_fn if progress_fn is not None else self.progress_fn

    def set_blend_alpha(self, blend_alpha):
        self.blend_alpha = blend_alpha

    @torch.inference_mode()
    def forward(
        self,
        prompt=None,
        ref_image=None,
        negative_prompt="",
        num_inference_steps=20,
        guidance_scale=5,
        pag_guidance_scale=2.5,
        num_images_per_prompt=1,
        sketch_thickness=2,
        generator=torch.Generator().manual_seed(42),
        latents=None,
    ):
        self.ori_height, self.ori_width = ref_image.height, ref_image.width
        self.guidance_type = guidance_type_select(self.guidance_type, pag_guidance_scale, self.config.model.attn_type)

        # 1. pre-compute negative embedding
        if negative_prompt != "":
            null_caption_token = self.tokenizer(
                negative_prompt,
                max_length=self.max_sequence_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ).to(self.device)
            self.null_caption_embs = self.text_encoder(null_caption_token.input_ids, null_caption_token.attention_mask)[
                0
            ]

        if prompt is None:
            prompt = [""]
        prompts = prompt if isinstance(prompt, list) else [prompt]
        samples = []

        for prompt in prompts:
            # data prepare
            prompts, hw, ar = (
                [],
                torch.tensor([[self.image_size, self.image_size]], dtype=torch.float, device=self.device).repeat(
                    num_images_per_prompt, 1
                ),
                torch.tensor([[1.0]], device=self.device).repeat(num_images_per_prompt, 1),
            )

            ar = get_ar_from_ref_image(ref_image)
            prompt += f" --ar {ar}"
            for _ in range(num_images_per_prompt):
                prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(
                    prompt, self.base_ratios, device=self.device, show=False
                )
                prompts.append(prompt_clean.strip())

            self.latent_size_h, self.latent_size_w = (
                int(hw[0, 0] // self.config.vae.vae_downsample_rate),
                int(hw[0, 1] // self.config.vae.vae_downsample_rate),
            )

            with torch.no_grad():
                # prepare text feature
                if not self.config.text_encoder.chi_prompt:
                    max_length_all = self.config.text_encoder.model_max_length
                    prompts_all = prompts
                else:
                    chi_prompt = "\n".join(self.config.text_encoder.chi_prompt)
                    prompts_all = [chi_prompt + prompt for prompt in prompts]
                    num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
                    max_length_all = (
                        num_chi_prompt_tokens + self.config.text_encoder.model_max_length - 2
                    )  # magic number 2: [bos], [_]

                caption_token = self.tokenizer(
                    prompts_all,
                    max_length=max_length_all,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                ).to(device=self.device)
                select_index = [0] + list(range(-self.config.text_encoder.model_max_length + 1, 0))
                caption_embs = self.text_encoder(caption_token.input_ids, caption_token.attention_mask)[0][:, None][
                    :, :, select_index
                ].to(self.weight_dtype)
                emb_masks = caption_token.attention_mask[:, select_index]
                null_y = self.null_caption_embs.repeat(len(prompts), 1, 1)[:, None].to(self.weight_dtype)

                n = len(prompts)
                if latents is None:
                    z = torch.randn(
                        n,
                        self.config.vae.vae_latent_dim,
                        self.latent_size_h,
                        self.latent_size_w,
                        generator=generator,
                        device=self.device,
                    )
                else:
                    z = latents.to(self.device)
                model_kwargs = dict(data_info={"img_hw": hw, "aspect_ratio": ar}, mask=emb_masks)

                # control signal
                if isinstance(ref_image, str):
                    ref_image = cv2.imread(ref_image)
                elif isinstance(ref_image, Image.Image):
                    ref_image = np.array(ref_image)
                control_signal = get_scribble_map(
                    input_image=ref_image,
                    det="Scribble_HED",
                    detect_resolution=int(hw.min()),
                    thickness=sketch_thickness,
                )

                control_signal = transform_control_signal(control_signal, hw).to(self.device).to(self.weight_dtype)

                control_signal_latent = vae_encode(
                    self.config.vae.vae_type, self.vae, control_signal, self.config.vae.sample_posterior, self.device
                )

                model_kwargs["control_signal"] = control_signal_latent

                if self.vis_sampler == "flow_euler":
                    flow_solver = FlowEuler(
                        self.model,
                        condition=caption_embs,
                        uncondition=null_y,
                        cfg_scale=guidance_scale,
                        model_kwargs=model_kwargs,
                    )
                    sample = flow_solver.sample(
                        z,
                        steps=num_inference_steps,
                    )
                elif self.vis_sampler == "flow_dpm-solver":
                    scheduler = DPMS(
                        self.model.forward_with_dpmsolver,
                        condition=caption_embs,
                        uncondition=null_y,
                        guidance_type=self.guidance_type,
                        cfg_scale=guidance_scale,
                        model_type="flow",
                        model_kwargs=model_kwargs,
                        schedule="FLOW",
                    )
                    scheduler.register_progress_bar(self.progress_fn)
                    sample = scheduler.sample(
                        z,
                        steps=num_inference_steps,
                        order=2,
                        skip_type="time_uniform_flow",
                        method="multistep",
                        flow_shift=self.flow_shift,
                    )

            sample = sample.to(self.vae_dtype)
            with torch.no_grad():
                sample = vae_decode(self.config.vae.vae_type, self.vae, sample)

            if self.blend_alpha > 0:
                print(f"blend image and mask with alpha: {self.blend_alpha}")
                sample = sample * (1 - self.blend_alpha) + control_signal * self.blend_alpha

            sample = resize_and_crop_tensor(sample, self.ori_width, self.ori_height)
            samples.append(sample)

            return sample

        return samples
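For readers of this commit, a minimal usage sketch of the ControlNet pipeline above might look like the following. It is not part of the uploaded files: it assumes the surrounding Sana repository (the `diffusion` and `tools` packages) is importable, that a CUDA device is available, and that the checkpoint path and reference image are placeholders (the checkpoint path is simply the default from `SanaControlNetInference`).

# Hypothetical usage sketch; paths and the reference image are placeholders, not verified assets.
import torch
from PIL import Image
from torchvision.utils import save_image

from apps.sana_controlnet_pipeline import SanaControlNetPipeline

pipe = SanaControlNetPipeline(config="configs/sana_config/1024ms/Sana_1600M_img1024.yaml")
pipe.from_pretrained("output/Sana_D20/SANA.pth")  # placeholder checkpoint path

ref = Image.open("scribble_reference.png").convert("RGB")  # hypothetical reference image
image = pipe(
    prompt="a cozy wooden cabin by a lake, watercolor style",
    ref_image=ref,  # an HED scribble map is extracted from this image inside forward()
    num_inference_steps=20,
    guidance_scale=5.0,
    pag_guidance_scale=2.0,
    sketch_thickness=2,
    generator=torch.Generator(device="cuda").manual_seed(42),  # CUDA generator to match self.device
)
# The pipeline returns an image tensor in roughly [-1, 1]; torchvision's save_image is one way to write it out.
save_image(image, "controlnet_sample.png", normalize=True, value_range=(-1, 1))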
apps/sana_pipeline.py
ADDED
@@ -0,0 +1,304 @@
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import argparse
import warnings
from dataclasses import dataclass, field
from typing import Optional, Tuple

import pyrallis
import torch
import torch.nn as nn

warnings.filterwarnings("ignore")  # ignore warning


from diffusion import DPMS, FlowEuler
from diffusion.data.datasets.utils import (
    ASPECT_RATIO_512_TEST,
    ASPECT_RATIO_1024_TEST,
    ASPECT_RATIO_2048_TEST,
    ASPECT_RATIO_4096_TEST,
)
from diffusion.model.builder import build_model, get_tokenizer_and_text_encoder, get_vae, vae_decode
from diffusion.model.utils import get_weight_dtype, prepare_prompt_ar, resize_and_crop_tensor
from diffusion.utils.config import SanaConfig, model_init_config
from diffusion.utils.logger import get_root_logger

# from diffusion.utils.misc import read_config
from tools.download import find_model


def guidance_type_select(default_guidance_type, pag_scale, attn_type):
    guidance_type = default_guidance_type
    if not (pag_scale > 1.0 and attn_type == "linear"):
        guidance_type = "classifier-free"
    elif pag_scale > 1.0 and attn_type == "linear":
        guidance_type = "classifier-free_PAG"
    return guidance_type


def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
    """Returns binned height and width."""
    ar = float(height / width)
    closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
    default_hw = ratios[closest_ratio]
    return int(default_hw[0]), int(default_hw[1])


@dataclass
class SanaInference(SanaConfig):
    config: Optional[str] = "configs/sana_config/1024ms/Sana_1600M_img1024.yaml"  # config
    model_path: str = field(
        default="output/Sana_D20/SANA.pth", metadata={"help": "Path to the model file (positional)"}
    )
    output: str = "./output"
    bs: int = 1
    image_size: int = 1024
    cfg_scale: float = 5.0
    pag_scale: float = 2.0
    seed: int = 42
    step: int = -1
    custom_image_size: Optional[int] = None
    shield_model_path: str = field(
        default="google/shieldgemma-2b",
        metadata={"help": "The path to shield model, we employ ShieldGemma-2B by default."},
    )


class SanaPipeline(nn.Module):
    def __init__(
        self,
        config: Optional[str] = "configs/sana_config/1024ms/Sana_1600M_img1024.yaml",
    ):
        super().__init__()
        config = pyrallis.load(SanaInference, open(config))
        self.args = self.config = config

        # set some hyper-parameters
        self.image_size = self.config.model.image_size

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        logger = get_root_logger()
        self.logger = logger
        self.progress_fn = lambda progress, desc: None

        self.latent_size = self.image_size // config.vae.vae_downsample_rate
        self.max_sequence_length = config.text_encoder.model_max_length
        self.flow_shift = config.scheduler.flow_shift
        guidance_type = "classifier-free_PAG"

        weight_dtype = get_weight_dtype(config.model.mixed_precision)
        self.weight_dtype = weight_dtype
        self.vae_dtype = get_weight_dtype(config.vae.weight_dtype)

        self.base_ratios = eval(f"ASPECT_RATIO_{self.image_size}_TEST")
        self.vis_sampler = self.config.scheduler.vis_sampler
        logger.info(f"Sampler {self.vis_sampler}, flow_shift: {self.flow_shift}")
        self.guidance_type = guidance_type_select(guidance_type, self.args.pag_scale, config.model.attn_type)
        logger.info(f"Inference with {self.weight_dtype}, PAG guidance layer: {self.config.model.pag_applied_layers}")

        # 1. build vae and text encoder
        self.vae = self.build_vae(config.vae)
        self.tokenizer, self.text_encoder = self.build_text_encoder(config.text_encoder)

        # 2. build Sana model
        self.model = self.build_sana_model(config).to(self.device)

        # 3. pre-compute null embedding
        with torch.no_grad():
            null_caption_token = self.tokenizer(
                "", max_length=self.max_sequence_length, padding="max_length", truncation=True, return_tensors="pt"
            ).to(self.device)
            self.null_caption_embs = self.text_encoder(null_caption_token.input_ids, null_caption_token.attention_mask)[
                0
            ]

    def build_vae(self, config):
        vae = get_vae(config.vae_type, config.vae_pretrained, self.device).to(self.vae_dtype)
        return vae

    def build_text_encoder(self, config):
        tokenizer, text_encoder = get_tokenizer_and_text_encoder(name=config.text_encoder_name, device=self.device)
        return tokenizer, text_encoder

    def build_sana_model(self, config):
        # model setting
        model_kwargs = model_init_config(config, latent_size=self.latent_size)
        model = build_model(
            config.model.model,
            use_fp32_attention=config.model.get("fp32_attention", False) and config.model.mixed_precision != "bf16",
            **model_kwargs,
        )
        self.logger.info(f"use_fp32_attention: {model.fp32_attention}")
        self.logger.info(
            f"{model.__class__.__name__}:{config.model.model},"
            f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}"
        )
        return model

    def from_pretrained(self, model_path):
        state_dict = find_model(model_path)
        state_dict = state_dict.get("state_dict", state_dict)
        if "pos_embed" in state_dict:
            del state_dict["pos_embed"]
        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
        self.model.eval().to(self.weight_dtype)

        self.logger.info("Generating sample from ckpt: %s" % model_path)
        self.logger.warning(f"Missing keys: {missing}")
        self.logger.warning(f"Unexpected keys: {unexpected}")

    def register_progress_bar(self, progress_fn=None):
        self.progress_fn = progress_fn if progress_fn is not None else self.progress_fn

    @torch.inference_mode()
    def forward(
        self,
        prompt=None,
        height=1024,
        width=1024,
        negative_prompt="",
        num_inference_steps=20,
        guidance_scale=5,
        pag_guidance_scale=2.5,
        num_images_per_prompt=1,
        generator=torch.Generator().manual_seed(42),
        latents=None,
    ):
        self.ori_height, self.ori_width = height, width
        self.height, self.width = classify_height_width_bin(height, width, ratios=self.base_ratios)
        self.latent_size_h, self.latent_size_w = (
            self.height // self.config.vae.vae_downsample_rate,
            self.width // self.config.vae.vae_downsample_rate,
        )
        self.guidance_type = guidance_type_select(self.guidance_type, pag_guidance_scale, self.config.model.attn_type)

        # 1. pre-compute negative embedding
        if negative_prompt != "":
            null_caption_token = self.tokenizer(
                negative_prompt,
                max_length=self.max_sequence_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ).to(self.device)
            self.null_caption_embs = self.text_encoder(null_caption_token.input_ids, null_caption_token.attention_mask)[
                0
            ]

        if prompt is None:
            prompt = [""]
        prompts = prompt if isinstance(prompt, list) else [prompt]
        samples = []

        for prompt in prompts:
            # data prepare
            prompts, hw, ar = (
                [],
                torch.tensor([[self.image_size, self.image_size]], dtype=torch.float, device=self.device).repeat(
                    num_images_per_prompt, 1
                ),
                torch.tensor([[1.0]], device=self.device).repeat(num_images_per_prompt, 1),
            )

            for _ in range(num_images_per_prompt):
                prompts.append(prepare_prompt_ar(prompt, self.base_ratios, device=self.device, show=False)[0].strip())

            with torch.no_grad():
                # prepare text feature
                if not self.config.text_encoder.chi_prompt:
                    max_length_all = self.config.text_encoder.model_max_length
                    prompts_all = prompts
                else:
                    chi_prompt = "\n".join(self.config.text_encoder.chi_prompt)
                    prompts_all = [chi_prompt + prompt for prompt in prompts]
                    num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
                    max_length_all = (
                        num_chi_prompt_tokens + self.config.text_encoder.model_max_length - 2
                    )  # magic number 2: [bos], [_]

                caption_token = self.tokenizer(
                    prompts_all,
                    max_length=max_length_all,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                ).to(device=self.device)
                select_index = [0] + list(range(-self.config.text_encoder.model_max_length + 1, 0))
                caption_embs = self.text_encoder(caption_token.input_ids, caption_token.attention_mask)[0][:, None][
                    :, :, select_index
                ].to(self.weight_dtype)
                emb_masks = caption_token.attention_mask[:, select_index]
                null_y = self.null_caption_embs.repeat(len(prompts), 1, 1)[:, None].to(self.weight_dtype)

                n = len(prompts)
                if latents is None:
                    z = torch.randn(
                        n,
                        self.config.vae.vae_latent_dim,
                        self.latent_size_h,
                        self.latent_size_w,
                        generator=generator,
                        device=self.device,
                    )
                else:
                    z = latents.to(self.device)
                model_kwargs = dict(data_info={"img_hw": hw, "aspect_ratio": ar}, mask=emb_masks)
                if self.vis_sampler == "flow_euler":
                    flow_solver = FlowEuler(
                        self.model,
                        condition=caption_embs,
                        uncondition=null_y,
                        cfg_scale=guidance_scale,
                        model_kwargs=model_kwargs,
                    )
                    sample = flow_solver.sample(
                        z,
                        steps=num_inference_steps,
                    )
                elif self.vis_sampler == "flow_dpm-solver":
                    scheduler = DPMS(
                        self.model,
                        condition=caption_embs,
                        uncondition=null_y,
                        guidance_type=self.guidance_type,
                        cfg_scale=guidance_scale,
                        pag_scale=pag_guidance_scale,
                        pag_applied_layers=self.config.model.pag_applied_layers,
                        model_type="flow",
                        model_kwargs=model_kwargs,
                        schedule="FLOW",
                    )
                    scheduler.register_progress_bar(self.progress_fn)
                    sample = scheduler.sample(
                        z,
                        steps=num_inference_steps,
                        order=2,
                        skip_type="time_uniform_flow",
                        method="multistep",
                        flow_shift=self.flow_shift,
                    )

            sample = sample.to(self.vae_dtype)
            with torch.no_grad():
                sample = vae_decode(self.config.vae.vae_type, self.vae, sample)

            sample = resize_and_crop_tensor(sample, self.ori_width, self.ori_height)
            samples.append(sample)

            return sample

        return samples
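Likewise, a minimal usage sketch for the text-to-image SanaPipeline defined above might look like the following. This is illustrative rather than part of the commit: the checkpoint path is the placeholder default from `SanaInference`, a CUDA device is assumed so the generator matches the pipeline's device, and torchvision's `save_image` is just one way to write the returned [-1, 1] image tensor to disk.

# Hypothetical usage sketch; the checkpoint path is a placeholder default, not a verified asset.
import torch
from torchvision.utils import save_image

from apps.sana_pipeline import SanaPipeline

pipe = SanaPipeline(config="configs/sana_config/1024ms/Sana_1600M_img1024.yaml")
pipe.from_pretrained("output/Sana_D20/SANA.pth")  # placeholder checkpoint path

image = pipe(
    prompt="a cyberpunk cat holding a neon sign that reads Sana",
    height=1024,
    width=1024,  # height/width are binned to the closest supported aspect ratio internally
    num_inference_steps=20,
    guidance_scale=5.0,
    pag_guidance_scale=2.0,
    generator=torch.Generator(device="cuda").manual_seed(42),  # CUDA generator to match self.device
)
save_image(image, "sana_sample.png", normalize=True, value_range=(-1, 1))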