Spaces:

Shuang59
/

Composable-Diffusion

Runtime error

App Files Files Community

Shuang59 committed on Jul 28, 2022

Commit

eb601c1

1 Parent(s): 5219f50

Update app.py

Browse files

Files changed (1) hide show

app.py +153 -150

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ Original file is located at
     https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
 """
 # from PIL import Image
 # from IPython.display import display
 import torch as th
@@ -25,6 +24,7 @@ from composable_diffusion.model_creation import create_model_and_diffusion as cr
 from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr
 # This notebook supports both CPU and GPU.
 # On CPU, generating one sample may take on the order of 20 minutes.
 # On a GPU, it should be under a minute.
@@ -34,10 +34,10 @@ device = th.device('cpu' if not has_cuda else 'cuda')
 print(device)
 # Create base model.
-timestep_respacing =  100 #@param{type: 'number'}
 options = model_and_diffusion_defaults()
 options['use_fp16'] = has_cuda
-options['timestep_respacing'] = str(timestep_respacing) # use 100 diffusion steps for fast sampling
 model, diffusion = create_model_and_diffusion(**options)
 model.eval()
 if has_cuda:
@@ -49,7 +49,7 @@ print('total base parameters', sum(x.numel() for x in model.parameters()))
 # Create upsampler model.
 options_up = model_and_diffusion_defaults_upsampler()
 options_up['use_fp16'] = has_cuda
-options_up['timestep_respacing'] = 'fast27' # use 27 diffusion steps for very fast sampling
 model_up, diffusion_up = create_model_and_diffusion(**options_up)
 model_up.eval()
 if has_cuda:
@@ -58,146 +58,145 @@ model_up.to(device)
 model_up.load_state_dict(load_checkpoint('upsample', device))
 print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
 def show_images(batch: th.Tensor):
     """ Display a batch of images inline. """
-    scaled = ((batch + 1)*127.5).round().clamp(0,255).to(th.uint8).cpu()
     reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
     display(Image.fromarray(reshaped.numpy()))
 def compose_language_descriptions(prompt, guidance_scale):
-  #@markdown `prompt`: when composing  multiple sentences, using `|` as the delimiter.
-  prompts = [x.strip() for x in prompt.split('|')]
-  batch_size = 1
-  # Tune this parameter to control the sharpness of 256x256 images.
-  # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
-  upsample_temp = 0.980 #@param{type: 'number'}
-  masks = [True] * len(prompts) + [False]
-  # coefficients = th.tensor([0.5, 0.5], device=device).reshape(-1, 1, 1, 1)
-  masks = th.tensor(masks, dtype=th.bool, device=device)
-  # sampling function
-  def model_fn(x_t, ts, **kwargs):
-    half = x_t[:1]
-    combined = th.cat([half] * x_t.size(0), dim=0)
-    model_out = model(combined, ts, **kwargs)
-    eps, rest = model_out[:, :3], model_out[:, 3:]
-    cond_eps = eps[masks].mean(dim=0, keepdim=True)
-    # cond_eps = (coefficients * eps[masks]).sum(dim=0)[None]
-    uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
-    half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
-    eps = th.cat([half_eps] * x_t.size(0), dim=0)
-    return th.cat([eps, rest], dim=1)
-  ##############################
-  # Sample from the base model #
-  ##############################
-  # Create the text tokens to feed to the model.
-  def sample_64(prompts):
-    tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
-    outputs = [model.tokenizer.padded_tokens_and_mask(
-        tokens, options['text_ctx']
-    ) for tokens in tokens_list]
-    cond_tokens, cond_masks = zip(*outputs)
-    cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)
-    full_batch_size = batch_size * (len(prompts) + 1)
-    uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
-        [], options['text_ctx']
-    )
-    # Pack the tokens together into model kwargs.
-    model_kwargs = dict(
-        tokens=th.tensor(
-            cond_tokens + [uncond_tokens], device=device
-        ),
-        mask=th.tensor(
-            cond_masks + [uncond_mask],
-            dtype=th.bool,
             device=device,
-        ),
-    )
-    # Sample from the base model.
-    model.del_cache()
-    samples = diffusion.p_sample_loop(
-        model_fn,
-        (full_batch_size, 3, options["image_size"], options["image_size"]),
-        device=device,
-        clip_denoised=True,
-        progress=True,
-        model_kwargs=model_kwargs,
-        cond_fn=None,
-    )[:batch_size]
-    model.del_cache()
-    # Show the output
-    return samples
-  ##############################
-  # Upsample the 64x64 samples #
-  ##############################
-  def upsampling_256(prompts, samples):
-    tokens = model_up.tokenizer.encode("".join(prompts))
-    tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
-        tokens, options_up['text_ctx']
-    )
-    # Create the model conditioning dict.
-    model_kwargs = dict(
-        # Low-res image to upsample.
-        low_res=((samples+1)*127.5).round()/127.5 - 1,
-        # Text tokens
-        tokens=th.tensor(
-            [tokens] * batch_size, device=device
-        ),
-        mask=th.tensor(
-            [mask] * batch_size,
-            dtype=th.bool,
             device=device,
-        ),
-    )
-    # Sample from the base model.
-    model_up.del_cache()
-    up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
-    up_samples = diffusion_up.ddim_sample_loop(
-        model_up,
-        up_shape,
-        noise=th.randn(up_shape, device=device) * upsample_temp,
-        device=device,
-        clip_denoised=True,
-        progress=True,
-        model_kwargs=model_kwargs,
-        cond_fn=None,
-    )[:batch_size]
-    model_up.del_cache()
-    # Show the output
-    return up_samples
-  # sampling 64x64 images
-  samples = sample_64(prompts)
-  # show_images(samples)
-  # upsample from 64x64 to 256x256
-  upsamples = upsampling_256(prompts, samples)
-  # show_images(upsamples)
-  out_img = upsamples[0].permute(1,2,0)
-  out_img = (out_img+1)/2
-  out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
-  out_img = out_img.numpy()
-  return out_img
 # create model for CLEVR Objects
 clevr_options = model_and_diffusion_defaults_for_clevr()
@@ -219,24 +218,24 @@ flags = {
 }
 for key, val in flags.items():
-  clevr_options[key] = val
 clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
 clevr_model.eval()
 if has_cuda:
     clevr_model.convert_to_fp16()
 clevr_model.to(device)
 clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
 print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))
 def compose_clevr_objects(prompt, guidance_scale):
-    print(prompt)
-    coordinates = [[float(x.split(',')[0].strip()), float(x.split(',')[1].strip())]
-               for x in prompt.split('|')]
-    coordinates += [[-1, -1]] # add unconditional score label
     batch_size = 1
     def model_fn(x_t, ts, **kwargs):
         half = x_t[:1]
         combined = th.cat([half] * kwargs['y'].size(0), dim=0)
@@ -248,7 +247,7 @@ def compose_clevr_objects(prompt, guidance_scale):
         half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
         eps = th.cat([half_eps] * x_t.size(0), dim=0)
         return th.cat([eps, rest], dim=1)
     def sample(coordinates):
         masks = [True] * (len(coordinates) - 1) + [False]
         model_kwargs = dict(
@@ -257,21 +256,23 @@ def compose_clevr_objects(prompt, guidance_scale):
         )
         samples = clevr_diffusion.p_sample_loop(
             model_fn,
-            (len(coordinates), 3, options["image_size"], options["image_size"]),
             device=device,
             clip_denoised=True,
             progress=True,
             model_kwargs=model_kwargs,
             cond_fn=None,
         )[:batch_size]
         return samples
     samples = sample(coordinates)
-    out_img = samples[0].permute(1,2,0)
-    out_img = (out_img+1)/2
     out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
     out_img = out_img.numpy()
     return out_img
@@ -281,6 +282,7 @@ def compose(prompt, version, guidance_scale):
     else:
         return compose_clevr_objects(prompt, guidance_scale)
 examples_1 = 'a camel | a forest'
 examples_2 = 'A cloudy blue sky  | A mountain in the horizon | Cherry Blossoms in front of the mountain'
 examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
@@ -289,8 +291,9 @@ examples = [[examples_1, 'GLIDE', 10], [examples_2, 'GLIDE', 10], [examples_3, '
 import gradio as gr
 title = 'Compositional Visual Generation with Composable Diffusion Models'
-description = '<p>Demo for Composable Diffusion (~20s per example if gpu is used, otherwise it will take quite a bit of time.)</p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing  multiple sentences, use `|` as the delimiter, see given examples below.</p>'
-iface = gr.Interface(compose, inputs=["text", gr.inputs.Radio(['GLIDE','CLEVR Objects'], type="value", default='GLIDE', label='version'), gr.Slider(1, 10)], outputs='image', title=title, description=description, examples=examples)
 iface.launch()

     https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
 """
 # from PIL import Image
 # from IPython.display import display
 import torch as th
 from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr
+from PIL import Image
 # This notebook supports both CPU and GPU.
 # On CPU, generating one sample may take on the order of 20 minutes.
 # On a GPU, it should be under a minute.
 print(device)
 # Create base model.
+timestep_respacing = 100  # @param{type: 'number'}
 options = model_and_diffusion_defaults()
 options['use_fp16'] = has_cuda
+options['timestep_respacing'] = str(timestep_respacing)  # use 100 diffusion steps for fast sampling
 model, diffusion = create_model_and_diffusion(**options)
 model.eval()
 if has_cuda:
 # Create upsampler model.
 options_up = model_and_diffusion_defaults_upsampler()
 options_up['use_fp16'] = has_cuda
+options_up['timestep_respacing'] = 'fast27'  # use 27 diffusion steps for very fast sampling
 model_up, diffusion_up = create_model_and_diffusion(**options_up)
 model_up.eval()
 if has_cuda:
 model_up.load_state_dict(load_checkpoint('upsample', device))
 print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
 def show_images(batch: th.Tensor):
     """ Display a batch of images inline. """
+    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
     reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
     display(Image.fromarray(reshaped.numpy()))
 def compose_language_descriptions(prompt, guidance_scale):
+    # @markdown `prompt`: when composing multiple sentences, use `|` as the delimiter.
+    prompts = [x.strip() for x in prompt.split('|')]
+    batch_size = 1
+    # Tune this parameter to control the sharpness of 256x256 images.
+    # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
+    upsample_temp = 0.980  # @param{type: 'number'}
+    masks = [True] * len(prompts) + [False]
+    # coefficients = th.tensor([0.5, 0.5], device=device).reshape(-1, 1, 1, 1)
+    masks = th.tensor(masks, dtype=th.bool, device=device)
+    # sampling function
+    def model_fn(x_t, ts, **kwargs):
+        half = x_t[:1]
+        combined = th.cat([half] * x_t.size(0), dim=0)
+        model_out = model(combined, ts, **kwargs)
+        eps, rest = model_out[:, :3], model_out[:, 3:]
+        cond_eps = eps[masks].mean(dim=0, keepdim=True)
+        # cond_eps = (coefficients * eps[masks]).sum(dim=0)[None]
+        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
+        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
+        eps = th.cat([half_eps] * x_t.size(0), dim=0)
+        return th.cat([eps, rest], dim=1)
+    ##############################
+    # Sample from the base model #
+    ##############################
+    # Create the text tokens to feed to the model.
+    def sample_64(prompts):
+        tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
+        outputs = [model.tokenizer.padded_tokens_and_mask(
+            tokens, options['text_ctx']
+        ) for tokens in tokens_list]
+        cond_tokens, cond_masks = zip(*outputs)
+        cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)
+        full_batch_size = batch_size * (len(prompts) + 1)
+        uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
+            [], options['text_ctx']
+        )
+        # Pack the tokens together into model kwargs.
+        model_kwargs = dict(
+            tokens=th.tensor(
+                cond_tokens + [uncond_tokens], device=device
+            ),
+            mask=th.tensor(
+                cond_masks + [uncond_mask],
+                dtype=th.bool,
+                device=device,
+            ),
+        )
+        # Sample from the base model.
+        model.del_cache()
+        samples = diffusion.p_sample_loop(
+            model_fn,
+            (full_batch_size, 3, options["image_size"], options["image_size"]),
             device=device,
+            clip_denoised=True,
+            progress=True,
+            model_kwargs=model_kwargs,
+            cond_fn=None,
+        )[:batch_size]
+        model.del_cache()
+        # Show the output
+        return samples
+    ##############################
+    # Upsample the 64x64 samples #
+    ##############################
+    def upsampling_256(prompts, samples):
+        tokens = model_up.tokenizer.encode("".join(prompts))
+        tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
+            tokens, options_up['text_ctx']
+        )
+        # Create the model conditioning dict.
+        model_kwargs = dict(
+            # Low-res image to upsample.
+            low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
+            # Text tokens
+            tokens=th.tensor(
+                [tokens] * batch_size, device=device
+            ),
+            mask=th.tensor(
+                [mask] * batch_size,
+                dtype=th.bool,
+                device=device,
+            ),
+        )
+        # Sample from the upsampler model.
+        model_up.del_cache()
+        up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
+        up_samples = diffusion_up.ddim_sample_loop(
+            model_up,
+            up_shape,
+            noise=th.randn(up_shape, device=device) * upsample_temp,
             device=device,
+            clip_denoised=True,
+            progress=True,
+            model_kwargs=model_kwargs,
+            cond_fn=None,
+        )[:batch_size]
+        model_up.del_cache()
+        # Show the output
+        return up_samples
+    # sampling 64x64 images
+    samples = sample_64(prompts)
+    # show_images(samples)
+    # upsample from 64x64 to 256x256
+    upsamples = upsampling_256(prompts, samples)
+    # show_images(upsamples)
+    out_img = upsamples[0].permute(1, 2, 0)
+    out_img = (out_img + 1) / 2
+    out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
+    out_img = out_img.numpy()
+    return out_img
 # create model for CLEVR Objects
 clevr_options = model_and_diffusion_defaults_for_clevr()
 }
 for key, val in flags.items():
+    clevr_options[key] = val
 clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
 clevr_model.eval()
 if has_cuda:
     clevr_model.convert_to_fp16()
 clevr_model.to(device)
 clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
 print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))
 def compose_clevr_objects(prompt, guidance_scale):
+    coordinates = [[float(x.split(',')[0].strip()), float(x.split(',')[1].strip())]
+                   for x in prompt.split('|')]
+    coordinates += [[-1, -1]]  # add unconditional score label
     batch_size = 1
     def model_fn(x_t, ts, **kwargs):
         half = x_t[:1]
         combined = th.cat([half] * kwargs['y'].size(0), dim=0)
         half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
         eps = th.cat([half_eps] * x_t.size(0), dim=0)
         return th.cat([eps, rest], dim=1)
     def sample(coordinates):
         masks = [True] * (len(coordinates) - 1) + [False]
         model_kwargs = dict(
         )
         samples = clevr_diffusion.p_sample_loop(
             model_fn,
+            (len(coordinates), 3, clevr_options["image_size"], clevr_options["image_size"]),
             device=device,
             clip_denoised=True,
             progress=True,
             model_kwargs=model_kwargs,
             cond_fn=None,
         )[:batch_size]
         return samples
     samples = sample(coordinates)
+    out_img = samples[0].permute(1, 2, 0)
+    out_img = (out_img + 1) / 2
     out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
     out_img = out_img.numpy()
+    Image.fromarray(out_img).convert('RGB').save('test.png')
     return out_img
     else:
         return compose_clevr_objects(prompt, guidance_scale)
 examples_1 = 'a camel | a forest'
 examples_2 = 'A cloudy blue sky  | A mountain in the horizon | Cherry Blossoms in front of the mountain'
 examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
 import gradio as gr
 title = 'Compositional Visual Generation with Composable Diffusion Models'
+description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is measured by per example if gpu is used, otherwise it will take quite a bit of time.)</ul></p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing  multiple sentences, use `|` as the delimiter, see given examples below.</p>'
+iface = gr.Interface(compose, inputs=["text", gr.Radio(['GLIDE', 'CLEVR Objects'], type="value", label='version'), gr.Slider(1, 20)], outputs='image',
+                     title=title, description=description, examples=examples)
 iface.launch()