---
license: apache-2.0
pipeline_tag: text-to-image
---

# VQ-VAE Preview

The decoder renders a preview image from a predicted sequence of tokens (integer ids). It is compatible with Gemma/Llama/Qwen models and the [Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer) class.

![](images/preview.png)

## Inference

```python
import torch
from diffusers import VQModel
from PIL import Image
from torchvision import transforms


def image_to_indices(image, vq):
    # Map the image to [-1, 1], as expected by the VQ encoder.
    normalize = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ])
    assert image.width == image.height
    norm = normalize(image).unsqueeze(0).to(vq.device)
    latent = vq.encoder(norm)
    y = vq.quant_conv(latent)
    y = y.permute(0, 2, 3, 1).contiguous()
    # Nearest codebook entry for every spatial position.
    indices = torch.argmin(
        torch.cdist(y.view(-1, vq.quantize.vq_embed_dim), vq.quantize.embedding.weight),
        dim=1
    )
    indices = indices.unsqueeze(0)
    assert indices.shape[0] == norm.shape[0]
    return indices, norm


def norm_to_tensor(indices, vq, ada):
    # `ada` is the AdaLayerNorm applied to the looked-up codebook vectors
    # before decoding.
    b = indices.shape[0]
    w = int(indices.shape[-1] ** 0.5)  # square latent grid
    v = vq.quantize.embedding(indices)
    v = ada(v)
    v = v.view((b, w, w, vq.config.latent_channels))
    v = v.permute(0, 3, 1, 2).contiguous()
    x = vq.post_quant_conv(v)
    y = vq.decoder(x, None)
    return y


if __name__ == '__main__':
    vq = VQModel.from_pretrained('MeissonFlow/Meissonic', subfolder='vqvae')
    vq.to('cuda')
    vq.eval()
    # `ada` is the trained AdaLayerNorm; loading it is project-specific
    # and not shown here.
    ada = ...
    image = Image.open('path/to/image.png')
    with torch.no_grad():
        # Encode the image to integer ids.
        ids, _ = image_to_indices(image, vq)
        # Decode the integer ids back to pixel values.
        pixel_values = norm_to_tensor(ids, vq, ada)
    # Min-max normalize into [0, 1] for display.
    y_min, y_max = pixel_values.min(), pixel_values.max()
    y = (pixel_values - y_min) / (y_max - y_min)
    preview = transforms.functional.to_pil_image(y.squeeze(0).to('cpu'))
```

## Datasets

- Chars/pixiv_rank_daily_2018_2023
- jordandavis/fashion_num_people
- mattmdjaga/human_parsing_dataset
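## Sanity check: codebook lookup

The manual `cdist`/`argmin` lookup in `image_to_indices` mirrors what diffusers' `VectorQuantizer` does internally. As an optional check, a minimal sketch (assuming the inference snippet above is in scope and `image` is a square image): the quantizer's own indices should match the manual ones.

```python
import torch

# VectorQuantizer.forward returns (z_q, loss, (perplexity, encodings, indices));
# the indices should agree with the manual cdist/argmin lookup.
with torch.no_grad():
    ids, norm = image_to_indices(image, vq)
    h = vq.quant_conv(vq.encoder(norm))
    _, _, (_, _, ref_ids) = vq.quantize(h)
assert torch.equal(ids.view(-1), ref_ids.view(-1))
```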
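## Previewing language-model predictions

Since the decoder is meant to preview token sequences predicted by an LM, a helper along these lines can turn raw predictions into an image. This is a sketch, not part of this repository: `preview_from_lm_ids` and its `vq_offset` parameter (which shifts LM vocabulary ids back into the codebook range) are hypothetical, and it reuses `norm_to_tensor` from the inference snippet above.

```python
import torch
from torchvision import transforms


def preview_from_lm_ids(ids, vq, ada, vq_offset=0):
    # Hypothetical helper: `ids` is a (1, w*w) tensor of LM-predicted token
    # ids for a square latent grid; `vq_offset` (an assumption) shifts them
    # back into the [0, codebook_size) range of the VQ codebook.
    codes = (ids - vq_offset).clamp(0, vq.quantize.n_e - 1)
    with torch.no_grad():
        y = norm_to_tensor(codes, vq, ada)  # from the inference snippet
    y = (y - y.min()) / (y.max() - y.min())  # min-max into [0, 1]
    return transforms.functional.to_pil_image(y.squeeze(0).to('cpu'))
```

Within a training run, this could be invoked from a `transformers.TrainerCallback` hook such as `on_evaluate`, using the ids the language model generated for the image span.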