tuandunghcmut committed
Commit 64f97f7 · verified · 1 Parent(s): 7ee2e81

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. VLM2Vec/evaluation/__init__.py +0 -0
  2. VLM2Vec/evaluation/eval_blip.py +209 -0
  3. VLM2Vec/evaluation/eval_clip.py +185 -0
  4. VLM2Vec/evaluation/eval_openclip.py +185 -0
  5. VLM2Vec/evaluation/eval_siglip.py +186 -0
  6. VLM2Vec/src/dist_utils.py +92 -0
  7. VLMEvalKit_old/PaddleMIX/deploy/README.md +110 -0
  8. VLMEvalKit_old/PaddleMIX/deploy/README_en.md +108 -0
  9. VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md +44 -0
  10. VLMEvalKit_old/PaddleMIX/docs/FAQ.md +0 -0
  11. VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md +10 -0
  12. VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE +203 -0
  13. VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile +30 -0
  14. VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md +1278 -0
  15. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py +263 -0
  16. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh +32 -0
  17. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh +26 -0
  18. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh +26 -0
  19. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py +205 -0
  20. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py +408 -0
  21. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py +357 -0
  22. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py +417 -0
  23. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md +77 -0
  24. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py +264 -0
  25. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py +325 -0
  26. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh +32 -0
  27. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh +21 -0
  28. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py +149 -0
  29. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md +44 -0
  30. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py +15 -0
  31. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py +153 -0
  32. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py +68 -0
  33. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py +235 -0
  34. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py +155 -0
  35. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py +615 -0
  36. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py +28 -0
  37. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py +124 -0
  38. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py +77 -0
  39. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py +778 -0
  40. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py +717 -0
  41. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py +162 -0
  42. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py +128 -0
  43. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py +58 -0
  44. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py +59 -0
  45. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py +17 -0
  46. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py +151 -0
  47. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py +561 -0
  48. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py +83 -0
  49. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py +80 -0
  50. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py +206 -0
VLM2Vec/evaluation/__init__.py ADDED
File without changes
VLM2Vec/evaluation/eval_blip.py ADDED
@@ -0,0 +1,209 @@
+# https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_feature_extraction.ipynb
+# https://medium.com/@enrico.randellini/image-and-text-features-extraction-with-blip-and-blip-2-how-to-build-a-multimodal-search-engine-a4ceabf51fbe
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor
+from src.dataset import EvalDataset
+from evaluation.collator import EvalCollator, BLIP2Collator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+from lavis.models import load_model_and_preprocess
+
+t2i_tasks = [
+    "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def get_pred_blip(qry_t, tgt_t, mode="multimodal2text"):
+
+    if mode == "multimodal2text":
+        # Compute the dot product between each token in qry_t (shape 32, dim) and tgt_t (shape candidate_num, dim)
+        # This results in a (32, candidate_num) array of scores
+        scores = np.dot(qry_t, tgt_t.T)  # (32, dim) dot (candidate_num, dim).T -> (32, candidate_num)
+
+        # Find the maximum score for each candidate across the 32 tokens
+        max_scores = np.max(scores, axis=0)  # Max along the 32 tokens for each candidate (shape candidate_num)
+
+        # The prediction is the index of the target with the highest maximum score
+        pred = np.argmax(max_scores)
+
+    elif mode == "text2multimodal":
+        # Compute the dot product between qry_t (shape dim) and each of the 32 tokens in the target (candidate_num, 32, dim)
+        # This results in a (candidate_num, 32) array of scores
+        scores = np.dot(tgt_t, qry_t)  # (candidate_num, 32, dim) dot (dim) -> (candidate_num, 32)
+
+        # Find the maximum score for each candidate across the 32 tokens
+        max_scores = np.max(scores, axis=1)  # Max along the 32 tokens for each candidate (shape candidate_num)
+
+        # The prediction is the index of the target with the highest maximum score
+        pred = np.argmax(max_scores)
+
+    return max_scores, pred
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+    model, vis_processors, txt_processors = load_model_and_preprocess(name=model_args.model_name, model_type=model_args.model_type, is_eval=True, device=training_args.device)
+    embedding_type = data_args.embedding_type
+    eval_collator = BLIP2Collator(
+        data_args=data_args,
+        vis_processors=vis_processors,
+        txt_processors=txt_processors
+    )
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                samples, modes = batch
+                for sample, mode in zip(samples, modes):
+                    image_features, text_features = None, None
+                    if sample["image"] is not None:
+                        sample["image"] = sample["image"].to(training_args.device)
+                        image_features = model.extract_features(sample, mode="image").image_embeds[0,0,:]  # (dim,)
+                    if sample["text_input"]:
+                        text_features = model.extract_features(sample, mode="text").text_embeds[0,0,:]  # (dim,)
+                    if embedding_type=="unimodal":
+                        if subset in t2i_tasks:
+                            features = text_features
+                        if subset in i2t_tasks:
+                            features = image_features
+                    elif embedding_type=="multimodal":
+                        if image_features is None:
+                            features = text_features
+                        elif text_features is None:
+                            features = image_features
+                        else:
+                            features = image_features + text_features
+                    encoded_tensor.append(features.cpu().detach().float().numpy())
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                samples, modes = batch
+                for sample, mode in zip(samples, modes):
+                    image_features, text_features = None, None
+                    if sample["image"] is not None:
+                        sample["image"] = sample["image"].to(training_args.device)
+                        image_features = model.extract_features(sample, mode="image").image_embeds[0,0,:]  # (dim,)
+                    if sample["text_input"]:
+                        text_features = model.extract_features(sample, mode="text").text_embeds[0,0,:]  # (dim,)
+                    if embedding_type=="unimodal":
+                        if subset in t2i_tasks:
+                            features = image_features
+                        if subset in i2t_tasks:
+                            features = text_features
+                    elif embedding_type=="multimodal":
+                        if image_features is None:
+                            features = text_features
+                        elif text_features is None:
+                            features = image_features
+                        else:
+                            features = image_features + text_features
+                    encoded_tensor.append(features.cpu().detach().float().numpy())
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
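The scoring rule in `get_pred_blip` above ranks each candidate by its best match over BLIP-2's 32 query-token embeddings. A minimal, self-contained numpy sketch of the `multimodal2text` branch (the toy shapes and random data are illustrative only, not part of the commit):

```python
import numpy as np

rng = np.random.default_rng(0)
qry_t = rng.standard_normal((32, 768))   # 32 BLIP-2 query tokens, one embedding each
tgt_t = rng.standard_normal((5, 768))    # 5 candidate embeddings

scores = qry_t @ tgt_t.T                 # (32, 5): every query token vs. every candidate
max_scores = scores.max(axis=0)          # best-matching query token per candidate
pred = int(np.argmax(max_scores))        # the eval loop counts a hit when pred == 0 (first candidate is the positive)
print(max_scores.shape, pred)
```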
VLM2Vec/evaluation/eval_clip.py ADDED
@@ -0,0 +1,185 @@
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel
+from src.dataset import EvalDataset
+from src.collator import CLIPCollator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+
+t2i_tasks = [
+    "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ", "OVEN",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+
+    model = CLIPModel.from_pretrained(model_args.model_name)
+    processor = AutoProcessor.from_pretrained(model_args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name)
+
+    embedding_type = data_args.embedding_type
+    eval_collator = CLIPCollator(
+        data_args=data_args,
+        vis_processors=processor,
+        txt_processors=tokenizer
+    )
+    model.eval()
+    model = model.to(training_args.device)
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = text_features
+                    if subset in i2t_tasks:
+                        features = image_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        try:
+                            features = image_features + text_features
+                        except:
+                            import ipdb; ipdb.set_trace()
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)

+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = image_features
+                    if subset in i2t_tasks:
+                        features = text_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        features = image_features + text_features
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
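Note that `get_pred` is imported from `evaluation.eval_utils`, which is not among the files shown in this commit. A plausible minimal sketch of what the call above expects, assuming it simply ranks candidates by (optionally L2-normalized) dot product against the query embedding:

```python
import numpy as np

def get_pred(qry_t, tgt_t, normalization=False):
    """Sketch only, not the committed implementation: score (num_candidate, dim) targets against a (dim,) query."""
    if normalization:
        qry_t = qry_t / np.linalg.norm(qry_t)
        tgt_t = tgt_t / np.linalg.norm(tgt_t, axis=-1, keepdims=True)
    scores = tgt_t @ qry_t            # (num_candidate,) cosine/dot-product scores
    return scores, int(np.argmax(scores))
```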
VLM2Vec/evaluation/eval_openclip.py ADDED
@@ -0,0 +1,185 @@
+import open_clip
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel
+from src.dataset import EvalDataset
+from src.collator import EvalCollator, BLIP2Collator, CLIPCollator, OpenCLIPCollator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+
+t2i_tasks = [
+    "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ", "OVEN",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+
+    model, processor = open_clip.create_model_from_pretrained('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K')
+    tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K')
+
+    embedding_type = data_args.embedding_type
+    eval_collator = OpenCLIPCollator(
+        data_args=data_args,
+        vis_processors=processor,
+        txt_processors=tokenizer
+    )
+    model.eval()
+    model = model.to(training_args.device)
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.encode_image(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.encode_text(batch["input_ids"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = text_features
+                    if subset in i2t_tasks:
+                        features = image_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        try:
+                            features = image_features + text_features
+                        except:
+                            import ipdb; ipdb.set_trace()
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)

+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.encode_image(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.encode_text(batch["input_ids"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = image_features
+                    if subset in i2t_tasks:
+                        features = text_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        features = image_features + text_features
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
VLM2Vec/evaluation/eval_siglip.py ADDED
@@ -0,0 +1,186 @@
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel, AutoModel
+from src.dataset import EvalDataset
+from src.collator import EvalCollator, BLIP2Collator, CLIPCollator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+
+t2i_tasks = [
+    "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ", "OVEN",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+
+    model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384")
+    all_processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384")
+    processor = all_processor.image_processor
+    tokenizer = all_processor.tokenizer
+
+    embedding_type = data_args.embedding_type
+    eval_collator = CLIPCollator(
+        data_args=data_args,
+        vis_processors=processor,
+        txt_processors=tokenizer
+    )
+    model.eval()
+    model = model.to(training_args.device)
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = text_features
+                    if subset in i2t_tasks:
+                        features = image_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        try:
+                            features = image_features + text_features
+                        except:
+                            import ipdb; ipdb.set_trace()
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)

+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = image_features
+                    if subset in i2t_tasks:
+                        features = text_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        features = image_features + text_features
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
VLM2Vec/src/dist_utils.py ADDED
@@ -0,0 +1,92 @@
+# Code adapted from SimCSE (https://github.com/princeton-nlp/SimCSE) governed by MIT license.
+
+# Copyright (c) 2023, Salesforce, Inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+import torch
+import torch.distributed as dist
+
+class GatherLayer(torch.autograd.Function):
+    """
+    Gather tensors from all process, supporting backward propagation.
+    https://github.com/Spijkervet/SimCLR/blob/master/simclr/modules/gather.py
+    """
+    @staticmethod
+    def forward(ctx, input):
+        ctx.save_for_backward(input)
+        output = [torch.zeros_like(input) for _ in range(dist.get_world_size())]
+        dist.all_gather(output, input)
+        return tuple(output)
+
+    @staticmethod
+    def backward(ctx, *grads):
+        (input,) = ctx.saved_tensors
+        grad_out = torch.zeros_like(input)
+        grad_out[:] = grads[dist.get_rank()]
+        return grad_out
+
+
+def dist_gather(x: torch.tensor):
+    if not dist.is_initialized(): return x
+    if len(x.shape) == 0:
+        x = x.reshape(1)
+    x_gather = GatherLayer.apply(x)
+    x_gather = torch.cat(x_gather, dim=0)
+    return x_gather
+
+
+@torch.no_grad()
+def dist_gather_nograd(x: torch.tensor):
+    if not dist.is_initialized(): return x
+    x_gather = [torch.ones_like(x) for _ in range(get_world_size())]
+    dist.all_gather(x_gather, x, async_op=False)
+    x_gather = torch.cat(x_gather, dim=0)
+    return x_gather
+
+
+def get_rank():
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main():
+    return get_rank() == 0
+
+
+def get_world_size():
+    if not dist.is_initialized():
+        return 1
+    else:
+        return dist.get_world_size()
+
+def barrier():
+    if dist.is_initialized():
+        dist.barrier()
+
+
+@torch.no_grad()
+def varsize_gather_nograd(x: torch.Tensor):
+    """gather tensors of different sizes along the first dimension"""
+    if not dist.is_initialized():
+        return x
+
+    # determine max size
+    size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
+    allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
+    dist.all_gather(allsizes, size)
+    max_size = max([size.cpu().max() for size in allsizes])
+
+    padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device)
+    padded[: x.shape[0]] = x
+    output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())]
+    dist.all_gather(output, padded)
+
+    output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)]
+    output = torch.cat(output, dim=0)
+
+    return output
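`dist_gather` above is what lets contrastive training contrast against targets from every GPU while keeping gradients, via `GatherLayer`. A minimal usage sketch (the loss function, temperature, and tensor names here are illustrative assumptions, not code from this commit):

```python
import torch
import torch.nn.functional as F
from src.dist_utils import dist_gather, get_rank

def inbatch_contrastive_loss(qry_reps: torch.Tensor, tgt_reps: torch.Tensor, temperature: float = 0.02):
    # Gather target embeddings from all ranks; falls back to a no-op when torch.distributed is not initialized.
    all_tgt = dist_gather(tgt_reps)                      # (world_size * local_bsz, dim), gradients preserved
    scores = qry_reps @ all_tgt.T / temperature          # (local_bsz, world_size * local_bsz)
    offset = get_rank() * tgt_reps.size(0)               # this rank's slice inside the gathered batch
    labels = torch.arange(tgt_reps.size(0), device=scores.device) + offset
    return F.cross_entropy(scores, labels)
```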
VLMEvalKit_old/PaddleMIX/deploy/README.md ADDED
@@ -0,0 +1,110 @@
+# PaddleMIX Inference Deployment
+
+[[English](README_en.md)]
+
+Built on Paddle Inference, PaddleMIX provides a Python deployment solution. There are two deployment methods:
+- Through **APPflow**: set the static_mode = True variable to enable static-graph inference, optionally combined with TensorRT to accelerate inference. Some models do not support static graph or TensorRT with this method; see [Cross-modal multi-scenario applications](../applications/README.md/#跨模态多场景应用) for the specific models;
+
+- Single-model deployment
+
+
+## 1. APPflow Deployment
+
+When using PaddleMIX one-click prediction via **APPflow**, set the static_mode = True variable to enable static-graph inference, optionally combined with TensorRT to accelerate inference.
+
+### 1.1 Example
+
+```python
+>>> from paddlemix.appflow import Appflow
+>>> from PIL import Image
+
+>>> task = Appflow(app="openset_det_sam",
+                   models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
+                   static_mode=True,
+                   precision="fp32")
+>>> image_pil = Image.open("beauty.png").convert("RGB")
+>>> result = task(image=image_pil,prompt="women")
+```
+
+### 1.2 Parameters
+| Parameter | Required | Meaning |
+|-------|-------|---------------------------------------------------------------------------------------------|
+| --app | Yes | Application name |
+| --models | Yes | Model(s) to use; a single model or a combination of several |
+| --static_mode | Optional | Whether to use static-graph inference; defaults to False |
+| --precision | Optional | Used when static_mode == True; defaults to fp32, trt_fp32 and trt_fp16 are also available |
+
+Notes:
+- Some models do not support static graph or TensorRT; see [Cross-modal multi-scenario applications](../applications/README.md) for details
+- The generated static graph is placed in the folder named after the model, e.g. GroundingDino/groundingdino-swint-ogc/
+
+
+## 2. Single-Model Prediction Deployment
+
+Python prediction deployment involves two steps:
+- Export the prediction model
+- Run prediction with Python
+
+Currently supported models:
+- [blip2](./blip2/README.md)
+- [groundingdino](./groundingdino/README.md)
+- [sam](./sam/README.md)
+- [qwen_vl](./qwen_vl/README.md)
+
+The following uses groundingdino as an example.
+
+### 2.1 Export the Prediction Model
+
+```bash
+cd deploy/groundingdino
+# Export the groundingdino model
+python export.py \
+--dino_type GroundingDino/groundingdino-swint-ogc
+```
+After export, the directory contains `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`, and other files.
+
+### 2.2 Python-based Prediction
+
+```bash
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus"
+
+```
+
+## 3. Inference Benchmark
+
+> Note:
+> Test environment:
+Paddle 3.0,
+PaddleMIX release/2.0,
+PaddleNLP 2.7.2,
+single A100 80G.
+
+### 3.1 Benchmark command
+
+Append --benchmark to the run command in the corresponding model directory under `deploy`;
+for example, the benchmark command for GroundingDino is:
+
+```bash
+cd deploy/groundingdino
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus" \
+--benchmark True
+```
+
+# A100 performance data
+|Model|Image resolution|Data type|Paddle Deploy|
+|-|-|-|-|
+|qwen-vl-7b|448*448|fp16|669.8 ms|
+|llava-1.5-7b|336*336|fp16|981.2 ms|
+|llava-1.6-7b|336*336|fp16|778.7 ms|
+|groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms|
+|Sam/SamVitH-1024|1024*1024|fp32|121 ms|
VLMEvalKit_old/PaddleMIX/deploy/README_en.md ADDED
@@ -0,0 +1,108 @@
+# PaddleMIX Inference Deployment
+
+[[中文文档](README.md)]
+
+PaddleMIX utilizes Paddle Inference and provides a Python-based deployment solution. There are two deployment methods:
+
+1. **APPflow Deployment**:
+   - By setting the `static_mode = True` variable in APPflow, you can enable static graph inference. Additionally, you can accelerate inference using TensorRT. Note that not all models support static graph or TensorRT. Please refer to the [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario) section for specific model support.
+
+2. **Single Model Deployment**
+
+## 1. APPflow Deployment
+
+For APPflow usage, you can set the `static_mode = True` variable to enable static graph inference and optionally accelerate inference using TensorRT.
+
+### 1.1 Examples
+
+```python
+>>> from paddlemix.appflow import Appflow
+>>> from PIL import Image
+
+>>> task = Appflow(app="openset_det_sam",
+                   models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
+                   static_mode=True,
+                   precision="fp32")
+>>> image_pil = Image.open("beauty.png").convert("RGB")
+>>> result = task(image=image_pil,prompt="women")
+```
+
+### 1.2 Parameter Explanation
+| Parameter | Required? | Meaning |
+|-------|-------|---------------------------------------------------------------------------------------------|
+| --app | Yes | Application name |
+| --models | Yes | Model(s) used. Can be one model, or multiple models |
+| --static_mode | Optional | Whether to use static graph inference, defaults to False |
+| --precision | Optional | When `static_mode == True`, it defaults to FP32. You can optionally select `trt_fp32` or `trt_fp16`. |
+
+Instructions:
+- Some models do not support static graph or TensorRT. For specific information, please refer to [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario).
+
+- The generated static graph will be located in the folder corresponding to the model name, for example: `GroundingDino/groundingdino-swint-ogc/`.
+
+## 2. Single Model Prediction Deployment
+
+Python-based prediction deployment mainly involves two steps:
+- Exporting the predictive model
+- Performing prediction using Python
+
+Currently supported models:
+- [blip2](./blip2/README.md)
+- [groundingdino](./groundingdino/README.md)
+- [sam](./sam/README.md)
+- [qwen_vl](./qwen_vl/README.md)
+
+Using groundingdino as an example.
+
+### 2.1 Exporting the Predictive Model
+
+```bash
+cd deploy/groundingdino
+# Export the groundingdino model
+python export.py \
+--dino_type GroundingDino/groundingdino-swint-ogc
+```
+The export directory includes `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`, and other files.
+
+### 2.2 Python-based Inference
+
+```bash
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus"
+
+```
+
+## 3. Benchmark
+
+> Note:
+> Test environment:
+Paddle 3.0
+PaddleMIX release/2.0
+PaddleNLP 2.7.2
+A100 80G.
+
+### 3.1 Benchmark command
+
+Add --benchmark after the run command in the corresponding model directory under `deploy` to obtain the model's running time.
+Example: GroundingDino benchmark:
+
+```bash
+cd deploy/groundingdino
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus" \
+--benchmark True
+```
+
+|Model|image size|dtype |Paddle Deploy |
+|-|-|-|-|
+|qwen-vl-7b|448*448|fp16|669.8 ms|
+|llava-1.5-7b|336*336|fp16|981.2 ms|
+|llava-1.6-7b|336*336|fp16|778.7 ms|
+|groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms|
+|Sam/SamVitH-1024|1024*1024|fp32|121 ms|
VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md ADDED
@@ -0,0 +1,44 @@
+# Release Notes
+
+## Latest release
+
+### 2.0 (07/26/2024)
+
+#### Multimodal understanding
+
+1. New models: LLaVA v1.5-7b, v1.5-13b, v1.6-7b, CogAgent, CogVLM, Qwen-VL, InternLM-XComposer2
+2. Dataset enhancements: added the chatml_dataset reader for image-text dialogue data, with support for custom chat_template files and mixed datasets
+3. Toolchain upgrades: added the Auto module to unify the SFT training workflow, compatible with full-parameter and LoRA training; added the mixtoken training strategy, raising SFT throughput by 5.6x; added inference deployment for Qwen-VL and LLaVA, with 2.38x better inference performance than torch
+
+#### Multimodal generation
+
+1. Video generation: support for Sora-related techniques, with DiT, SiT, and UViT training and inference, plus new NaViT and MAGVIT-v2 models; new video generation models SVD and Open Sora with fine-tuning and inference; new pose-controllable video generation model AnimateAnyone, plug-and-play video generation model AnimateDiff, and GIF video generation model Hotshot-XL
+2. Text-to-image model zoo: added the fast-inference text-to-image model LCM, adapted for SD/SDXL training and inference
+3. Toolchain upgrades: released ppdiffusers 0.24.1 with peft and accelerate backends; fully upgraded weight loading/saving, supporting distributed training, model sharding, safetensors, and more
+4. Ecosystem compatibility: provided a ComfyUI plugin built on ppdiffusers supporting common model loading and conversion, text-to-image, image-to-image, and local image editing tasks; added Stable Diffusion 1.5 series nodes, Stable Diffusion XL series nodes, and 4 image-generation workflow examples
+
+#### DataCopilot (multimodal data processing toolbox)
+
+1. MMDataset, a multimodal dataset type that loads and exports Json, H5, Jsonl, and other storage formats, with built-in concurrent (map, filter) data processing interfaces
+2. Multimodal data format tools supporting custom data structures, data conversion, and offline format checks
+3. Multimodal data analysis tools supporting basic statistics, data visualization, and registration of custom functions
+
+### 1.0 (11/15/2023)
+
+#### Core capabilities
+
+1. Large-scale pretraining: BLIP-2 supports data parallelism, sharding, model parallelism, and pipeline parallelism, enabling training at the hundred-billion-parameter scale; EVA-CLIP supports data parallelism, sharding, and model parallelism; Stable Diffusion supports data parallelism, sharding, and BF16 O2 training; CLIP and Coca support data-parallel training
+2. Supervised fine-tuning: Stable Diffusion and SDXL support LoRA fine-tuning
+3. Inference deployment: BLIP-2, miniGPT-4, Grounding DINO, SAM, and Stable Diffusion support dynamic-to-static export and deployment
+
+#### New models
+1. New CLIP-family cross-modal models: CLIP, EVA-CLIP, Coca
+2. New image-to-text cross-modal models: BLIP-2, miniGPT-4, VisualGLM
+3. New cross-modal vision models: Grounding DINO, SAM
+4. New multi-modality fusion model: ImageBind
+5. New text-to-image models: SDXL, supporting Text2Image, Img2Img, Inpainting, and InstructPix2Pix tasks plus DreamBooth LoRA training; UniDiffuser, supporting text-to-image and image-to-text through a unified multimodal diffusion process; the text-conditioned video generation model LVDM with training and inference; the text-to-image models Kandinsky 2.2 and Consistency Models; ControlNet upgrades supporting ControlNetImg2Img, ControlNetInpaint, StableDiffusionXLControlNet, and more
+
+#### Featured applications
+1. New cross-modal large-model application pipeline AppFlow
+2. New chat-based image editing application
+3. New auto-labeling application
VLMEvalKit_old/PaddleMIX/docs/FAQ.md ADDED
File without changes
VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md ADDED
@@ -0,0 +1,10 @@
+# Train Tutorial
+
+
+## Fine-tuning examples
+- [Blip2](../paddlemix/examples/blip2/README.md)
+- [clip](../paddlemix/examples/clip/README.md)
+- [coca](../paddlemix/examples/coca/README.md)
+- [eva02](../paddlemix/examples/eva02/README.md)
+- [evaclip](../paddlemix/examples/evaclip/README.md)
+- [Stable Diffusion](../ppdiffusers/examples/text_to_image/README.md)
VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE ADDED
@@ -0,0 +1,203 @@
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile ADDED
@@ -0,0 +1,30 @@
1
+
2
+ .DEFAULT_GOAL := all
3
+
4
+ .PHONY: all
5
+ all: deploy-version build deploy
6
+
7
+ .PHONY: build
8
+ build:
9
+ python3 setup.py sdist bdist_wheel
10
+
11
+ .PHONY: deploy
12
+ deploy:
13
+ make deploy-version
14
+ twine upload --skip-existing dist/*
15
+
16
+ .PHONY: deploy-version
17
+ deploy-version:
18
+ echo "VERSION = '$$(cat VERSION)'" > ppdiffusers/version.py
19
+
20
+ .PHONY: install
21
+ install:
22
+ pip install -r requirements.txt
23
+
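+ # Bump the patch number in VERSION, commit the change, and print the new version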
24
+ .PHONY: version
25
+ version:
26
+ @newVersion=$$(awk -F. '{print $$1"."$$2"."$$3+1}' < VERSION) \
27
+ && echo $${newVersion} > VERSION \
28
+ && git add VERSION \
29
+ && git commit -m "🔥 update version to $${newVersion}" > /dev/null \
30
+ && echo "Bumped version to $${newVersion}"
VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md ADDED
@@ -0,0 +1,1278 @@
1
+ <div align="center">
2
+ <img src="https://user-images.githubusercontent.com/11793384/215372703-4385f66a-abe4-44c7-9626-96b7b65270c8.png" width="40%" height="40%" />
3
+ </div>
4
+
5
+ <p align="center">
6
+ <a href="https://pypi.org/project/ppdiffusers/"><img src="https://img.shields.io/pypi/pyversions/ppdiffusers"></a>
7
+ <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-yellow.svg"></a>
8
+ <a href="https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
9
+ </p>
10
+
11
+ <h4 align="center">
12
+ <a href=#特性> 特性 </a> |
13
+ <a href=#安装> 安装 </a> |
14
+ <a href=#快速开始> 快速开始 </a> |
15
+ <a href=#模型部署> 模型部署</a>
16
+ </h4>
17
+
18
+ # PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle
19
+
20
+ **PPDiffusers**是一款支持多种模态(如文本图像跨模态、图像、语音)扩散模型(Diffusion Model)训练和推理的国产化工具箱,依托于[**PaddlePaddle**](https://www.paddlepaddle.org.cn/)框架和[**PaddleNLP**](https://github.com/PaddlePaddle/PaddleNLP)自然语言处理开发库。
21
+
22
+ ## News 📢
23
+ * 🔥 **2024.10.18 发布 0.29.0 版本,新增图像生成模型[Stable Diffusion 3 (SD3)](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/text_to_image/README_sd3.md),支持DreamBooth训练及高性能推理;SD3、SDXL适配昇腾910B,提供国产计算芯片上的训推能力;DIT支持[高性能推理](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/class_conditional_image_generation/DiT/README.md#23-paddle-inference-%E9%AB%98%E6%80%A7%E8%83%BD%E6%8E%A8%E7%90%86);支持PaddleNLP 3.0 beta版本。**
24
+
25
+ * 🔥 **2024.07.15 发布 0.24.1 版本,新增[Open-Sora](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/Open-Sora),支持模型训练和推理;全面支持Paddle 3.0。**
26
+
27
+ * 🔥 **2024.04.17 发布 0.24.0 版本,支持[Sora相关技术](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/sora),支持[DiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT)、[SiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT#exploring-flow-and-diffusion-based-generative-models-with-scalable-interpolant-transformers-sit)、[UViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_mscoco_uvit)训练推理,新增[NaViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/navit)、[MAGVIT-v2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/video_tokenizer/magvit2)模型;
28
+ 视频生成能力全面升级;
29
+ 新增视频生成模型[SVD](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_video_diffusion),支持模型微调和推理;
30
+ 新增姿态可控视频生成模型[AnimateAnyone](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/AnimateAnyone)、即插即用视频生成模型[AnimateDiff](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/inference/text_to_video_generation_animediff.py)、GIF视频生成模型[Hotshot-XL](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/community/Hotshot-XL);
31
+ 新增高速推理文图生成模型[LCM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/consistency_distillation),支持SD/SDXL训练和推理;
32
+ [模型推理部署](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)全面升级;新增peft,accelerate后端;
33
+ 权重加载/保存全面升级,支持分布式、模型切片、safetensors等场景,相关能力已集成DiT、 [IP-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/ip_adapter)、[PhotoMaker](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/PhotoMaker)、[InstantID](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/InstantID)等。**
34
+ * 🔥 **2023.12.12 发布 0.19.4 版本,修复已知的部分 BUG,修复 0D Tensor 的 Warning,新增 SDXL 的 FastdeployPipeline。**
35
+ * 🔥 **2023.09.27 发布 0.19.3 版本,新增[SDXL](#文本图像多模),支持Text2Image、Img2Img、Inpainting、InstructPix2Pix等任务,支持DreamBooth Lora训练;
36
+ 新增[UniDiffuser](#文本图像多模),通过统一的多模态扩散过程支持文生图、图生文等任务;
37
+ 新增文本条件视频生成模型[LVDM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_video_lvdm),支持训练与推理;
38
+ 新增文图生成模型[Kandinsky 2.2](#文本图像多模),[Consistency models](#文本图像多模);
39
+ Stable Diffusion支持[BF16 O2训练](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_diffusion),效果对齐FP32;
40
+ [LoRA加载升级](#加载HF-LoRA权重),支持加载SDXL的LoRA权重;
41
+ [Controlnet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/ppdiffusers/pipelines/controlnet)升级,支持ControlNetImg2Img、ControlNetInpaint、StableDiffusionXLControlNet等。**
42
+
43
+
44
+
45
+
46
+ ## 特性
47
+ #### 📦 SOTA扩散模型Pipelines集合
48
+ 我们提供**SOTA(State-of-the-Art)** 的扩散模型Pipelines集合。
49
+ 目前**PPDiffusers**已经集成了**100+Pipelines**,支持包括文图生成(Text-to-Image Generation)、文本引导的图像编辑(Text-Guided Image Inpainting)、文本引导的图像变换(Image-to-Image Text-Guided Generation)、文本条件的视频生成(Text-to-Video Generation)、超分(Super-Resolution)、文本条件的音频生成(Text-to-Audio Generation)在内的**10余项**任务,覆盖**文本、图像、视频、音频**等多种模态。
50
+ 如果想要了解当前支持的所有**Pipelines**以及对应的来源信息,可以阅读[🔥 PPDiffusers Pipelines](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/README.md)文档。
51
+
52
+
53
+ #### 🔊 提供丰富的Noise Scheduler
54
+ 我们提供了丰富的**噪声调度器(Noise Scheduler)**,可以对**速度**与**质量**进行权衡,用户可在推理时根据需求快速切换使用。
55
+ 当前**PPDiffusers**已经集成了**14+Scheduler**,不仅支持 [DDPM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py)、[DDIM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py) 和 [PNDM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py),还支持最新的 [🔥 DPMSolver](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py)!
56
+
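+ 下面给出一个切换调度器的简单示意(以 DPMSolver 为例;其中权重名 `runwayml/stable-diffusion-v1-5` 与推理步数仅作演示,可按需替换):
+ ```python
+ from ppdiffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
+
+ # 加载 pipeline(权重名仅作演示)
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+ # 用 DPMSolver 替换默认调度器,通常可以用更少的推理步数获得接近的生成质量
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+ image = pipe("a photo of an astronaut riding a horse on mars", num_inference_steps=25).images[0]
+ image.save("astronaut_dpmsolver.png")
+ ```
+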
57
+ #### 🎛️ 提供多种扩散模型组件
58
+ 我们提供了**多种扩散模型**组件,如[UNet1DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_1d.py)、[UNet2DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d.py)、[UNet2DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d_condition.py)、[UNet3DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_3d_condition.py)、[VQModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)、[AutoencoderKL](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)等。
59
+
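+ 下面是单独加载某个组件的简单示意(假设所用权重仓库包含相应的 `unet`、`vae` 子目录,权重名仅作演示):
+ ```python
+ from ppdiffusers import AutoencoderKL, UNet2DConditionModel
+
+ # 从同一个权重仓库中分别加载 UNet 与 VAE 组件(子目录名为常见约定,具体以所用权重为准)
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+ vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
+ print(unet.config.sample_size, vae.config.latent_channels)
+ ```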
60
+
61
+ #### 📖 提供丰富的训练和推理教程
62
+ 我们提供了丰富的训练教程,不仅支持扩散模型的二次开发微调,如基于[Textual Inversion](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/textual_inversion)和[DreamBooth](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/dreambooth)使用3-5张图定制化训练生成图像的风格或物体,还支持[🔥 Latent Diffusion Model](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_laion400m)、[🔥 ControlNet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/controlnet)、[🔥 T2I-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/t2i-adapter) 等扩散模型的训练!
63
+ 此外,我们还提供了丰富的[🔥 Pipelines推理样例](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/inference)。
64
+
65
+ #### 🚀 支持FastDeploy高性能部署
66
+ 我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)的[🔥 高性能Stable Diffusion Pipeline](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py),更多有关FastDeploy进行多推理引擎后端高性能部署的信息请参考[🔥 高性能FastDeploy推理教程](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)。
67
+
68
+ ## 安装
69
+
70
+ ### 环境依赖
71
+ ```
72
+ pip install -r requirements.txt
73
+ ```
74
+ 关于PaddlePaddle安装的详细教程请查看[Installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)。
75
+
76
+ ### pip安装
77
+
78
+ ```shell
79
+ pip install --upgrade ppdiffusers
80
+ ```
81
+
82
+ ### 手动安装
83
+ ```shell
84
+ git clone https://github.com/PaddlePaddle/PaddleMIX
85
+ cd PaddleMIX/ppdiffusers
86
+ python setup.py install
87
+ ```
88
+ ### 设置代理
89
+ ```shell
90
+ export HF_HUB_ENABLE_HF_TRANSFER=1
91
+ export HF_ENDPOINT=https://hf-mirror.com
92
+ ```
93
+
94
+ ## 快速开始
95
+ 我们将以扩散模型的典型代表**Stable Diffusion**为例,带你快速了解PPDiffusers。
96
+
97
+ **Stable Diffusion**基于**潜在扩散模型(Latent Diffusion Models)**,专门用于**文图生成(Text-to-Image Generation)任务**。该模型是由来自 [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/)以及[RunwayML](https://runwayml.com/)的工程师共同开发完成,目前发布了v1和v2两个版本。v1版本采用了LAION-5B数据集子集(分辨率为 512x512)进行训练,并具有以下架构设置:自动编码器下采样因子为8,UNet大小为860M,文本编码器为CLIP ViT-L/14。v2版本相较于v1版本在生成图像的质量和分辨率等进行了改善。
98
+
99
+ ### Stable Diffusion重点模型权重
100
+
101
+ <details><summary>&emsp; Stable Diffusion 模型支持的权重(英文) </summary>
102
+
103
+ **我们只需要将下面的"xxxx",替换成所需的权重名,即可快速使用!**
104
+ ```python
105
+ from ppdiffusers import *
106
+
107
+ pipe_text2img = StableDiffusionPipeline.from_pretrained("xxxx")
108
+ pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained("xxxx")
109
+ pipe_inpaint_legacy = StableDiffusionInpaintPipelineLegacy.from_pretrained("xxxx")
110
+ pipe_mega = StableDiffusionMegaPipeline.from_pretrained("xxxx")
111
+
112
+ # pipe_mega.text2img() 等于 pipe_text2img()
113
+ # pipe_mega.img2img() 等于 pipe_img2img()
114
+ # pipe_mega.inpaint_legacy() 等于 pipe_inpaint_legacy()
115
+ ```
116
+
117
+ | PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 |
118
+ | :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: |
119
+ | CompVis/stable-diffusion-v1-4 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-4 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **225k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/CompVis/stable-diffusion-v1-4) |
120
+ | CompVis/ldm-text2im-large-256 | LDMTextToImagePipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-KL-8-G* 权重。| [地址](https://huggingface.co/CompVis/ldm-text2im-large-256) |
121
+ | CompVis/ldm-super-resolution-4x-openimages | LDMSuperResolutionPipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-VQ-4 权重,[原始权重链接](https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip)。| [地址](https://huggingface.co/CompVis/ldm-super-resolution-4x-openimages) |
122
+ | runwayml/stable-diffusion-v1-5 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-5 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **595k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型同样也使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/runwayml/stable-diffusion-v1-5) |
123
+ | runwayml/stable-diffusion-inpainting | StableDiffusionInpaintPipeline | Stable-Diffusion-Inpainting 使用 Stable-Diffusion-v1-2 的权重进行初始化。首先进行了 **595k** 步的常规训练(实际也就是 Stable-Diffusion-v1-5 的权重),然后进行了 **440k** 步的 inpainting 修复训练。对于 inpainting 修复训练,给 UNet 额外增加了 **5** 输入通道(其中 **4** 个用于被 Mask 遮盖住的图片,**1** 个用于 Mask 本身)。在训练期间,会随机生成 Mask,并有 **25%** 概率会将原始图片全部 Mask 掉。| [地址](https://huggingface.co/runwayml/stable-diffusion-inpainting) |
124
+ | stabilityai/stable-diffusion-2-base | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型首先在 [LAION-5B 256x256 子集上](https://laion.ai/blog/laion-5b/) (过滤条件:[punsafe = 0.1 的 LAION-NSFW 分类器](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) 和 审美分数大于等于 4.5 )从头开始训练 **550k** 步,然后又在分辨率 **>= 512x512** 的同一数据集上进一步训练 **850k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-base) |
125
+ | stabilityai/stable-diffusion-2 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | stable-diffusion-2 使用 stable-diffusion-2-base 权重进行初始化,首先在同一数据集上(**512x512** 分辨率)使用 [v-objective](https://arxiv.org/abs/2202.00512) 训练了 **150k** 步。然后又在 **768x768** 分辨率上使用 [v-objective](https://arxiv.org/abs/2202.00512) 继续训练了 **140k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2) |
126
+ | stabilityai/stable-diffusion-2-inpainting | StableDiffusionInpaintPipeline |stable-diffusion-2-inpainting 使用 stable-diffusion-2-base 权重初始化,并且额外训练了 **200k** 步。训练过程使用了 [LAMA](https://github.com/saic-mdal/lama) 中提出的 Mask 生成策略,并且使用 Mask 图片的 Latent 表示(经过 VAE 编码)作为附加条件。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) |
127
+ | stabilityai/stable-diffusion-x4-upscaler | StableDiffusionUpscalePipeline | 该模型在**LAION 10M** 子集上(>2048x2048)训练了 1.25M 步。该模型还在分辨率为 **512x512** 的图像上使用 [Text-guided Latent Upscaling Diffusion Model](https://arxiv.org/abs/2112.10752) 进行了训练。除了**文本输入**之外,它还接收 **noise_level** 作为输入参数,因此我们可以使用 [预定义的 Scheduler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/low_res_scheduler/scheduler_config.json) 向低分辨率的输入图片添加噪声。| [地址](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) |
128
+ | hakurei/waifu-diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-2 使用 stable-diffusion-v1-4 权重初始化,并且在**高质量动漫**图像数据集上进行微调后得到的模型。用于微调的数据是 **680k** 文本图像样本,这些样本是通过 **booru 网站** 下载的。| [地址](https://huggingface.co/hakurei/waifu-diffusion) |
129
+ | hakurei/waifu-diffusion-v1-3 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-3 是 waifu-diffusion-v1-2 基础上进一步训练得到的。他们对数据集进行了额外操作:(1)删除下划线;(2)删除括号;(3)用逗号分隔每个booru 标签;(4)随机化标签顺序。| [地址](https://huggingface.co/hakurei/waifu-diffusion) |
130
+ | naclbit/trinart_stable_diffusion_v2_60k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | trinart_stable_diffusion 使用 stable-diffusion-v1-4 权重初始化,在 40k **高分辨率漫画/动漫风格**的图片数据集上微调了 8 个 epoch。V2 版模型使用 **dropouts**、**10k+ 图像**和**新的标记策略**训练了**更长时间**。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
131
+ | naclbit/trinart_stable_diffusion_v2_95k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **95k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
132
+ | naclbit/trinart_stable_diffusion_v2_115k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **115k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
133
+ | Deltaadams/Hentai-Diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | None| [地址](https://huggingface.co/Deltaadams/Hentai-Diffusion) |
134
+ | ringhyacinth/nail-set-diffuser | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 美甲领域的扩散模型,训练数据使用了 [Weekend](https://weibo.com/u/5982308498)| [地址](https://huggingface.co/ringhyacinth/nail-set-diffuser) |
135
+ | Linaqruf/anything-v3.0 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型可通过输入几个文本提示词就能生成**高质量、高度详细的动漫风格图片**,该模型支持使用 **danbooru 标签文本** 生成图像。| [地址](https://huggingface.co/Linaqruf/anything-v3.0) |
136
+
137
+ </details>
138
+ <details><summary>&emsp; Stable Diffusion 模型支持的权重(中文和多语言) </summary>
139
+
140
+
141
+ | PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 |
142
+ | :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: |
143
+ | BAAI/AltDiffusion | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline | 该模型使用 [AltCLIP](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,在 Stable Diffusion 基础上训练了**双语Diffusion模型**,其中训练数据来自 [WuDao数据集](https://data.baai.ac.cn/details/WuDaoCorporaText) 和 [LAION](https://huggingface.co/datasets/ChristophSchuhmann/improved_aesthetics_6plus) 。| [地址](https://huggingface.co/BAAI/AltDiffusion) |
144
+ | BAAI/AltDiffusion-m9 | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline |该模型使用9种语言的 [AltCLIP-m9](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,其他同上。| [地址](https://huggingface.co/BAAI/AltDiffusion-m9) |
145
+ | IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 作为初始化的text encoder,冻住 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型的其他部分,只训练 text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。该模型目前在0.2亿图文对上训练了一个 epoch。 在 32 x A100 上训练了大约100小时,该版本只是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1) |
146
+ | IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型进行继续训练,其中训练分为**两个stage**。**第一个stage** 中冻住模型的其他部分,只训练 text encoder ,以便保留原始模型的生成能力且实现中文概念的对齐。**第二个stage** 中将全部模型解冻,一起训练 text encoder 和 diffusion model ,以便 diffusion model 更好的适配中文引导。第一个 stage 他们训练了 80 小时,第二个 stage 训练了 100 小时,两个stage都是用了8 x A100,该版本是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1) |
147
+ </details>
148
+
149
+
150
+ ### 加载HF Diffusers权重
151
+ ```python
152
+ from ppdiffusers import StableDiffusionPipeline
153
+ # 设置from_hf_hub为True,表示从huggingface hub下载,from_diffusers为True表示加载的是diffusers版Pytorch权重
154
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", from_hf_hub=True, from_diffusers=True)
155
+ ```
156
+
157
+ ### 加载原库的Lightning权重
158
+ ```python
159
+ from ppdiffusers import StableDiffusionPipeline
160
+ # 可输入网址 或 本地ckpt、safetensors文件
161
+ pipe = StableDiffusionPipeline.from_single_file("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/ppdiffusers/chilloutmix_NiPrunedFp32Fix.safetensors")
162
+ ```
163
+
164
+ ### 加载HF LoRA权重
165
+ ```python
166
+ import paddle
+ from ppdiffusers import DiffusionPipeline
167
+
168
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16)
169
+
170
+ pipe.load_lora_weights("stabilityai/stable-diffusion-xl-base-1.0",
171
+ weight_name="sd_xl_offset_example-lora_1.0.safetensors",
172
+ from_diffusers=True)
173
+ ```
174
+
175
+ ### 加载Civitai社区的LoRA权重
176
+ ```python
177
+ from ppdiffusers import StableDiffusionPipeline
178
+ pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix")
179
+ # 加载lora权重
180
+ pipe.load_lora_weights("./",
181
+ weight_name="Moxin_10.safetensors",
182
+ from_diffusers=True)
183
+ pipe.fuse_lora()
184
+ ```
185
+
186
+ ### XFormers加速
187
+ 为了使用**XFormers加速**,我们需要安装`develop`版本的`paddle`,Linux系统的安装命令如下:
188
+ ```sh
189
+ python -m pip install paddlepaddle-gpu==0.0.0.post117 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
190
+ ```
191
+
192
+ ```python
193
+ import paddle
194
+ from ppdiffusers import StableDiffusionPipeline
195
+ pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix", paddle_dtype=paddle.float16)
196
+ # 开启xformers加速 默认选择"cutlass"加速
197
+ pipe.enable_xformers_memory_efficient_attention()
198
+ # flash 需要使用 A100、A10、3060、3070、3080、3090 等以上显卡。
199
+ # pipe.enable_xformers_memory_efficient_attention("flash")
200
+ ```
201
+
202
+ ### ToME + ControlNet
203
+ ```python
204
+ # 安装develop的ppdiffusers
205
+ # pip install "ppdiffusers>=0.24.0"
206
+ import paddle
207
+ from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline
208
+ from ppdiffusers.utils import load_image
209
+
210
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
211
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
212
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, paddle_dtype=paddle.float16
213
+ )
214
+
215
+ # Apply ToMe with a 50% merging ratio
216
+ pipe.apply_tome(ratio=0.5) # Can also use pipe.unet in place of pipe here
217
+
218
+ # 我们可以开启 xformers
219
+ # pipe.enable_xformers_memory_efficient_attention()
220
+ generator = paddle.Generator().manual_seed(0)
221
+ prompt = "bird"
222
+ image = load_image(
223
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
224
+ )
225
+
226
+ image = pipe(prompt, image, generator=generator).images[0]
227
+
228
+ image.save("bird.png")
229
+ ```
230
+
231
+ ### 文图生成 (Text-to-Image Generation)
232
+
233
+ ```python
234
+ import paddle
235
+ from ppdiffusers import StableDiffusionPipeline
236
+
237
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
238
+
239
+ # 设置随机种子,我们可以复现下面的结果!
240
+ paddle.seed(5232132133)
241
+ prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing "
242
+ image = pipe(prompt, guidance_scale=7.5, height=768, width=768).images[0]
243
+
244
+ image.save("shiba_dog_with_a_red_cap.png")
245
+ ```
246
+ <div align="center">
247
+ <img width="500" alt="image" src="https://user-images.githubusercontent.com/50394665/204796701-d7911f76-8670-47d5-8d1b-8368b046c5e4.png">
248
+ </div>
249
+
250
+ ### 文本引导的图像变换(Image-to-Image Text-Guided Generation)
251
+
252
+ <details><summary>&emsp;Image-to-Image Text-Guided Generation Demo </summary>
253
+
254
+ ```python
255
+ import paddle
256
+ from ppdiffusers import StableDiffusionImg2ImgPipeline
257
+ from ppdiffusers.utils import load_image
258
+
259
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("Linaqruf/anything-v3.0", safety_checker=None)
260
+
261
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
262
+ image = load_image(url).resize((512, 768))
263
+
264
+ # 设置随机种子,我们可以复现下面的结果!
265
+ paddle.seed(42)
266
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
267
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
268
+
269
+ image = pipe(prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5).images[0]
270
+ image.save("image_Kurisu_img2img.png")
271
+ ```
272
+ <div align="center">
273
+ <img width="500" alt="image" src="https://user-images.githubusercontent.com/50394665/204799529-cd89dcdb-eb1d-4247-91ac-b0f7bad777f8.png">
274
+ </div>
275
+ </details>
276
+
277
+ ### 文本引导的图像编辑(Text-Guided Image Inpainting)
278
+
279
+ 注意!当前有两种版本的图像编辑代码,一个是Legacy版本,一个是正式版本,下面将分别介绍两种代码如何使用!
280
+
281
+ <details><summary>&emsp;Legacy版本代码</summary>
282
+
283
+ ```python
284
+ import paddle
285
+ from ppdiffusers import StableDiffusionInpaintPipelineLegacy
286
+ from ppdiffusers.utils import load_image
287
+
288
+ # 可选模型权重
289
+ # CompVis/stable-diffusion-v1-4
290
+ # runwayml/stable-diffusion-v1-5
291
+ # stabilityai/stable-diffusion-2-base (原始策略 512x512)
292
+ # stabilityai/stable-diffusion-2 (v-objective 768x768)
293
+ # Linaqruf/anything-v3.0
294
+ # ......
295
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
296
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
297
+
298
+ image = load_image(img_url).resize((512, 512))
299
+ mask_image = load_image(mask_url).resize((512, 512))
300
+
301
+ pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("stabilityai/stable-diffusion-2-base", safety_checker=None)
302
+
303
+ # 设置随机种子,我们可以复现下面的结果!
304
+ paddle.seed(10245)
305
+ prompt = "a red cat sitting on a bench"
306
+ image = pipe(prompt=prompt, image=image, mask_image=mask_image, strength=0.75).images[0]
307
+
308
+ image.save("a_red_cat_legacy.png")
309
+ ```
310
+ <div align="center">
311
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/204802186-5a6d302b-83aa-4247-a5bb-ebabfcc3abc4.png">
312
+ </div>
313
+
314
+ </details>
315
+
316
+ <details><summary>&emsp;正式版本代码</summary>
317
+
318
+ Tips: 下面的使用方法是新版本的代码,也是官方推荐的代码,注意必须配合 **runwayml/stable-diffusion-inpainting** 和 **stabilityai/stable-diffusion-2-inpainting** 才可正常使用。
319
+ ```python
320
+ import paddle
321
+ from ppdiffusers import StableDiffusionInpaintPipeline
322
+ from ppdiffusers.utils import load_image
323
+
324
+ # 可选模型权重
325
+ # runwayml/stable-diffusion-inpainting
326
+ # stabilityai/stable-diffusion-2-inpainting
327
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
328
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
329
+
330
+ image = load_image(img_url).resize((512, 512))
331
+ mask_image = load_image(mask_url).resize((512, 512))
332
+
333
+ pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
334
+
335
+ # 设置随机种子,我们可以复现下面的结果!
336
+ paddle.seed(1024)
337
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
338
+ image = pipe(prompt=prompt, image=image, mask_image=mask_image).images[0]
339
+
340
+ image.save("a_yellow_cat.png")
341
+ ```
342
+ <div align="center">
343
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/204801946-6cd043bc-f3db-42cf-82cd-6a6171484523.png">
344
+ </div>
345
+ </details>
346
+
347
+ ### 文本引导的图像放大 & 超分(Text-Guided Image Upscaling & Super-Resolution)
348
+
349
+ <details><summary>&emsp;Text-Guided Image Upscaling Demo</summary>
350
+
351
+ ```python
352
+ import paddle
353
+ from ppdiffusers import StableDiffusionUpscalePipeline
354
+ from ppdiffusers.utils import load_image
355
+
356
+ pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
357
+
358
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
359
+ # 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍!
360
+ low_res_img = load_image(url).resize((128, 128))
361
+
362
+ prompt = "a white cat"
363
+ image = pipe(prompt=prompt, image=low_res_img).images[0]
364
+
365
+ image.save("upscaled_white_cat.png")
366
+ ```
367
+ <div align="center">
368
+ <img width="200" alt="image" src="https://user-images.githubusercontent.com/50394665/204806180-b7f1b9cf-8a62-4577-b5c4-91adda08a13b.png">
369
+ <img width="400" alt="image" src="https://user-images.githubusercontent.com/50394665/204806202-8c110be3-5f48-4946-95ea-21ad5a9a2340.png">
370
+ </div>
371
+ </details>
372
+
373
+ <details><summary>&emsp;Super-Resolution Demo</summary>
374
+
375
+ ```python
376
+ import paddle
377
+ from ppdiffusers import LDMSuperResolutionPipeline
378
+ from ppdiffusers.utils import load_image
379
+
380
+ pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
381
+
382
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
383
+
384
+ # 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍!
385
+ low_res_img = load_image(url).resize((128, 128))
386
+
387
+ image = pipe(image=low_res_img, num_inference_steps=100).images[0]
388
+
389
+ image.save("ldm-super-resolution-image.png")
390
+ ```
391
+ <div align="center">
392
+ <img width="200" alt="image" src="https://user-images.githubusercontent.com/50394665/204804426-5e28b571-aa41-4f56-ba26-68cca75fdaae.png">
393
+ <img width="400" alt="image" src="https://user-images.githubusercontent.com/50394665/204804148-fe7c293b-6cd7-4942-ae9c-446369fe8410.png">
394
+ </div>
395
+
396
+ </details>
397
+
398
+ ## 模型推理部署
399
+ 除了**Paddle动态图**运行之外,很多模型还支持将模型导出并使用推理引擎运行。我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)上的**StableDiffusion**模型部署示例,涵盖文生图、图生图、图像编辑等任务,用户可以按照我们提供[StableDiffusion模型导出教程](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/deploy/export.md)将模型导出,然后使用`FastDeployStableDiffusionMegaPipeline`进行高性能推理部署!
400
+
401
+ <details><summary>&emsp; 已预先导出的FastDeploy版Stable Diffusion权重 </summary>
402
+
403
+ **注意:当前导出的vae encoder带有随机因素!**
404
+
405
+ - CompVis/stable-diffusion-v1-4@fastdeploy
406
+ - runwayml/stable-diffusion-v1-5@fastdeploy
407
+ - runwayml/stable-diffusion-inpainting@fastdeploy
408
+ - stabilityai/stable-diffusion-2-base@fastdeploy
409
+ - stabilityai/stable-diffusion-2@fastdeploy
410
+ - stabilityai/stable-diffusion-2-inpainting@fastdeploy
411
+ - Linaqruf/anything-v3.0@fastdeploy
412
+ - hakurei/waifu-diffusion-v1-3@fastdeploy
413
+
414
+ </details>
415
+
416
+ <details><summary>&emsp; FastDeploy Demo </summary>
417
+
418
+ ```python
419
+ import paddle
420
+ import fastdeploy as fd
421
+ from ppdiffusers import FastDeployStableDiffusionMegaPipeline
422
+ from ppdiffusers.utils import load_image
423
+
424
+ def create_runtime_option(device_id=0, backend="paddle", use_cuda_stream=True):
425
+ option = fd.RuntimeOption()
426
+ if backend == "paddle":
427
+ option.use_paddle_backend()
428
+ else:
429
+ option.use_ort_backend()
430
+ if device_id == -1:
431
+ option.use_cpu()
432
+ else:
433
+ option.use_gpu(device_id)
434
+ if use_cuda_stream:
435
+ paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream
436
+ option.set_external_raw_stream(paddle_stream)
437
+ return option
438
+
439
+ runtime_options = {
440
+ "text_encoder": create_runtime_option(0, "paddle"), # use gpu:0
441
+ "vae_encoder": create_runtime_option(0, "paddle"), # use gpu:0
442
+ "vae_decoder": create_runtime_option(0, "paddle"), # use gpu:0
443
+ "unet": create_runtime_option(0, "paddle"), # use gpu:0
444
+ }
445
+
446
+ fd_pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained(
447
+ "Linaqruf/anything-v3.0@fastdeploy", runtime_options=runtime_options
448
+ )
449
+
450
+ # text2img
451
+ prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing "
452
+ image_text2img = fd_pipe.text2img(prompt=prompt, num_inference_steps=50).images[0]
453
+ image_text2img.save("image_text2img.png")
454
+
455
+ # img2img
456
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
457
+ image = load_image(url).resize((512, 512))
458
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
459
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
460
+
461
+ image_img2img = fd_pipe.img2img(
462
+ prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5
463
+ ).images[0]
464
+ image_img2img.save("image_img2img.png")
465
+
466
+ # inpaint_legacy
467
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
468
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
469
+ image = load_image(img_url).resize((512, 512))
470
+ mask_image = load_image(mask_url).resize((512, 512))
471
+ prompt = "a red cat sitting on a bench"
472
+
473
+ image_inpaint_legacy = fd_pipe.inpaint_legacy(
474
+ prompt=prompt, image=image, mask_image=mask_image, strength=0.75, num_inference_steps=50
475
+ ).images[0]
476
+ image_inpaint_legacy.save("image_inpaint_legacy.png")
477
+ ```
478
+ </details>
479
+ <div align="center">
480
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/205297240-46b80992-34af-40cd-91a6-ae76589d0e21.png">
481
+ </div>
482
+
483
+
484
+ ## 更多任务分类展示
485
+ ### 文本图像多模
486
+
487
+ <details open>
488
+ <summary>&emsp;文图生成(Text-to-Image Generation)</summary>
489
+
490
+ #### text_to_image_generation-stable_diffusion
491
+
492
+ ```python
493
+ from ppdiffusers import StableDiffusionPipeline
494
+
495
+ # 加载模型和scheduler
496
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
497
+
498
+ # 执行pipeline进行推理
499
+ prompt = "a photo of an astronaut riding a horse on mars"
500
+ image = pipe(prompt).images[0]
501
+
502
+ # 保存图片
503
+ image.save("astronaut_rides_horse_sd.png")
504
+ ```
505
+ <div align="center">
506
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209322401-6ecfeaaa-6878-4302-b592-07a31de4e590.png">
507
+ </div>
508
+
509
+ #### text_to_image_generation-stable_diffusion_xl
510
+
511
+ ```python
512
+ import paddle
513
+ from ppdiffusers import StableDiffusionXLPipeline
514
+
515
+ pipe = StableDiffusionXLPipeline.from_pretrained(
516
+ "stabilityai/stable-diffusion-xl-base-1.0",
517
+ paddle_dtype=paddle.float16,
518
+ variant="fp16"
519
+ )
520
+ prompt = "a photo of an astronaut riding a horse on mars"
521
+ generator = paddle.Generator().manual_seed(42)
522
+ image = pipe(prompt=prompt, generator=generator, num_inference_steps=50).images[0]
523
+ image.save('sdxl_text2image.png')
524
+ ```
525
+ <div align="center">
526
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/d72729f9-8685-48f9-a238-e4ddf6d264f3">
527
+ </div>
528
+
529
+ #### text_to_image_generation-sdxl_base_with_refiner
530
+
531
+ ```python
532
+ from ppdiffusers import DiffusionPipeline
533
+ import paddle
534
+
535
+ # load both base & refiner
536
+ base = DiffusionPipeline.from_pretrained(
537
+ "stabilityai/stable-diffusion-xl-base-1.0",
538
+ paddle_dtype=paddle.float16,
539
+ )
540
+ refiner = DiffusionPipeline.from_pretrained(
541
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
542
+ text_encoder_2=base.text_encoder_2,
543
+ vae=base.vae,
544
+ paddle_dtype=paddle.float16,
545
+ variant="fp16",
546
+ )
547
+
548
+ # Define how many steps and what % of steps to be run on each experts (80/20) here
549
+ n_steps = 40
550
+ high_noise_frac = 0.8
551
+
552
+ prompt = "A majestic lion jumping from a big stone at night"
553
+ prompt = "a photo of an astronaut riding a horse on mars"
554
+ generator = paddle.Generator().manual_seed(42)
555
+
556
+ # run both experts
557
+ image = base(
558
+ prompt=prompt,
559
+ output_type="latent",
560
+ generator=generator,
561
+ ).images
562
+
563
+ image = refiner(
564
+ prompt=prompt,
565
+ image=image,
566
+ generator=generator,
567
+ ).images[0]
568
+ image.save('text_to_image_generation-sdxl-base-with-refiner-result.png')
569
+ ```
570
+ <div align="center">
571
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/8ef36826-ed94-4856-a356-af1677f60d1b">
572
+ </div>
573
+
574
+ #### text_to_image_generation-kandinsky2_2
575
+ ```python
576
+ from ppdiffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline
577
+
578
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
579
+ prompt = "red cat, 4k photo"
580
+ out = pipe_prior(prompt)
581
+ image_emb = out.image_embeds
582
+ zero_image_emb = out.negative_image_embeds
583
+ pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")
584
+ image = pipe(
585
+ image_embeds=image_emb,
586
+ negative_image_embeds=zero_image_emb,
587
+ height=768,
588
+ width=768,
589
+ num_inference_steps=50,
590
+ ).images
591
+ image[0].save("text_to_image_generation-kandinsky2_2-result-cat.png")
592
+ ```
593
+ <div align="center">
594
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/188f76dd-4bd7-4a33-8f30-b893c7a9e249">
595
+ </div>
596
+
597
+ #### text_to_image_generation-unidiffuser
598
+ ```python
599
+ import paddle
600
+ from paddlenlp.trainer import set_seed
601
+
602
+ from ppdiffusers import UniDiffuserPipeline
603
+
604
+ model_id_or_path = "thu-ml/unidiffuser-v1"
605
+ pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, paddle_dtype=paddle.float16)
606
+ set_seed(42)
607
+
608
+ # Text variation can be performed with a text-to-image generation followed by a image-to-text generation:
609
+ # 1. Text-to-image generation
610
+ prompt = "an elephant under the sea"
611
+ sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0)
612
+ t2i_image = sample.images[0]
613
+ t2i_image.save("t2i_image.png")
614
+ ```
615
+ <div align="center">
616
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/a6eb11d2-ad27-4263-8cb4-b0d8dd42b36c">
617
+ </div>
618
+
619
+ #### text_to_image_generation-deepfloyd_if
620
+
621
+ ```python
622
+ import paddle
623
+
624
+ from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline
625
+ from ppdiffusers.utils import pd_to_pil
626
+
627
+ # Stage 1: generate images
628
+ pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
629
+ pipe.enable_xformers_memory_efficient_attention()
630
+ prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
631
+ prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
632
+ image = pipe(
633
+ prompt_embeds=prompt_embeds,
634
+ negative_prompt_embeds=negative_embeds,
635
+ output_type="pd",
636
+ ).images
637
+
638
+ # save intermediate image
639
+ pil_image = pd_to_pil(image)
640
+ pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_I.png")
641
+ # save gpu memory
642
+ pipe.to(paddle_device="cpu")
643
+
644
+ # Stage 2: super resolution stage1
645
+ super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained(
646
+ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16
647
+ )
648
+ super_res_1_pipe.enable_xformers_memory_efficient_attention()
649
+
650
+ image = super_res_1_pipe(
651
+ image=image,
652
+ prompt_embeds=prompt_embeds,
653
+ negative_prompt_embeds=negative_embeds,
654
+ output_type="pd",
655
+ ).images
656
+ # save intermediate image
657
+ pil_image = pd_to_pil(image)
658
+ pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png")
659
+ # save gpu memory
660
+ super_res_1_pipe.to(paddle_device="cpu")
661
+ ```
662
+ <div align="center">
663
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785766-700dfad9-159d-4bfb-bfc7-c18df938a052.png">
664
+ </div>
665
+ <div align="center">
666
+ <center>if_stage_I</center>
667
+ </div>
668
+ <div align="center">
669
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785773-3359ca5f-dadf-4cc8-b318-ff1f9d4a2d35.png">
670
+ </div>
671
+ <div align="center">
672
+ <center>if_stage_II</center>
673
+ <!-- <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785774-8870829a-354b-4a87-9d67-93af315f51e6.png">
674
+ <center>if_stage_III</center> -->
675
+ </div>
676
+ </details>
677
+
678
+
679
+ <details><summary>&emsp;文本引导的图像放大(Text-Guided Image Upscaling)</summary>
680
+
681
+ #### text_guided_image_upscaling-stable_diffusion_2
682
+
683
+ ```python
684
+ from ppdiffusers import StableDiffusionUpscalePipeline
685
+ from ppdiffusers.utils import load_image
686
+
687
+ pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
688
+
689
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
690
+ low_res_img = load_image(url).resize((128, 128))
691
+
692
+ prompt = "a white cat"
693
+ upscaled_image = pipe(prompt=prompt, image=low_res_img).images[0]
694
+ upscaled_image.save("upsampled_cat_sd2.png")
695
+ ```
696
+ <div align="center">
697
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209324085-0d058b70-89b0-43c2-affe-534eedf116cf.png">
698
+ <center>原图像</center>
699
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209323862-ce2d8658-a52b-4f35-90cb-aa7d310022e7.png">
700
+ <center>生成图像</center>
701
+ </div>
702
+ </details>
703
+
704
+ <details><summary>&emsp;文本引导的图像编辑(Text-Guided Image Inpainting)</summary>
705
+
706
+ #### image_guided_image_inpainting-paint_by_example
707
+
708
+ ```python
709
+ import paddle
710
+
711
+ from ppdiffusers import PaintByExamplePipeline
712
+ from ppdiffusers.utils import load_image
713
+
714
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/image_example_1.png"
715
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/mask_example_1.png"
716
+ example_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/reference_example_1.jpeg"
717
+
718
+ init_image = load_image(img_url).resize((512, 512))
719
+ mask_image = load_image(mask_url).resize((512, 512))
720
+ example_image = load_image(example_url).resize((512, 512))
721
+
722
+ pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
723
+
724
+ # 使用fp16加快生成速度
725
+ with paddle.amp.auto_cast(True):
726
+ image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
727
+ image.save("image_guided_image_inpainting-paint_by_example-result.png")
728
+ ```
729
+ <div align="center">
730
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118364-5d91f433-f9ac-4514-b5f0-cb4599905847.png" width=300>
731
+ <center>原图像</center>
733
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118361-0f78d6db-6896-4f8d-b1bd-8350192f7a4e.png" width=300>
734
+ <center>掩码图像</center>
736
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118368-305a048d-ddc3-4a5f-8915-58591ef680f0.jpeg" width=300>
737
+ <center>参考图像</center>
738
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247117963-e5b9b754-39a3-480b-a557-46a2f9310e79.png" width=300>
739
+ <center>生成图像</center>
740
+ </div>
741
+ </details>
742
+
743
+
744
+ <details><summary>&emsp;文本引导的图像变换(Image-to-Image Text-Guided Generation)</summary>
745
+
746
+ #### text_guided_image_inpainting-kandinsky2_2
747
+ ```python
748
+ import numpy as np
749
+ import paddle
750
+
751
+ from ppdiffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline
752
+ from ppdiffusers.utils import load_image
753
+
754
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
755
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
756
+ )
757
+ prompt = "a hat"
758
+ image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
759
+ pipe = KandinskyV22InpaintPipeline.from_pretrained(
760
+ "kandinsky-community/kandinsky-2-2-decoder-inpaint", paddle_dtype=paddle.float16
761
+ )
762
+ init_image = load_image(
763
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
764
+ )
765
+ mask = np.zeros((768, 768), dtype=np.float32)
766
+ mask[:250, 250:-250] = 1
767
+ out = pipe(
768
+ image=init_image,
769
+ mask_image=mask,
770
+ image_embeds=image_emb,
771
+ negative_image_embeds=zero_image_emb,
772
+ height=768,
773
+ width=768,
774
+ num_inference_steps=50,
775
+ )
776
+ image = out.images[0]
777
+ image.save("text_guided_image_inpainting-kandinsky2_2-result-cat_with_hat.png")
778
+ ```
779
+ <div align="center">
780
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/64a943d5-167b-4433-91c3-3cf9279714db">
781
+ <center>原图像</center>
782
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/f469c127-52f4-4173-a693-c06b92a052aa">
783
+ <center>生成图像</center>
784
+ </div>
785
+
786
+ #### image_to_image_text_guided_generation-stable_diffusion
787
+ ```python
788
+ import paddle
789
+
790
+ from ppdiffusers import StableDiffusionImg2ImgPipeline
791
+ from ppdiffusers.utils import load_image
792
+
793
+ # 加载pipeline
794
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
795
+
796
+ # 下载初始图片
797
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
798
+
799
+ init_image = load_image(url).resize((768, 512))
800
+
801
+ prompt = "A fantasy landscape, trending on artstation"
802
+ # 使用fp16加快生成速度
803
+ with paddle.amp.auto_cast(True):
804
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
805
+
806
+ image.save("fantasy_landscape.png")
807
+ ```
808
+ <div align="center">
809
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209327142-d8e1d0c7-3bf8-4a08-a0e8-b11451fc84d8.png">
810
+ <center>原图像</center>
811
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325799-d9ff279b-0d57-435f-bda7-763e3323be23.png">
812
+ <center>生成图像</center>
813
+ </div>
814
+
815
+ #### image_to_image_text_guided_generation-stable_diffusion_xl
816
+ ```python
817
+ import paddle
818
+ from ppdiffusers import StableDiffusionXLImg2ImgPipeline
819
+ from ppdiffusers.utils import load_image
820
+
821
+ pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
822
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
823
+ paddle_dtype=paddle.float16,
824
+ # from_hf_hub=True,
825
+ # from_diffusers=True,
826
+ variant="fp16"
827
+ )
828
+ url = "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop-0-19-3/000000009.png"
829
+ init_image = load_image(url).convert("RGB")
830
+ prompt = "a photo of an astronaut riding a horse on mars"
831
+ image = pipe(prompt, image=init_image).images[0]
832
+ image.save('sdxl_image2image.png')
833
+ ```
834
+ <div align="center">
835
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/41bd9381-2799-4bed-a5e2-ba312a2f8da9">
836
+ <center>Original image</center>
837
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/db672d03-2e3a-46ac-97fd-d80cca18dbbe">
838
+ <center>Generated image</center>
839
+ </div>
840
+
841
+ #### image_to_image_text_guided_generation-kandinsky2_2
842
+ ```python
843
+ import paddle
844
+
845
+ from ppdiffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline
846
+ from ppdiffusers.utils import load_image
847
+
848
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
849
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
850
+ )
851
+ prompt = "A red cartoon frog, 4k"
852
+ image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
853
+ pipe = KandinskyV22Img2ImgPipeline.from_pretrained(
854
+ "kandinsky-community/kandinsky-2-2-decoder", paddle_dtype=paddle.float16
855
+ )
856
+
857
+ init_image = load_image(
858
+ "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
859
+ )
860
+ image = pipe(
861
+ image=init_image,
862
+ image_embeds=image_emb,
863
+ negative_image_embeds=zero_image_emb,
864
+ height=768,
865
+ width=768,
866
+ num_inference_steps=100,
867
+ strength=0.2,
868
+ ).images
869
+ image[0].save("image_to_image_text_guided_generation-kandinsky2_2-result-red_frog.png")
870
+ ```
871
+ <div align="center">
872
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/aae57109-94ad-408e-ae75-8cce650cebe5">
873
+ <center>Original image</center>
874
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/23cf2c4e-416f-4f21-82a6-e57de11b5e83">
875
+ <center>Generated image</center>
876
+ </div>
877
+
878
+ </details>
879
+ </details>
880
+
881
+ <details><summary>&emsp;Dual Text and Image Guided Generation</summary>
882
+
883
+ #### dual_text_and_image_guided_generation-versatile_diffusion
884
+ ```python
885
+ from ppdiffusers import VersatileDiffusionDualGuidedPipeline
886
+ from ppdiffusers.utils import load_image
887
+
888
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
889
+ image = load_image(url)
890
+ text = "a red car in the sun"
891
+
892
+ pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
893
+ pipe.remove_unused_weights()
894
+
895
+ text_to_image_strength = 0.75
896
+ image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0]
897
+ image.save("versatile-diffusion-red_car.png")
898
+ ```
899
+ <div align="center">
900
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325965-2475e9c4-a524-4970-8498-dfe10ff9cf24.jpg" >
901
+ <center>Original image</center>
902
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325293-049098d0-d591-4abc-b151-9291ac2636da.png">
903
+ <center>Generated image</center>
904
+ </div>
905
+ </details>
906
+
907
+ ### Text and Video Multimodal
908
+
909
+ <details open>
910
+ <summary>&emsp;Text-to-Video Generation</summary>
911
+
912
+ #### text_to_video_generation-lvdm
913
+
914
+ ```python
915
+ import paddle
916
+
917
+ from ppdiffusers import LVDMTextToVideoPipeline
918
+
919
+ # Load the model and scheduler
920
+ pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m")
921
+
922
+ # Run the pipeline for inference
923
+ seed = 2013
924
+ generator = paddle.Generator().manual_seed(seed)
925
+ samples = pipe(
926
+ prompt="cutting in kitchen",
927
+ num_frames=16,
928
+ height=256,
929
+ width=256,
930
+ num_inference_steps=50,
931
+ generator=generator,
932
+ guidance_scale=15,
933
+ eta=1,
934
+ save_dir=".",
935
+ save_name="text_to_video_generation-lvdm-result-ddim_lvdm_text_to_video_ucf",
936
+ encoder_type="2d",
937
+ scale_factor=0.18215,
938
+ shift_factor=0,
939
+ )
940
+ ```
941
+ <div align="center">
942
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/270906907-2b9d53c1-0272-4c7a-81b2-cd962d23bbee.gif">
943
+ </div>
944
+
945
+ #### text_to_video_generation-synth
946
+
947
+ ```python
948
+ import imageio
949
+
950
+ from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
951
+
952
+ pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
953
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
954
+
955
+ prompt = "An astronaut riding a horse."
956
+ video_frames = pipe(prompt, num_inference_steps=25).frames
957
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
958
+ ```
959
+ <div align="center">
960
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/281259277-0ebe29a3-4eba-48ee-a98b-292e60de3c98.gif">
961
+ </div>
962
+
963
+
964
+ #### text_to_video_generation-synth with zeroscope_v2_XL
965
+
966
+ ```python
967
+ import imageio
968
+
969
+ from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
970
+
971
+ # from ppdiffusers.utils import export_to_video
972
+
973
+ pipe = TextToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL")
974
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
975
+
976
+ prompt = "An astronaut riding a horse."
977
+ video_frames = pipe(prompt, num_inference_steps=50, height=320, width=576, num_frames=24).frames
978
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
979
+ ```
980
+ <div align="center">
981
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/43ebbca0-9f07-458b-809a-acf296a2539b">
982
+ </div>
983
+
984
+ #### text_to_video_generation-zero
985
+
986
+ ```python
987
+ import imageio
988
+
989
+ # pip install imageio[ffmpeg]
990
+ import paddle
991
+
992
+ from ppdiffusers import TextToVideoZeroPipeline
993
+
994
+ model_id = "runwayml/stable-diffusion-v1-5"
995
+ pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16)
996
+
997
+ prompt = "A panda is playing guitar on times square"
998
+ result = pipe(prompt=prompt).images
999
+ result = [(r * 255).astype("uint8") for r in result]
1000
+ imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4)
1001
+ ```
1002
+ <div align="center">
1003
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/246779321-c2b0c2b4-e383-40c7-a4d8-f417e8062b35.gif">
1004
+ </div>
1005
+
1006
+ </details>
1007
+
1008
+ ### Text and Audio Multimodal
1009
+ <details>
1010
+ <summary>&emsp;Text-to-Audio Generation</summary>
1011
+
1012
+ #### text_to_audio_generation-audio_ldm
1013
+
1014
+ ```python
1015
+ import paddle
1016
+ import scipy
1017
+
1018
+ from ppdiffusers import AudioLDM2Pipeline
1019
+
1020
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", paddle_dtype=paddle.float16)
1021
+
1022
+ prompt = "Musical constellations twinkling in the night sky, forming a cosmic melody."
1023
+ negative_prompt = "Low quality."
1024
+ audio = pipe(prompt, negative_prompt=negative_prompt, num_inference_steps=200, audio_length_in_s=10).audios[0]
1025
+
1026
+ output_path = f"{prompt}.wav"
1027
+ # save the audio sample as a .wav file
1028
+ scipy.io.wavfile.write(output_path, rate=16000, data=audio)
1029
+ ```
1030
+ <div align = "center">
1031
+ <thead>
1032
+ </thead>
1033
+ <tbody>
1034
+ <tr>
1035
+ <td align = "center">
1036
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/paddlemix/ppdiffusers/AudioLDM2-Music.wav" rel="nofollow">
1037
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200 style="max-width: 100%;"></a><br>
1038
+ </td>
1039
+ </tr>
1040
+ </tbody>
1041
+ </div>
1042
+ </details>
1043
+
1044
+ The following code converts a [huggingface](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2) model so it can be used in Paddle in a single step:
1045
+ ```python
1046
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", from_hf_hub=True, from_diffusers=True).save_pretrained("cvssp/audioldm2-music")
1047
+ ```
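+
+ A minimal follow-up sketch (assuming the conversion above has been run, so the converted Paddle weights were saved to the local `cvssp/audioldm2-music` directory): the saved pipeline can then be reloaded directly, without the `from_hf_hub`/`from_diffusers` flags.
+ ```python
+ import paddle
+
+ from ppdiffusers import AudioLDM2Pipeline
+
+ # Load the locally converted Paddle weights saved by save_pretrained above.
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", paddle_dtype=paddle.float16)
+ ```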
1048
+ ### Image
1049
+
1050
+ <details><summary>&emsp;Unconditional Image Generation</summary>
1051
+
1052
+ #### unconditional_image_generation-latent_diffusion_uncond
1053
+
1054
+ ```python
1055
+ from ppdiffusers import LDMPipeline
1056
+
1057
+ # Load the model and scheduler
1058
+ pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
1059
+
1060
+ # Run the pipeline for inference
1061
+ image = pipe(num_inference_steps=200).images[0]
1062
+
1063
+ # Save the image
1064
+ image.save("ldm_generated_image.png")
1065
+ ```
1066
+ <div align="center">
1067
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209327936-7fe914e0-0ea0-4e21-a433-24eaed6ee94c.png">
1068
+ </div>
1069
+ </details>
1070
+
1071
+ <details><summary>&emsp;Super Resolution</summary>
1072
+
1073
+ #### super_resolution-latent_diffusion
1074
+ ```python
1075
+ import paddle
1076
+
1077
+ from ppdiffusers import LDMSuperResolutionPipeline
1078
+ from ppdiffusers.utils import load_image
1079
+
1080
+ # Load the pipeline
1081
+ pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
1082
+
1083
+ # Download the initial image
1084
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
1085
+
1086
+ init_image = load_image(url).resize((128, 128))
1087
+ init_image.save("original-image.png")
1088
+
1089
+ # Use fp16 to speed up generation
1090
+ with paddle.amp.auto_cast(True):
1091
+ image = pipe(init_image, num_inference_steps=100, eta=1).images[0]
1092
+
1093
+ image.save("super-resolution-image.png")
1094
+ ```
1095
+ <div align="center">
1096
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209328660-9700fdc3-72b3-43bd-9a00-23b370ba030b.png">
1097
+ <center>Original image</center>
1098
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209328479-4eaea5d8-aa4a-4f31-aa2a-b47e3c730f15.png">
1099
+ <center>Generated image</center>
1100
+ </div>
1101
+ </details>
1102
+
1103
+
1104
+ <details><summary>&emsp;Image Inpainting</summary>
1105
+
1106
+ #### image_inpainting-repaint
1107
+ ```python
1108
+ from ppdiffusers import RePaintPipeline, RePaintScheduler
1109
+ from ppdiffusers.utils import load_image
1110
+
1111
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png"
1112
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png"
1113
+
1114
+ # Load the original image and the mask as PIL images
1115
+ original_image = load_image(img_url).resize((256, 256))
1116
+ mask_image = load_image(mask_url).resize((256, 256))
1117
+
1118
+ scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler")
1119
+ pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
1120
+
1121
+ output = pipe(
1122
+ original_image=original_image,
1123
+ mask_image=mask_image,
1124
+ num_inference_steps=250,
1125
+ eta=0.0,
1126
+ jump_length=10,
1127
+ jump_n_sample=10,
1128
+ )
1129
+ inpainted_image = output.images[0]
1130
+
1131
+ inpainted_image.save("repaint-image.png")
1132
+ ```
1133
+ <div align="center">
1134
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329052-b6fc2aaf-1a59-49a3-92ef-60180fdffd81.png">
1135
+ <center>Original image</center>
1136
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329048-4fe12176-32a0-4800-98f2-49bd8d593799.png">
1137
+ <center>Mask image</center>
1138
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329241-b7e4d99e-468a-4b95-8829-d77ee14bfe98.png">
1139
+ <center>Generated image</center>
1140
+ </div>
1141
+ </details>
1142
+
1143
+
1144
+
1145
+ <details><summary>&emsp;Image Variation</summary>
1146
+
1147
+ #### image_variation-versatile_diffusion
1148
+ ```python
1149
+ from ppdiffusers import VersatileDiffusionImageVariationPipeline
1150
+ from ppdiffusers.utils import load_image
1151
+
1152
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
1153
+ image = load_image(url)
1154
+
1155
+ pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
1156
+
1157
+ image = pipe(image).images[0]
1158
+ image.save("versatile-diffusion-car_variation.png")
1159
+ ```
1160
+ <div align="center">
1161
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209331434-51f6cdbd-b8e4-4faa-8e49-1cc852e35603.jpg">
1162
+ <center>Original image</center>
1163
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209331591-f6cc4cd8-8430-4627-8d22-bf404fb2bfdd.png">
1164
+ <center>Generated image</center>
1165
+ </div>
1166
+ </details>
1167
+
1168
+
1169
+
1170
+
1171
+
1172
+ ### Audio
1173
+ <details>
1174
+ <summary>&emsp;Unconditional Audio Generation</summary>
1175
+
1176
+ #### unconditional_audio_generation-audio_diffusion
1177
+
1178
+ ```python
1179
+ from scipy.io.wavfile import write
1180
+ from ppdiffusers import AudioDiffusionPipeline
1181
+ import paddle
1182
+
1183
+ # Load the model and scheduler
1184
+ pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
1185
+ pipe.set_progress_bar_config(disable=None)
1186
+ generator = paddle.Generator().manual_seed(42)
1187
+
1188
+ output = pipe(generator=generator)
1189
+ audio = output.audios[0]
1190
+ image = output.images[0]
1191
+
1192
+ # Save the audio locally
1193
+ for i, audio in enumerate(audio):
1194
+ write(f"audio_diffusion_test{i}.wav", pipe.mel.config.sample_rate, audio.transpose())
1195
+
1196
+ # Save the image
1197
+ image.save("audio_diffusion_test.png")
1198
+ ```
1199
+ <div align = "center">
1200
+ <thead>
1201
+ </thead>
1202
+ <tbody>
1203
+ <tr>
1204
+ <td align = "center">
1205
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/teticio/data/audio_diffusion_test0.wav" rel="nofollow">
1206
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200 style="max-width: 100%;"></a><br>
1207
+ </td>
1208
+ </tr>
1209
+ </tbody>
1210
+ </div>
1211
+
1212
+ <div align="center">
1213
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209342125-93e8715e-895b-4115-9e1e-e65c6c2cd95a.png">
1214
+ </div>
1215
+
1216
+
1217
+ #### unconditional_audio_generation-spectrogram_diffusion
1218
+
1219
+ ```python
1220
+ import paddle
1221
+ import scipy
1222
+
1223
+ from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline
1224
+ from ppdiffusers.utils.download_utils import ppdiffusers_url_download
1225
+
1226
+ # Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid
1227
+ mid_file_path = ppdiffusers_url_download(
1228
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="."
1229
+ )
1230
+ pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
1231
+ processor = MidiProcessor()
1232
+ output = pipe(processor(mid_file_path))
1233
+ audio = output.audios[0]
1234
+
1235
+ output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav"
1236
+ # save the audio sample as a .wav file
1237
+ scipy.io.wavfile.write(output_path, rate=16000, data=audio)
1238
+ ```
1239
+ <div align = "center">
1240
+ <thead>
1241
+ </thead>
1242
+ <tbody>
1243
+ <tr>
1244
+ <td align = "center">
1245
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/westfish/develop_ppdiffusers_data/beethoven_hammerklavier_2.wav" rel="nofollow">
1246
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200 style="max-width: 100%;"></a><br>
1247
+ </td>
1248
+ </tr>
1249
+ </tbody>
1250
+ </div>
1251
+ </details>
1252
+
1253
+
1254
+
1255
+ ## License
1256
+ PPDiffusers is released under the [Apache-2.0 License](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE).
1257
+
1258
+ Stable Diffusion is released under the [CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license).
1259
+ > The CreativeML OpenRAIL M is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which this license is based.
1260
+
1261
+ Stable Diffusion 3 is released under the [Stability Community License](https://stability.ai/license).
1262
+ > Community License: Free for research, non-commercial, and commercial use for organisations or individuals with less than $1M annual revenue. You only need a paid Enterprise license if your yearly revenues exceed USD$1M and you use Stability AI models in commercial products or services. Read more: https://stability.ai/license
1263
+
1264
+ ## Acknowledge
1265
+ We drew on the excellent design of 🤗 Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) for working with pretrained diffusion models; our thanks go to the Hugging Face authors and their open-source community.
1266
+
1267
+ ## Citation
1268
+
1269
+ ```bibtex
1270
+ @misc{ppdiffusers,
1271
+ author = {PaddlePaddle Authors},
1272
+ title = {PPDiffusers: State-of-the-art diffusion model toolkit based on PaddlePaddle},
1273
+ year = {2022},
1274
+ publisher = {GitHub},
1275
+ journal = {GitHub repository},
1276
+ howpublished = {\url{https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers}}
1277
+ }
1278
+ ```
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import os
16
+
17
+ # set USE_PPXFORMERS=False to avoid using ppxformers
18
+ os.environ["USE_PPXFORMERS"] = "False"
19
+ from pathlib import Path
20
+ from types import MethodType
21
+
22
+ import paddle
23
+
24
+ from ppdiffusers import (
25
+ ControlNetModel,
26
+ PaddleInferRuntimeModel,
27
+ PaddleInferStableDiffusionControlNetPipeline,
28
+ StableDiffusionControlNetPipeline,
29
+ UNet2DConditionModel,
30
+ )
31
+
32
+
33
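+ # ControlNetWithUnetModel wraps the ControlNet and the UNet into a single
+ # paddle.nn.Layer: the ControlNet first produces the down/mid residuals for the
+ # conditioning image, and those residuals are fed into the UNet. Wrapping both
+ # lets the pair be exported below as one static-graph "unet" model.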
+ class ControlNetWithUnetModel(paddle.nn.Layer):
34
+ def __init__(
35
+ self,
36
+ unet,
37
+ controlnet,
38
+ ):
39
+ super().__init__()
40
+ self.unet = unet
41
+ self.controlnet = controlnet
42
+
43
+ def forward(
44
+ self,
45
+ sample,
46
+ timestep,
47
+ encoder_hidden_states,
48
+ controlnet_cond,
49
+ controlnet_conditioning_scale,
50
+ return_dict=True,
51
+ ):
52
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
53
+ sample,
54
+ timestep,
55
+ encoder_hidden_states=encoder_hidden_states,
56
+ controlnet_cond=controlnet_cond,
57
+ conditioning_scale=controlnet_conditioning_scale,
58
+ return_dict=False,
59
+ )
60
+
61
+ noise_pred = self.unet(
62
+ sample,
63
+ timestep,
64
+ encoder_hidden_states=encoder_hidden_states,
65
+ down_block_additional_residuals=down_block_res_samples,
66
+ mid_block_additional_residual=mid_block_res_sample,
67
+ return_dict=return_dict,
68
+ )
69
+ return noise_pred
70
+
71
+
72
+ def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
73
+ model_path: str,
74
+ controlnet_model_path: str,
75
+ output_path: str,
76
+ sample: bool = False,
77
+ height: int = None,
78
+ width: int = None,
79
+ ):
80
+ unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=False, subfolder="unet")
81
+ controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=False)
82
+
83
+ pipeline = StableDiffusionControlNetPipeline.from_pretrained(
84
+ model_path,
85
+ unet=unet_tmp,
86
+ controlnet=controlnet_tmp,
87
+ safety_checker=None,
88
+ feature_extractor=None,
89
+ requires_safety_checker=False,
90
+ )
91
+ output_path = Path(output_path)
92
+ # calculate latent's H and W
93
+ latent_height = height // 8 if height is not None else None
94
+ latent_width = width // 8 if width is not None else None
95
+ # get arguments
96
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
97
+ unet_channels = pipeline.unet.config.in_channels # 4
98
+ vae_in_channels = pipeline.vae.config.in_channels # 3
99
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
100
+ print(
101
+ f"cross_attention_dim: {cross_attention_dim}\n",
102
+ f"unet_in_channels: {unet_channels}\n",
103
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
104
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
105
+ )
106
+ # 1. Convert text_encoder
107
+ text_encoder = paddle.jit.to_static(
108
+ pipeline.text_encoder,
109
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
110
+ )
111
+ save_path = os.path.join(args.output_path, "text_encoder", "inference")
112
+ paddle.jit.save(text_encoder, save_path)
113
+ print(f"Save text_encoder model in {save_path} successfully.")
114
+ del pipeline.text_encoder
115
+
116
+ # wrap unet + controlnet
117
+ new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet)
118
+
119
+ # 2. Convert unet
120
+ unet = paddle.jit.to_static(
121
+ new_unet,
122
+ input_spec=[
123
+ paddle.static.InputSpec(
124
+ shape=[None, unet_channels, latent_height, latent_width],
125
+ dtype="float32",
126
+ name="sample",
127
+ ), # sample
128
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
129
+ paddle.static.InputSpec(
130
+ shape=[None, None, cross_attention_dim],
131
+ dtype="float32",
132
+ name="encoder_hidden_states",
133
+ ), # encoder_hidden_states
134
+ paddle.static.InputSpec(
135
+ shape=[None, vae_in_channels, height, width],
136
+ dtype="float32",
137
+ name="controlnet_cond",
138
+ ), # controlnet_cond
139
+ paddle.static.InputSpec(
140
+ shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1],
141
+ dtype="float32",
142
+ name="controlnet_conditioning_scale",
143
+ ), # controlnet_conditioning_scale
144
+ ],
145
+ )
146
+
147
+ save_path = os.path.join(args.output_path, "unet", "inference")
148
+ paddle.jit.save(unet, save_path)
149
+ print(f"Save unet model in {save_path} successfully.")
150
+ del pipeline.unet
151
+ del new_unet
152
+
153
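+ # The VAE encoder can be exported with either a deterministic or a stochastic
+ # forward: mode() returns the mode of the latent distribution, while sample()
+ # draws a random latent. Which one is used is controlled by the --sample flag.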
+ def forward_vae_encoder_mode(self, z):
154
+ return self.encode(z, True).latent_dist.mode()
155
+
156
+ def forward_vae_encoder_sample(self, z):
157
+ return self.encode(z, True).latent_dist.sample()
158
+
159
+ # 3. Convert vae encoder
160
+ vae_encoder = pipeline.vae
161
+ if sample:
162
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
163
+ else:
164
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
165
+
166
+ vae_encoder = paddle.jit.to_static(
167
+ vae_encoder,
168
+ input_spec=[
169
+ paddle.static.InputSpec(
170
+ shape=[None, vae_in_channels, height, width],
171
+ dtype="float32",
172
+ name="sample", # N, C, H, W
173
+ ), # latent
174
+ ],
175
+ )
176
+ # Save vae_encoder in static graph model.
177
+ save_path = os.path.join(args.output_path, "vae_encoder", "inference")
178
+ paddle.jit.save(vae_encoder, save_path)
179
+ print(f"Save vae_encoder model in {save_path} successfully.")
180
+
181
+ # 4. Convert vae decoder
182
+ vae_decoder = pipeline.vae
183
+
184
+ def forward_vae_decoder(self, z):
185
+ return self.decode(z, True).sample
186
+
187
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
188
+ vae_decoder = paddle.jit.to_static(
189
+ vae_decoder,
190
+ input_spec=[
191
+ paddle.static.InputSpec(
192
+ shape=[None, vae_latent_channels, latent_height, latent_width],
193
+ dtype="float32",
194
+ name="latent_sample",
195
+ ), # latent_sample
196
+ ],
197
+ )
198
+ # Save vae_decoder in static graph model.
199
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference")
200
+ paddle.jit.save(vae_decoder, save_path)
201
+ print(f"Save vae_decoder model in {save_path} successfully.")
202
+ del pipeline.vae
203
+
204
+ paddleinfer_pipeline = PaddleInferStableDiffusionControlNetPipeline(
205
+ vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"),
206
+ vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"),
207
+ text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"),
208
+ unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"),
209
+ tokenizer=pipeline.tokenizer,
210
+ scheduler=pipeline.scheduler,
211
+ safety_checker=None,
212
+ feature_extractor=None,
213
+ image_encoder=None,
214
+ requires_safety_checker=False,
215
+ )
216
+ paddleinfer_pipeline.save_pretrained(str(output_path))
217
+ print("PaddleInfer pipeline saved to", output_path)
218
+
219
+
220
+ if __name__ == "__main__":
221
+ parser = argparse.ArgumentParser()
222
+
223
+ parser.add_argument(
224
+ "--pretrained_model_name_or_path",
225
+ type=str,
226
+ default="runwayml/stable-diffusion-v1-5",
227
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
228
+ )
229
+ parser.add_argument(
230
+ "--controlnet_pretrained_model_name_or_path",
231
+ type=str,
232
+ default="lllyasviel/sd-controlnet-canny",
233
+ help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).",
234
+ )
235
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
236
+ parser.add_argument(
237
+ "--sample",
238
+ action="store_true",
239
+ default=False,
240
+ help="Export the vae encoder in mode or sample",
241
+ )
242
+ parser.add_argument(
243
+ "--height",
244
+ type=int,
245
+ default=None,
246
+ help="The height of output images. Default: None",
247
+ )
248
+ parser.add_argument(
249
+ "--width",
250
+ type=int,
251
+ default=None,
252
+ help="The width of output images. Default: None",
253
+ )
254
+ args = parser.parse_args()
255
+
256
+ convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
257
+ args.pretrained_model_name_or_path,
258
+ args.controlnet_pretrained_model_name_or_path,
259
+ args.output_path,
260
+ args.sample,
261
+ args.height,
262
+ args.width,
263
+ )
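+
+ # Example invocation (a sketch; the model names are the argparse defaults above and
+ # the output directory matches the one used by scripts/inference.sh):
+ # python export_model.py \
+ #     --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
+ #     --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \
+ #     --output_path static_model/stable-diffusion-v1-5-canny \
+ #     --height 512 --width 512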
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # attention raw fp16
16
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
17
+
18
+ # attention cutlass fp16
19
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
20
+
21
+ # attention flash fp16
22
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
23
+
24
+
25
+ # attention raw fp32
26
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
27
+
28
+ # attention cutlass fp32
29
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
30
+
31
+ # attention flash fp32
32
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # attention raw fp16
16
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
17
+
18
+ # attention sdp fp16
19
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
20
+
21
+
22
+ # attention raw fp32
23
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
24
+
25
+ # attention sdp fp32
26
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # ==============================================================================
16
+ # use paddle as backend to inference static model is not fast,
17
+ # this script is used to make sure the inference is correct.
18
+ # ==============================================================================
19
+ # text2img
20
+ python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name text2img
21
+
22
+ # img2img
23
+ python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name img2img
24
+
25
+ # inpaint
26
+ python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name inpaint_legacy
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ # set USE_PPXFORMERS=False to avoid using ppxformers
19
+ os.environ["USE_PPXFORMERS"] = "False"
20
+ from pathlib import Path
21
+ from types import MethodType
22
+
23
+ import paddle
24
+ from unet_2d_condition_housing import UNet2DConditionModelSDHousing
25
+
26
+ from ppdiffusers import (
27
+ PaddleInferRuntimeModel,
28
+ PaddleInferStableDiffusionInpaintPipeline,
29
+ PaddleInferStableDiffusionMegaPipeline,
30
+ StableDiffusionPipeline,
31
+ )
32
+
33
+
34
+ def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
35
+ model_path: str,
36
+ output_path: str,
37
+ sample: bool = False,
38
+ height: int = None,
39
+ width: int = None,
40
+ ):
41
+ # specify unet model with unet pre_temb_act opt enabled.
42
+ unet_model = UNet2DConditionModelSDHousing.from_pretrained(
43
+ model_path, resnet_pre_temb_non_linearity=False, subfolder="unet"
44
+ )
45
+ pipeline = StableDiffusionPipeline.from_pretrained(
46
+ model_path,
47
+ unet=unet_model,
48
+ safety_checker=None,
49
+ )
50
+ output_path = Path(output_path)
51
+ # calculate latent's H and W
52
+ latent_height = height // 8 if height is not None else None
53
+ latent_width = width // 8 if width is not None else None
54
+ # get arguments
55
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
56
+ unet_channels = pipeline.unet.config.in_channels # 4 or 9
57
+ vae_in_channels = pipeline.vae.config.in_channels # 3
58
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
59
+ print(
60
+ f"cross_attention_dim: {cross_attention_dim}\n",
61
+ f"unet_in_channels: {unet_channels}\n",
62
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
63
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
64
+ )
65
+ # 1. Convert text_encoder
66
+ text_encoder = paddle.jit.to_static(
67
+ pipeline.text_encoder,
68
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
69
+ )
70
+ save_path = os.path.join(args.output_path, "text_encoder", "inference")
71
+ paddle.jit.save(text_encoder, save_path)
72
+ print(f"Save text_encoder model in {save_path} successfully.")
73
+ del pipeline.text_encoder
74
+
75
+ # 2. Convert unet
76
+ unet = paddle.jit.to_static(
77
+ pipeline.unet,
78
+ input_spec=[
79
+ paddle.static.InputSpec(
80
+ shape=[None, unet_channels, latent_height, latent_width],
81
+ dtype="float32",
82
+ name="sample",
83
+ ), # sample
84
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
85
+ paddle.static.InputSpec(
86
+ shape=[None, None, cross_attention_dim],
87
+ dtype="float32",
88
+ name="encoder_hidden_states",
89
+ ), # encoder_hidden_states
90
+ ],
91
+ )
92
+ save_path = os.path.join(args.output_path, "unet", "inference")
93
+ paddle.jit.save(unet, save_path)
94
+ print(f"Save unet model in {save_path} successfully.")
95
+ del pipeline.unet
96
+
97
+ def forward_vae_encoder_mode(self, z):
98
+ return self.encode(z, True).latent_dist.mode()
99
+
100
+ def forward_vae_encoder_sample(self, z):
101
+ return self.encode(z, True).latent_dist.sample()
102
+
103
+ # 3. Convert vae encoder
104
+ vae_encoder = pipeline.vae
105
+ if sample:
106
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
107
+ else:
108
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
109
+
110
+ vae_encoder = paddle.jit.to_static(
111
+ vae_encoder,
112
+ input_spec=[
113
+ paddle.static.InputSpec(
114
+ shape=[None, vae_in_channels, height, width],
115
+ dtype="float32",
116
+ name="sample", # N, C, H, W
117
+ ), # latent
118
+ ],
119
+ )
120
+ # Save vae_encoder in static graph model.
121
+ save_path = os.path.join(args.output_path, "vae_encoder", "inference")
122
+ paddle.jit.save(vae_encoder, save_path)
123
+ print(f"Save vae_encoder model in {save_path} successfully.")
124
+
125
+ # 4. Convert vae decoder
126
+ vae_decoder = pipeline.vae
127
+
128
+ def forward_vae_decoder(self, z):
129
+ return self.decode(z, True).sample
130
+
131
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
132
+ vae_decoder = paddle.jit.to_static(
133
+ vae_decoder,
134
+ input_spec=[
135
+ paddle.static.InputSpec(
136
+ shape=[None, vae_latent_channels, latent_height, latent_width],
137
+ dtype="float32",
138
+ name="latent_sample",
139
+ ), # latent_sample
140
+ ],
141
+ )
142
+ # Save vae_decoder in static graph model.
143
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference")
144
+ paddle.jit.save(vae_decoder, save_path)
145
+ print(f"Save vae_decoder model in {save_path} successfully.")
146
+ del pipeline.vae
147
+
148
+ if "inpainting" in model_path:
149
+ fd_pipe_cls = PaddleInferStableDiffusionInpaintPipeline
150
+ else:
151
+ fd_pipe_cls = PaddleInferStableDiffusionMegaPipeline
152
+
153
+ paddleinfer_pipeline = fd_pipe_cls(
154
+ vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"),
155
+ vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"),
156
+ text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"),
157
+ unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"),
158
+ tokenizer=pipeline.tokenizer,
159
+ scheduler=pipeline.scheduler,
160
+ feature_extractor=pipeline.feature_extractor,
161
+ image_encoder=None,
162
+ safety_checker=None,
163
+ requires_safety_checker=False,
164
+ )
165
+ paddleinfer_pipeline.save_pretrained(str(output_path))
166
+ print("PaddleInfer pipeline saved to", output_path)
167
+
168
+
169
+ if __name__ == "__main__":
170
+ parser = argparse.ArgumentParser()
171
+
172
+ parser.add_argument(
173
+ "--pretrained_model_name_or_path",
174
+ type=str,
175
+ required=True,
176
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
177
+ )
178
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
179
+ parser.add_argument(
180
+ "--sample",
181
+ action="store_true",
182
+ default=False,
183
+ help="Export the vae encoder in mode or sample",
184
+ )
185
+ parser.add_argument(
186
+ "--height",
187
+ type=int,
188
+ default=None,
189
+ help="The height of output images. Default: None",
190
+ )
191
+ parser.add_argument(
192
+ "--width",
193
+ type=int,
194
+ default=None,
195
+ help="The width of output images. Default: None",
196
+ )
197
+ args = parser.parse_args()
198
+
199
+ convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
200
+ args.pretrained_model_name_or_path,
201
+ args.output_path,
202
+ args.sample,
203
+ args.height,
204
+ args.width,
205
+ )
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py ADDED
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ # isort: split
20
+ import paddle
21
+ import paddle.inference as paddle_infer
22
+
23
+ # isort: split
24
+ import numpy as np
25
+ from paddlenlp.trainer.argparser import strtobool
26
+ from tqdm.auto import trange
27
+
28
+ from ppdiffusers import ( # noqa
29
+ DiffusionPipeline,
30
+ PaddleInferStableDiffusionMegaPipeline,
31
+ )
32
+ from ppdiffusers.utils import load_image
33
+
34
+
35
+ def parse_arguments():
36
+
37
+ parser = argparse.ArgumentParser()
38
+ parser.add_argument(
39
+ "--model_dir",
40
+ default="runwayml/stable-diffusion-v1-5@paddleinfer",
41
+ help="The model directory of diffusion_model.",
42
+ )
43
+ parser.add_argument(
44
+ "--inference_steps",
45
+ type=int,
46
+ default=50,
47
+ help="The number of unet inference steps.",
48
+ )
49
+ parser.add_argument(
50
+ "--benchmark_steps",
51
+ type=int,
52
+ default=10,
53
+ help="The number of performance benchmark steps.",
54
+ )
55
+ parser.add_argument(
56
+ "--backend",
57
+ type=str,
58
+ default="paddle_tensorrt",
59
+ choices=["paddle", "paddle_tensorrt"],
60
+ help="The inference runtime backend of unet model and text encoder model.",
61
+ )
62
+ parser.add_argument(
63
+ "--device",
64
+ type=str,
65
+ default="gpu",
66
+ choices=[
67
+ "cpu",
68
+ "gpu",
69
+ "huawei_ascend_npu",
70
+ "kunlunxin_xpu",
71
+ ],
72
+ help="The inference runtime device of models.",
73
+ )
74
+ parser.add_argument(
75
+ "--task_name",
76
+ type=str,
77
+ default="text2img",
78
+ choices=[
79
+ "text2img",
80
+ "img2img",
81
+ "inpaint_legacy",
82
+ "all",
83
+ ],
84
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
85
+ )
86
+ parser.add_argument(
87
+ "--parse_prompt_type",
88
+ type=str,
89
+ default="lpw",
90
+ choices=[
91
+ "raw",
92
+ "lpw",
93
+ ],
94
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
95
+ )
96
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
97
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
98
+ parser.add_argument(
99
+ "--scheduler",
100
+ type=str,
101
+ default="preconfig-euler-ancestral",
102
+ choices=[
103
+ "pndm",
104
+ "lms",
105
+ "euler",
106
+ "euler-ancestral",
107
+ "preconfig-euler-ancestral",
108
+ "dpm-multi",
109
+ "dpm-single",
110
+ "unipc-multi",
111
+ "ddim",
112
+ "ddpm",
113
+ "deis-multi",
114
+ "heun",
115
+ "kdpm2-ancestral",
116
+ "kdpm2",
117
+ ],
118
+ help="The scheduler type of stable diffusion.",
119
+ )
120
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
121
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
122
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
123
+ parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image")
124
+ parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image")
125
+ parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?")
126
+ parser.add_argument(
127
+ "--tune",
128
+ type=strtobool,
129
+ default=False,
130
+ help="Whether to tune the shape of tensorrt engine.",
131
+ )
132
+
133
+ return parser.parse_args()
134
+
135
+
136
+ def create_paddle_inference_runtime(
137
+ model_dir="",
138
+ model_name="",
139
+ use_trt=False,
140
+ precision_mode=paddle_infer.PrecisionType.Half,
141
+ device_id=0,
142
+ disable_paddle_trt_ops=[],
143
+ disable_paddle_pass=[],
144
+ workspace=24 * 1024 * 1024 * 1024,
145
+ tune=False,
146
+ ):
147
+ config = paddle_infer.Config()
148
+ config.enable_memory_optim()
149
+ shape_file = f"{model_dir}/{model_name}/shape_range_info.pbtxt"
150
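+ # When tune=True this runtime only records dynamic-shape ranges into
+ # shape_range_info.pbtxt (a warm-up/collection pass with IR optimization off);
+ # otherwise the previously collected ranges are consumed by the TensorRT
+ # engine configured below.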
+ if tune:
151
+ config.collect_shape_range_info(shape_file)
152
+ config.switch_ir_optim(False)
153
+ else:
154
+ config.enable_new_executor()
155
+ if str(os.environ.get("FLAGS_enable_pir_in_executor")).lower() in ("true", "1"):
156
+ config.enable_new_ir()
157
+ if str(os.environ.get("FLAGS_use_cinn")).lower() in ("true", "1"):
158
+ config.enable_cinn()
159
+
160
+ if device_id != -1:
161
+ config.use_gpu()
162
+ config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=device_id, precision_mode=precision_mode)
163
+ for pass_name in disable_paddle_pass:
164
+ config.delete_pass(pass_name)
165
+ if use_trt:
166
+ config.enable_tensorrt_engine(
167
+ workspace_size=workspace,
168
+ precision_mode=precision_mode,
169
+ max_batch_size=1,
170
+ min_subgraph_size=3,
171
+ use_static=True,
172
+ )
173
+ config.enable_tensorrt_memory_optim()
174
+ config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
175
+ cache_file = os.path.join(model_dir, model_name, "_opt_cache/")
176
+ config.set_optim_cache_dir(cache_file)
177
+ if precision_mode != paddle_infer.PrecisionType.Half:
178
+ only_fp16_passes = [
179
+ "trt_cross_multihead_matmul_fuse_pass",
180
+ "trt_flash_multihead_matmul_fuse_pass",
181
+ "preln_elementwise_groupnorm_act_pass",
182
+ "elementwise_groupnorm_act_pass",
183
+ ]
184
+ for curr_pass in only_fp16_passes:
185
+ config.delete_pass(curr_pass)
186
+ return config
187
+
188
+
189
+ def main(args):
190
+ if args.device_id == -1:
191
+ paddle.set_device("cpu")
192
+ else:
193
+ paddle.set_device(f"gpu:{args.device_id}")
194
+
195
+ seed = 1024
196
+ min_image_size = 512
197
+ max_image_size = 768
198
+ max_image_size = max(min_image_size, max_image_size)
199
+
200
+ # 4. Init runtime
201
+ only_fp16_passes = [
202
+ "trt_cross_multihead_matmul_fuse_pass",
203
+ "trt_flash_multihead_matmul_fuse_pass",
204
+ "preln_elementwise_groupnorm_act_pass",
205
+ "elementwise_groupnorm_act_pass",
206
+ ]
207
+ no_need_passes = [
208
+ "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass",
209
+ "add_support_int8_pass",
210
+ "elementwise_groupnorm_act_pass",
211
+ "groupnorm_act_pass",
212
+ "preln_elementwise_groupnorm_act_pass",
213
+ ]
214
+ paddle_delete_passes = dict(
215
+ text_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
216
+ text_encoder_2=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
217
+ vae_encoder=only_fp16_passes + [] if args.use_fp16 else [],
218
+ vae_decoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
219
+ unet=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
220
+ image_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
221
+ )
222
+ args.use_trt = args.backend == "paddle_tensorrt"
223
+ precision_mode = paddle_infer.PrecisionType.Half if args.use_fp16 else paddle_infer.PrecisionType.Float32
224
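+ # Build one inference config per sub-model. Note that only the UNet honours the
+ # --backend choice (use_trt=args.use_trt); the text encoder and VAE components
+ # are always run with the plain Paddle GPU runtime here.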
+ infer_configs = dict(
225
+ text_encoder=create_paddle_inference_runtime(
226
+ model_dir=args.model_dir,
227
+ use_trt=False,
228
+ model_name="text_encoder",
229
+ precision_mode=paddle_infer.PrecisionType.Half,
230
+ device_id=args.device_id,
231
+ disable_paddle_trt_ops=["range", "lookup_table_v2"],
232
+ disable_paddle_pass=paddle_delete_passes.get("text_encoder", []),
233
+ tune=False,
234
+ ),
235
+ vae_encoder=create_paddle_inference_runtime(
236
+ model_dir=args.model_dir,
237
+ model_name="vae_encoder",
238
+ use_trt=False,
239
+ precision_mode=paddle_infer.PrecisionType.Half,
240
+ device_id=args.device_id,
241
+ disable_paddle_pass=paddle_delete_passes.get("vae_encoder", []),
242
+ tune=False,
243
+ ),
244
+ vae_decoder=create_paddle_inference_runtime(
245
+ model_dir=args.model_dir,
246
+ model_name="vae_decoder",
247
+ use_trt=False,
248
+ precision_mode=paddle_infer.PrecisionType.Float32,
249
+ device_id=args.device_id,
250
+ disable_paddle_pass=paddle_delete_passes.get("vae_decoder", []),
251
+ tune=False,
252
+ ),
253
+ unet=create_paddle_inference_runtime(
254
+ model_dir=args.model_dir,
255
+ model_name="unet",
256
+ use_trt=args.use_trt,
257
+ precision_mode=precision_mode,
258
+ device_id=args.device_id,
259
+ disable_paddle_pass=no_need_passes,
260
+ tune=args.tune,
261
+ ),
262
+ )
263
+ pipe = PaddleInferStableDiffusionMegaPipeline.from_pretrained(
264
+ args.model_dir,
265
+ infer_configs=infer_configs,
266
+ use_optim_cache=False,
267
+ )
268
+ pipe.set_progress_bar_config(disable=False)
269
+ pipe.change_scheduler(args.scheduler)
270
+ parse_prompt_type = args.parse_prompt_type
271
+ width = args.width
272
+ height = args.height
273
+
274
+ folder = f"results-{args.backend}"
275
+ os.makedirs(folder, exist_ok=True)
276
+ if args.task_name in ["text2img", "all"]:
277
+ # text2img
278
+ prompt = "a photo of an astronaut riding a horse on mars"
279
+ time_costs = []
280
+ # warmup
281
+ pipe.text2img(
282
+ prompt,
283
+ num_inference_steps=20,
284
+ height=height,
285
+ width=width,
286
+ # parse_prompt_type=parse_prompt_type,
287
+ )
288
+ print("==> Test text2img performance.")
289
+ for step in trange(args.benchmark_steps):
290
+ start = time.time()
291
+ paddle.seed(seed)
292
+ images = pipe.text2img(
293
+ prompt,
294
+ output_type="pil",
295
+ num_inference_steps=args.inference_steps,
296
+ height=height,
297
+ width=width,
298
+ # parse_prompt_type=parse_prompt_type,
299
+ ).images
300
+ latency = time.time() - start
301
+ time_costs += [latency]
302
+ # print(f"No {step:3d} time cost: {latency:2f} s")
303
+ print(
304
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
305
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
306
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
307
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
308
+ )
309
+ images[0].save(f"{folder}/text2img.png")
310
+
311
+ if args.task_name in ["img2img", "all"]:
312
+ # img2img
313
+ img_url = (
314
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
315
+ )
316
+ init_image = load_image(img_url)
317
+ prompt = "A fantasy landscape, trending on artstation"
318
+ time_costs = []
319
+ # warmup
320
+ pipe.img2img(
321
+ prompt,
322
+ image=init_image,
323
+ num_inference_steps=20,
324
+ height=height,
325
+ width=width,
326
+ strength=args.strength,
327
+ # parse_prompt_type=parse_prompt_type,
328
+ )
329
+ print("==> Test img2img performance.")
330
+ for step in trange(args.benchmark_steps):
331
+ start = time.time()
332
+ paddle.seed(seed)
333
+ images = pipe.img2img(
334
+ prompt,
335
+ image=init_image,
336
+ num_inference_steps=args.inference_steps,
337
+ height=height,
338
+ width=width,
339
+ strength=args.strength,
340
+ # parse_prompt_type=parse_prompt_type,
341
+ ).images
342
+ latency = time.time() - start
343
+ time_costs += [latency]
344
+ # print(f"No {step:3d} time cost: {latency:2f} s")
345
+ print(
346
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
347
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
348
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
349
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
350
+ )
351
+ images[0].save(f"{folder}/img2img.png")
352
+
353
+ if args.task_name in ["inpaint", "inpaint_legacy", "all"]:
354
+ img_url = (
355
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
356
+ )
357
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
358
+ init_image = load_image(img_url)
359
+ mask_image = load_image(mask_url)
360
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
361
+ time_costs = []
362
+ # warmup
363
+ if args.task_name in ["inpaint_legacy", "all"]:
364
+ call_fn = pipe.inpaint_legacy
365
+ task_name = "inpaint_legacy"
366
+ else:
367
+ call_fn = pipe.inpaint
368
+ task_name = "inpaint"
369
+ call_fn(
370
+ prompt,
371
+ image=init_image,
372
+ mask_image=mask_image,
373
+ num_inference_steps=20,
374
+ height=height,
375
+ width=width,
376
+ strength=args.strength,
377
+ parse_prompt_type=parse_prompt_type,
378
+ )
379
+ print(f"==> Test {task_name} performance.")
380
+ for step in trange(args.benchmark_steps):
381
+ start = time.time()
382
+ paddle.seed(seed)
383
+ images = call_fn(
384
+ prompt,
385
+ image=init_image,
386
+ mask_image=mask_image,
387
+ num_inference_steps=args.inference_steps,
388
+ height=height,
389
+ width=width,
390
+ strength=args.strength,
391
+ parse_prompt_type=parse_prompt_type,
392
+ ).images
393
+ latency = time.time() - start
394
+ time_costs += [latency]
395
+ # print(f"No {step:3d} time cost: {latency:2f} s")
396
+ print(
397
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
398
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
399
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
400
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
401
+ )
402
+
403
+ images[0].save(f"{folder}/{task_name}.png")
404
+
405
+
406
+ if __name__ == "__main__":
407
+ args = parse_arguments()
408
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py ADDED
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+ import warnings
19
+
20
+ import cv2
21
+ import numpy as np
22
+ import paddle
23
+ from PIL import Image
24
+ from tqdm.auto import trange
25
+
26
+ from ppdiffusers import (
27
+ DDIMScheduler,
28
+ DDPMScheduler,
29
+ DEISMultistepScheduler,
30
+ DPMSolverMultistepScheduler,
31
+ DPMSolverSinglestepScheduler,
32
+ EulerAncestralDiscreteScheduler,
33
+ EulerDiscreteScheduler,
34
+ HeunDiscreteScheduler,
35
+ KDPM2AncestralDiscreteScheduler,
36
+ KDPM2DiscreteScheduler,
37
+ LMSDiscreteScheduler,
38
+ PNDMScheduler,
39
+ StableDiffusionImg2ImgPipeline,
40
+ StableDiffusionInpaintPipeline,
41
+ StableDiffusionPipeline,
42
+ UniPCMultistepScheduler,
43
+ )
44
+ from ppdiffusers.utils import load_image
45
+
46
+
47
+ def get_canny_image(image, args):
48
+ if isinstance(image, Image.Image):
49
+ image = np.array(image)
50
+ image = cv2.Canny(image, args.low_threshold, args.high_threshold)
51
+ image = image[:, :, None]
52
+ image = np.concatenate([image, image, image], axis=2)
53
+ canny_image = Image.fromarray(image)
54
+ return canny_image
55
+
56
+
57
+ def strtobool(v):
58
+ if isinstance(v, bool):
59
+ return v
60
+ if v.lower() in ("yes", "true", "t", "y", "1"):
61
+ return True
62
+ elif v.lower() in ("no", "false", "f", "n", "0"):
63
+ return False
64
+ else:
65
+ raise ValueError(
66
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
67
+ )
68
+
69
+
70
+ def change_scheduler(self, scheduler_type="ddim"):
71
+ self.orginal_scheduler_config = self.scheduler.config
72
+ scheduler_type = scheduler_type.lower()
73
+ if scheduler_type == "pndm":
74
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
75
+ elif scheduler_type == "lms":
76
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
77
+ elif scheduler_type == "heun":
78
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
79
+ elif scheduler_type == "euler":
80
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
81
+ elif scheduler_type == "euler-ancestral":
82
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
83
+ elif scheduler_type == "dpm-multi":
84
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
85
+ elif scheduler_type == "dpm-single":
86
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
87
+ elif scheduler_type == "kdpm2-ancestral":
88
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
89
+ elif scheduler_type == "kdpm2":
90
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
91
+ elif scheduler_type == "unipc-multi":
92
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
93
+ elif scheduler_type == "ddim":
94
+ scheduler = DDIMScheduler.from_config(
95
+ self.orginal_scheduler_config,
96
+ steps_offset=1,
97
+ clip_sample=False,
98
+ set_alpha_to_one=False,
99
+ )
100
+ elif scheduler_type == "ddpm":
101
+ scheduler = DDPMScheduler.from_config(
102
+ self.orginal_scheduler_config,
103
+ )
104
+ elif scheduler_type == "deis-multi":
105
+ scheduler = DEISMultistepScheduler.from_config(
106
+ self.orginal_scheduler_config,
107
+ )
108
+ else:
109
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
110
+ return scheduler
111
+
112
+
113
+ def parse_arguments():
114
+
115
+ parser = argparse.ArgumentParser()
116
+ parser.add_argument(
117
+ "--pretrained_model_name_or_path",
118
+ type=str,
119
+ default="runwayml/stable-diffusion-v1-5",
120
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
121
+ )
122
+ parser.add_argument(
123
+ "--inference_steps",
124
+ type=int,
125
+ default=50,
126
+ help="The number of unet inference steps.",
127
+ )
128
+ parser.add_argument(
129
+ "--benchmark_steps",
130
+ type=int,
131
+ default=10,
132
+ help="The number of performance benchmark steps.",
133
+ )
134
+ parser.add_argument(
135
+ "--task_name",
136
+ type=str,
137
+ default="all",
138
+ choices=[
139
+ "text2img",
140
+ "img2img",
141
+ "inpaint_legacy",
142
+ "all",
143
+ ],
144
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
145
+ )
146
+ parser.add_argument(
147
+ "--parse_prompt_type",
148
+ type=str,
149
+ default="raw",
150
+ choices=[
151
+ "raw",
152
+ "lpw",
153
+ ],
154
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
155
+ )
156
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
157
+ parser.add_argument(
158
+ "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
159
+ )
160
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
161
+ parser.add_argument(
162
+ "--scheduler",
163
+ type=str,
164
+ default="euler-ancestral",
165
+ choices=[
166
+ "pndm",
167
+ "lms",
168
+ "euler",
169
+ "euler-ancestral",
170
+ "dpm-multi",
171
+ "dpm-single",
172
+ "unipc-multi",
173
+ "ddim",
174
+ "ddpm",
175
+ "deis-multi",
176
+ "heun",
177
+ "kdpm2-ancestral",
178
+ "kdpm2",
179
+ ],
180
+ help="The scheduler type of stable diffusion.",
181
+ )
182
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
183
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
184
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
185
+ return parser.parse_args()
186
+
187
+
188
+ def main(args):
189
+
190
+ seed = 1024
191
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
192
+ pipe = StableDiffusionPipeline.from_pretrained(
193
+ args.pretrained_model_name_or_path,
194
+ safety_checker=None,
195
+ feature_extractor=None,
196
+ requires_safety_checker=False,
197
+ paddle_dtype=paddle_dtype,
198
+ )
199
+ scheduler = change_scheduler(pipe, args.scheduler)
200
+ pipe.scheduler = scheduler
201
+
202
+ if args.attention_type == "all":
203
+ args.attention_type = ["raw", "cutlass", "flash"]
204
+ else:
205
+ args.attention_type = [args.attention_type]
206
+
207
+ for attention_type in args.attention_type:
208
+ if attention_type == "raw":
209
+ pipe.disable_xformers_memory_efficient_attention()
210
+ else:
211
+ try:
212
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
213
+ except Exception as e:
214
+ if attention_type == "flash":
215
+ warnings.warn(
216
+ "Attention type flash is not supported on your GPU! It requires a GPU such as a 3060, 3070, 3080, 3090, 4060, 4070, 4080, 4090, A30 or A100."
217
+ )
218
+ continue
219
+ else:
220
+ raise ValueError(e)
221
+
222
+ if not args.use_fp16 and attention_type == "flash":
223
+ print("Flash attention does not support dtype=float32! Please use float16 or bfloat16. Skipping this attention type.")
224
+ continue
225
+
226
+ width = args.width
227
+ height = args.height
228
+ pipe.set_progress_bar_config(disable=False)
229
+
230
+ folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
231
+ os.makedirs(folder, exist_ok=True)
232
+ if args.task_name in ["text2img", "all"]:
233
+ init_image = load_image(
234
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
235
+ )
236
+ # text2img
237
+ prompt = "bird"
238
+ time_costs = []
239
+ # warmup
240
+ pipe(
241
+ prompt,
242
+ num_inference_steps=10,
243
+ height=height,
244
+ width=width,
245
+ )
246
+ print("==> Test text2img performance.")
247
+ for step in trange(args.benchmark_steps):
248
+ start = time.time()
249
+ paddle.seed(seed)
250
+ images = pipe(
251
+ prompt,
252
+ num_inference_steps=args.inference_steps,
253
+ height=height,
254
+ width=width,
255
+ ).images
256
+ latency = time.time() - start
257
+ time_costs += [latency]
258
+ # print(f"No {step:3d} time cost: {latency:2f} s")
259
+ print(
260
+ f"Attention type: {attention_type}, "
261
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
262
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
263
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
264
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
265
+ )
266
+ images[0].save(f"{folder}/text2img.png")
267
+
268
+ if args.task_name in ["img2img", "all"]:
269
+ pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
270
+ pipe_img2img.set_progress_bar_config(disable=False)
271
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
272
+ init_image = load_image(img_url).resize((width, height))
273
+ prompt = "A fantasy landscape, trending on artstation"
274
+ time_costs = []
275
+ # warmup
276
+ pipe_img2img(
277
+ prompt,
278
+ image=init_image,
279
+ num_inference_steps=20,
280
+ height=height,
281
+ width=width,
282
+ strength=args.strength,
283
+ )
284
+ print("==> Test img2img performance.")
285
+ for step in trange(args.benchmark_steps):
286
+ start = time.time()
287
+ paddle.seed(seed)
288
+ images = pipe_img2img(
289
+ prompt,
290
+ image=init_image,
291
+ num_inference_steps=args.inference_steps,
292
+ height=height,
293
+ width=width,
294
+ strength=args.strength,
295
+ ).images
296
+ latency = time.time() - start
297
+ time_costs += [latency]
298
+ # print(f"No {step:3d} time cost: {latency:2f} s")
299
+ print(
300
+ f"Attention type: {attention_type}, "
301
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
302
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
303
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
304
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
305
+ )
306
+ images[0].save(f"{folder}/img2img.png")
307
+
308
+ if args.task_name in ["inpaint_legacy", "all"]:
309
+ pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
310
+ pipe_inpaint.set_progress_bar_config(disable=False)
311
+ img_url = (
312
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
313
+ )
314
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
315
+ init_image = load_image(img_url).resize((width, height))
316
+ mask_image = load_image(mask_url).resize((width, height))
317
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
318
+ time_costs = []
319
+ task_name = "inpaint_legacy"
320
+ pipe_inpaint(
321
+ prompt,
322
+ image=init_image,
323
+ mask_image=mask_image,
324
+ num_inference_steps=20,
325
+ height=height,
326
+ width=width,
327
+ strength=args.strength,
328
+ )
329
+ print(f"==> Test {task_name} performance.")
330
+ for step in trange(args.benchmark_steps):
331
+ start = time.time()
332
+ paddle.seed(seed)
333
+ images = pipe_inpaint(
334
+ prompt,
335
+ image=init_image,
336
+ mask_image=mask_image,
337
+ num_inference_steps=args.inference_steps,
338
+ height=height,
339
+ width=width,
340
+ strength=args.strength,
341
+ ).images
342
+ latency = time.time() - start
343
+ time_costs += [latency]
344
+ # print(f"No {step:3d} time cost: {latency:2f} s")
345
+ print(
346
+ f"Attention type: {attention_type}, "
347
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
348
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
349
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
350
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
351
+ )
352
+ images[0].save(f"{folder}/{task_name}.png")
353
+
354
+
355
+ if __name__ == "__main__":
356
+ args = parse_arguments()
357
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py ADDED
@@ -0,0 +1,417 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ import torch
20
+
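+ # Stash torch's built-in scaled_dot_product_attention and remove it, so that the "raw"
+ # attention benchmark cannot silently fall back to SDPA; it is restored below when
+ # attention_type == "sdp".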
21
+ torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
22
+ delattr(torch.nn.functional, "scaled_dot_product_attention")
23
+
24
+ import cv2
25
+ import numpy as np
26
+ from diffusers import (
27
+ DDIMScheduler,
28
+ DDPMScheduler,
29
+ DEISMultistepScheduler,
30
+ DPMSolverMultistepScheduler,
31
+ DPMSolverSinglestepScheduler,
32
+ EulerAncestralDiscreteScheduler,
33
+ EulerDiscreteScheduler,
34
+ HeunDiscreteScheduler,
35
+ KDPM2AncestralDiscreteScheduler,
36
+ KDPM2DiscreteScheduler,
37
+ LMSDiscreteScheduler,
38
+ PNDMScheduler,
39
+ StableDiffusionImg2ImgPipeline,
40
+ StableDiffusionInpaintPipeline,
41
+ StableDiffusionPipeline,
42
+ UniPCMultistepScheduler,
43
+ )
44
+ from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
45
+ from diffusers.utils import load_image
46
+ from PIL import Image
47
+ from tqdm.auto import trange
48
+
49
+
50
+ def get_canny_image(image, args):
51
+ if isinstance(image, Image.Image):
52
+ image = np.array(image)
53
+ image = cv2.Canny(image, args.low_threshold, args.high_threshold)
54
+ image = image[:, :, None]
55
+ image = np.concatenate([image, image, image], axis=2)
56
+ canny_image = Image.fromarray(image)
57
+ return canny_image
58
+
59
+
60
+ def strtobool(v):
61
+ if isinstance(v, bool):
62
+ return v
63
+ if v.lower() in ("yes", "true", "t", "y", "1"):
64
+ return True
65
+ elif v.lower() in ("no", "false", "f", "n", "0"):
66
+ return False
67
+ else:
68
+ raise ValueError(
69
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
70
+ )
71
+
72
+
73
+ def change_scheduler(self, scheduler_type="ddim"):
74
+ self.orginal_scheduler_config = self.scheduler.config
75
+ scheduler_type = scheduler_type.lower()
76
+ if scheduler_type == "pndm":
77
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
78
+ elif scheduler_type == "lms":
79
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
80
+ elif scheduler_type == "heun":
81
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
82
+ elif scheduler_type == "euler":
83
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
84
+ elif scheduler_type == "euler-ancestral":
85
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
86
+ elif scheduler_type == "dpm-multi":
87
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
88
+ elif scheduler_type == "dpm-single":
89
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
90
+ elif scheduler_type == "kdpm2-ancestral":
91
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
92
+ elif scheduler_type == "kdpm2":
93
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
94
+ elif scheduler_type == "unipc-multi":
95
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
96
+ elif scheduler_type == "ddim":
97
+ scheduler = DDIMScheduler.from_config(
98
+ self.orginal_scheduler_config,
99
+ steps_offset=1,
100
+ clip_sample=False,
101
+ set_alpha_to_one=False,
102
+ )
103
+ elif scheduler_type == "ddpm":
104
+ scheduler = DDPMScheduler.from_config(
105
+ self.orginal_scheduler_config,
106
+ )
107
+ elif scheduler_type == "deis-multi":
108
+ scheduler = DEISMultistepScheduler.from_config(
109
+ self.orginal_scheduler_config,
110
+ )
111
+ else:
112
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
113
+ return scheduler
114
+
115
+
116
+ def parse_arguments():
117
+
118
+ parser = argparse.ArgumentParser()
119
+ parser.add_argument(
120
+ "--pretrained_model_name_or_path",
121
+ type=str,
122
+ default="runwayml/stable-diffusion-v1-5",
123
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
124
+ )
125
+ parser.add_argument(
126
+ "--inference_steps",
127
+ type=int,
128
+ default=50,
129
+ help="The number of unet inference steps.",
130
+ )
131
+ parser.add_argument(
132
+ "--benchmark_steps",
133
+ type=int,
134
+ default=10,
135
+ help="The number of performance benchmark steps.",
136
+ )
137
+ parser.add_argument(
138
+ "--task_name",
139
+ type=str,
140
+ default="all",
141
+ choices=[
142
+ "text2img",
143
+ "img2img",
144
+ "inpaint_legacy",
145
+ "all",
146
+ ],
147
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
148
+ )
149
+ parser.add_argument(
150
+ "--parse_prompt_type",
151
+ type=str,
152
+ default="raw",
153
+ choices=[
154
+ "raw",
155
+ "lpw",
156
+ ],
157
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
158
+ )
159
+ parser.add_argument(
160
+ "--channels_last",
161
+ type=strtobool,
162
+ default=False,
163
+ help="Whether to use channels_last",
164
+ )
165
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
166
+ parser.add_argument("--tf32", type=strtobool, default=True, help="tf32")
167
+ parser.add_argument("--compile", type=strtobool, default=False, help="compile")
168
+ parser.add_argument(
169
+ "--attention_type",
170
+ type=str,
171
+ default="sdp",
172
+ choices=[
173
+ "raw",
174
+ "sdp",
175
+ ],
176
+ help="attention_type.",
177
+ )
178
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
179
+ parser.add_argument(
180
+ "--scheduler",
181
+ type=str,
182
+ default="euler-ancestral",
183
+ choices=[
184
+ "pndm",
185
+ "lms",
186
+ "euler",
187
+ "euler-ancestral",
188
+ "dpm-multi",
189
+ "dpm-single",
190
+ "unipc-multi",
191
+ "ddim",
192
+ "ddpm",
193
+ "deis-multi",
194
+ "heun",
195
+ "kdpm2-ancestral",
196
+ "kdpm2",
197
+ ],
198
+ help="The scheduler type of stable diffusion.",
199
+ )
200
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
201
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
202
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
203
+ return parser.parse_args()
204
+
205
+
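+ # Standalone re-implementations of diffusers' attn_processors / set_attn_processor helpers:
+ # they recursively collect and swap the attention processors of a module's children.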
206
+ def attn_processors(self):
207
+ processors = {}
208
+
209
+ def fn_recursive_add_processors(name: str, module, processors):
210
+ if hasattr(module, "set_processor"):
211
+ processors[f"{name}.processor"] = module.processor
212
+
213
+ for sub_name, child in module.named_children():
214
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
215
+
216
+ return processors
217
+
218
+ for name, module in self.named_children():
219
+ fn_recursive_add_processors(name, module, processors)
220
+
221
+ return processors
222
+
223
+
224
+ def set_attn_processor(self, processor):
225
+ count = len(attn_processors(self).keys())
226
+
227
+ if isinstance(processor, dict) and len(processor) != count:
228
+ raise ValueError(
229
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
230
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
231
+ )
232
+
233
+ def fn_recursive_attn_processor(name: str, module, processor):
234
+ if hasattr(module, "set_processor"):
235
+ if not isinstance(processor, dict):
236
+ module.set_processor(processor)
237
+ else:
238
+ module.set_processor(processor.pop(f"{name}.processor"))
239
+
240
+ for sub_name, child in module.named_children():
241
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
242
+
243
+ for name, module in self.named_children():
244
+ fn_recursive_attn_processor(name, module, processor)
245
+
246
+
247
+ def main(args):
248
+ if args.tf32:
249
+ torch.backends.cuda.matmul.allow_tf32 = True
250
+ else:
251
+ torch.backends.cuda.matmul.allow_tf32 = False
252
+
253
+ seed = 1024
254
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
255
+ pipe = StableDiffusionPipeline.from_pretrained(
256
+ args.pretrained_model_name_or_path,
257
+ safety_checker=None,
258
+ feature_extractor=None,
259
+ requires_safety_checker=False,
260
+ torch_dtype=torch_dtype,
261
+ )
262
+ scheduler = change_scheduler(pipe, args.scheduler)
263
+ pipe.scheduler = scheduler
264
+ if args.device_id >= 0:
265
+ pipe.to(f"cuda:{args.device_id}")
266
+
267
+ if args.attention_type == "all":
268
+ args.attention_type = ["raw", "sdp"]
269
+ else:
270
+ args.attention_type = [args.attention_type]
271
+
272
+ for attention_type in args.attention_type:
273
+ attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
274
+ if attention_type == "sdp":
275
+ torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
276
+ set_attn_processor(pipe.unet, attn_prrocessor_cls())
277
+ set_attn_processor(pipe.vae, attn_prrocessor_cls())
278
+
279
+ if args.channels_last:
280
+ pipe.unet.to(memory_format=torch.channels_last)
281
+
282
+ if args.compile:
283
+ print("Run torch compile")
284
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
285
+
286
+ width = args.width
287
+ height = args.height
288
+ pipe.set_progress_bar_config(disable=False)
289
+
290
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
291
+ os.makedirs(folder, exist_ok=True)
292
+ if args.task_name in ["text2img", "all"]:
293
+ init_image = load_image(
294
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
295
+ )
296
+ # text2img
297
+ prompt = "bird"
298
+ time_costs = []
299
+ # warmup
300
+ pipe(
301
+ prompt,
302
+ num_inference_steps=10,
303
+ height=height,
304
+ width=width,
305
+ )
306
+ print("==> Test text2img performance.")
307
+ for step in trange(args.benchmark_steps):
308
+ start = time.time()
309
+ torch.cuda.manual_seed(seed)
310
+ images = pipe(
311
+ prompt,
312
+ num_inference_steps=args.inference_steps,
313
+ height=height,
314
+ width=width,
315
+ ).images
316
+ latency = time.time() - start
317
+ time_costs += [latency]
318
+ # print(f"No {step:3d} time cost: {latency:2f} s")
319
+ print(
320
+ f"Attention type: {attention_type}, "
321
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
322
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
323
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
324
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
325
+ )
326
+ images[0].save(f"{folder}/text2img.png")
327
+
328
+ if args.task_name in ["img2img", "all"]:
329
+ pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
330
+ pipe_img2img.set_progress_bar_config(disable=False)
331
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
332
+ init_image = load_image(img_url).resize((width, height))
333
+ prompt = "A fantasy landscape, trending on artstation"
334
+ time_costs = []
335
+ # warmup
336
+ pipe_img2img(
337
+ prompt,
338
+ image=init_image,
339
+ num_inference_steps=20,
340
+ height=height,
341
+ width=width,
342
+ strength=args.strength,
343
+ )
344
+ print("==> Test img2img performance.")
345
+ for step in trange(args.benchmark_steps):
346
+ start = time.time()
347
+ torch.cuda.manual_seed(seed)
348
+ images = pipe_img2img(
349
+ prompt,
350
+ image=init_image,
351
+ num_inference_steps=args.inference_steps,
352
+ height=height,
353
+ width=width,
354
+ strength=args.strength,
355
+ ).images
356
+ latency = time.time() - start
357
+ time_costs += [latency]
358
+ # print(f"No {step:3d} time cost: {latency:2f} s")
359
+ print(
360
+ f"Attention type: {attention_type}, "
361
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
362
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
363
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
364
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
365
+ )
366
+ images[0].save(f"{folder}/img2img.png")
367
+
368
+ if args.task_name in ["inpaint_legacy", "all"]:
369
+ pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
370
+ pipe_inpaint.set_progress_bar_config(disable=False)
371
+ img_url = (
372
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
373
+ )
374
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
375
+ init_image = load_image(img_url).resize((width, height))
376
+ mask_image = load_image(mask_url).resize((width, height))
377
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
378
+ time_costs = []
379
+ task_name = "inpaint_legacy"
380
+ pipe_inpaint(
381
+ prompt,
382
+ image=init_image,
383
+ mask_image=mask_image,
384
+ num_inference_steps=20,
385
+ height=height,
386
+ width=width,
387
+ strength=args.strength,
388
+ )
389
+ print(f"==> Test {task_name} performance.")
390
+ for step in trange(args.benchmark_steps):
391
+ start = time.time()
392
+ torch.cuda.manual_seed(seed)
393
+ images = pipe_inpaint(
394
+ prompt,
395
+ image=init_image,
396
+ mask_image=mask_image,
397
+ num_inference_steps=args.inference_steps,
398
+ height=height,
399
+ width=width,
400
+ strength=args.strength,
401
+ ).images
402
+ latency = time.time() - start
403
+ time_costs += [latency]
404
+ # print(f"No {step:3d} time cost: {latency:2f} s")
405
+ print(
406
+ f"Attention type: {attention_type}, "
407
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
408
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
409
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
410
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
411
+ )
412
+ images[0].save(f"{folder}/{task_name}.png")
413
+
414
+
415
+ if __name__ == "__main__":
416
+ args = parse_arguments()
417
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # Stable Diffusion 3 High-Performance Inference
2
+
3
+ - Paddle Inference provides a high-performance inference implementation of the Stable Diffusion 3 model, improving inference performance by 70%+.
4
+ Environment setup:
5
+ ```shell
6
+ # Install triton and adapt it for paddle
7
+ python -m pip install triton
8
+ python -m pip install git+https://github.com/zhoutianzi666/UseTritonInPaddle.git
9
+ python -c "import use_triton_in_paddle; use_triton_in_paddle.make_triton_compatible_with_paddle()"
10
+
11
+ # Install the develop version of paddle; choose the build that matches your CUDA version (CUDA 12.3 is used here)
12
+ python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/
13
+
14
+ # Install the paddlemix package to use the custom operators integrated in it.
15
+ python -m pip install paddlemix
16
+
17
+ # Specify the path to libCutlassGemmEpilogue.so
19
+ # For details, see https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/README.md
19
+ export LD_LIBRARY_PATH=/your_dir/Paddle/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/build:$LD_LIBRARY_PATH
20
+ - Note: this step lets static-graph inference use the Cutlass fused operators for extra speed, but it is optional.
21
+ If you do not use Cutlass, set `exp_enable_use_cutlass` in `./text_to_image_generation-stable_diffusion_3.py` to False.
22
+ -
23
+ ```
24
+
25
+ High-performance inference command:
26
+ ```shell
27
+ # Run FP16 inference
28
+ python text_to_image_generation-stable_diffusion_3.py --dtype float16 --height 512 --width 512 \
29
+ --num-inference-steps 50 --inference_optimize 1 \
30
+ --benchmark 1
31
+ ```
32
+ Note: --inference_optimize 1 enables inference optimization, and --benchmark 1 enables the performance benchmark.
33
+
34
+
35
+ - Performance measured on an NVIDIA A100-SXM4-40GB:
36
+
37
+ | Paddle Inference | PyTorch | Paddle dygraph |
38
+ | --------------- | ------------ | ------------ |
39
+ | 1.2 s | 1.78 s | 4.202 s |
40
+
41
+
42
+ ## Multi-GPU inference for the Paddle Stable Diffusion 3 model
43
+ ### How Data Parallel works
44
+ - In SD3, a single prompt with CFG requires generating the unconditional and text-guided branches together, so the MM-DiT blocks receive an input with batch_size=2.
45
+ In the multi-GPU scheme we therefore split this batch of 2 across two cards, so each card carries only half of the floating-point compute.
46
+ After the computation finishes, the results from both cards are gathered back together, and the output is identical to the single-card result (a minimal sketch of this idea follows below).
47
+
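A minimal sketch of the data-parallel idea (not the PaddleMIX implementation): `toy_block`, the tensor shapes, and the variable names are made up for illustration, and in practice the two halves would run on separate GPUs and be gathered with a collective op rather than computed in one process.

```python
import paddle

paddle.seed(0)
toy_block = paddle.nn.Linear(64, 64)        # illustrative stand-in for the MM-DiT blocks
latents = paddle.randn([2, 16, 64])         # CFG batch: [unconditional, text-guided]

full = toy_block(latents)                   # single-card computation

half_0 = toy_block(latents[0:1])            # would run on GPU 0
half_1 = toy_block(latents[1:2])            # would run on GPU 1
gathered = paddle.concat([half_0, half_1], axis=0)   # gather the per-card results

print(bool(paddle.allclose(full, gathered)))  # True: identical to the single-card result
```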
48
+ ### How Model Parallel works
49
+ - In SD3, the Linear and Attention layers contain a large number of GEMMs (General Matrix Multiply); when generating high-resolution images, both the GEMM compute and the size of the pretrained weights grow linearly.
50
+ We therefore split these GEMMs across two cards, so each card holds half of the compute and half of the weights, which lowers both the floating-point load and the GPU memory usage per card (see the sketch below).
51
+
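A minimal sketch of the column-wise GEMM split, again with made-up sizes; it only illustrates that concatenating the partial results reproduces the full GEMM, whereas the actual pipeline is assumed to shard the pretrained weights across cards rather than slicing a full copy at run time.

```python
import paddle

paddle.seed(0)
x = paddle.randn([2, 64])                   # activations entering a Linear layer
w = paddle.randn([64, 128])                 # full weight matrix

full = paddle.matmul(x, w)                  # single-card GEMM

w0, w1 = w[:, :64], w[:, 64:]               # each card stores half of the output columns
partial_0 = paddle.matmul(x, w0)            # would run on GPU 0
partial_1 = paddle.matmul(x, w1)            # would run on GPU 1
merged = paddle.concat([partial_0, partial_1], axis=-1)  # concatenate the column blocks

print(bool(paddle.allclose(full, merged)))  # True: the full GEMM is reproduced
```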
52
+ ### How to enable multi-GPU inference
53
+ - Paddle Inference provides multi-GPU inference for the SD3 model: set `mp_size 2` to enable Model Parallel and `dp_size 2` to enable Data Parallel.
54
+ Use `python -m paddle.distributed.launch --gpus "0,1,2,3"` to choose which cards run the inference, where `--gpus "0,1,2,3"` lists the GPU ids to enable.
55
+ If only two cards are needed, specify just those two, e.g. `python -m paddle.distributed.launch --gpus "0,1"`, and also set the parallel method and degree, e.g. `mp_size 2` or `dp_size 2`.
56
+
57
+ - Note that `mp_size` must not exceed the input batch_size, and the sum of `mp_size` and `dp_size` must not exceed the total number of cards on the machine.
58
+ - High-performance multi-GPU inference command:
59
+ ```shell
60
+ # Run multi-GPU inference
61
+ python -m paddle.distributed.launch --gpus "0,1,2,3" text_to_image_generation-stable_diffusion_3.py \
62
+ --dtype float16 \
63
+ --height 1024 \
64
+ --width 1024 \
65
+ --num-inference-steps 20 \
66
+ --inference_optimize 1 \
67
+ --mp_size 2 \
68
+ --dp_size 2 \
69
+ --benchmark 1
70
+ ```
71
+ Note: --inference_optimize 1 enables inference optimization, and --benchmark 1 enables the performance benchmark.
72
+
73
+ ## Performance measured on an NVIDIA A800-SXM4-80GB:
74
+
75
+ | Paddle mp_size=2 & dp_size=2 | Paddle mp_size=2 | Paddle dp_size=2 | Paddle Single Card | Paddle dygraph |
76
+ | ---------------------------- | ------------------- | ---------------- | ------------------ | ------------ |
77
+ | 0.99s | 1.581 s | 1.319 s | 2.376 s | 3.2 s |
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py ADDED
@@ -0,0 +1,264 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+ import warnings
19
+
20
+ import cv2
21
+ import numpy as np
22
+ import paddle
23
+ from PIL import Image
24
+ from tqdm.auto import trange
25
+
26
+ from ppdiffusers import (
27
+ FlowMatchEulerDiscreteScheduler,
28
+ DDIMScheduler,
29
+ DDPMScheduler,
30
+ DEISMultistepScheduler,
31
+ DPMSolverMultistepScheduler,
32
+ DPMSolverSinglestepScheduler,
33
+ EulerAncestralDiscreteScheduler,
34
+ EulerDiscreteScheduler,
35
+ HeunDiscreteScheduler,
36
+ KDPM2AncestralDiscreteScheduler,
37
+ KDPM2DiscreteScheduler,
38
+ LMSDiscreteScheduler,
39
+ PNDMScheduler,
40
+ StableDiffusion3Pipeline,
41
+ UniPCMultistepScheduler,
42
+ )
43
+ from ppdiffusers.utils import load_image
44
+
45
+
46
+
47
+ def strtobool(v):
48
+ if isinstance(v, bool):
49
+ return v
50
+ if v.lower() in ("yes", "true", "t", "y", "1"):
51
+ return True
52
+ elif v.lower() in ("no", "false", "f", "n", "0"):
53
+ return False
54
+ else:
55
+ raise ValueError(
56
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
57
+ )
58
+
59
+
60
+ def change_scheduler(self, scheduler_type="ddim"):
61
+ self.orginal_scheduler_config = self.scheduler.config
62
+ scheduler_type = scheduler_type.lower()
63
+ if scheduler_type == "flow":
64
+ scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
65
+ elif scheduler_type == "pndm":
66
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
67
+ elif scheduler_type == "lms":
68
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
69
+ elif scheduler_type == "heun":
70
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
71
+ elif scheduler_type == "euler":
72
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
73
+ elif scheduler_type == "euler-ancestral":
74
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
75
+ elif scheduler_type == "dpm-multi":
76
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
77
+ elif scheduler_type == "dpm-single":
78
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
79
+ elif scheduler_type == "kdpm2-ancestral":
80
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
81
+ elif scheduler_type == "kdpm2":
82
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
83
+ elif scheduler_type == "unipc-multi":
84
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
85
+ elif scheduler_type == "ddim":
86
+ scheduler = DDIMScheduler.from_config(
87
+ self.orginal_scheduler_config,
88
+ steps_offset=1,
89
+ clip_sample=False,
90
+ set_alpha_to_one=False,
91
+ )
92
+ elif scheduler_type == "ddpm":
93
+ scheduler = DDPMScheduler.from_config(
94
+ self.orginal_scheduler_config,
95
+ )
96
+ elif scheduler_type == "deis-multi":
97
+ scheduler = DEISMultistepScheduler.from_config(
98
+ self.orginal_scheduler_config,
99
+ )
100
+ else:
101
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
102
+ return scheduler
103
+
104
+
105
+ def parse_arguments():
106
+
107
+ parser = argparse.ArgumentParser()
108
+ parser.add_argument(
109
+ "--pretrained_model_name_or_path",
110
+ type=str,
111
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
112
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
113
+ )
114
+ parser.add_argument(
115
+ "--inference_steps",
116
+ type=int,
117
+ default=50,
118
+ help="The number of unet inference steps.",
119
+ )
120
+ parser.add_argument(
121
+ "--benchmark_steps",
122
+ type=int,
123
+ default=10,
124
+ help="The number of performance benchmark steps.",
125
+ )
126
+ parser.add_argument(
127
+ "--task_name",
128
+ type=str,
129
+ default="all",
130
+ choices=[
131
+ "text2img",
132
+ "img2img",
133
+ "inpaint_legacy",
134
+ "all",
135
+ ],
136
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
137
+ )
138
+ parser.add_argument(
139
+ "--parse_prompt_type",
140
+ type=str,
141
+ default="raw",
142
+ choices=[
143
+ "raw",
144
+ "lpw",
145
+ ],
146
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
147
+ )
148
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
149
+ parser.add_argument(
150
+ "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
151
+ )
152
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
153
+ parser.add_argument(
154
+ "--scheduler",
155
+ type=str,
156
+ default="euler-ancestral",
157
+ choices=[
158
+ "flow",
159
+ "pndm",
160
+ "lms",
161
+ "euler",
162
+ "euler-ancestral",
163
+ "dpm-multi",
164
+ "dpm-single",
165
+ "unipc-multi",
166
+ "ddim",
167
+ "ddpm",
168
+ "deis-multi",
169
+ "heun",
170
+ "kdpm2-ancestral",
171
+ "kdpm2",
172
+ ],
173
+ help="The scheduler type of stable diffusion.",
174
+ )
175
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
176
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
177
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
178
+ return parser.parse_args()
179
+
180
+
181
+ def main(args):
182
+
183
+ seed = 1024
184
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
185
+ pipe = StableDiffusion3Pipeline.from_pretrained(
186
+ args.pretrained_model_name_or_path,
187
+ safety_checker=None,
188
+ feature_extractor=None,
189
+ requires_safety_checker=False,
190
+ paddle_dtype=paddle_dtype,
191
+ )
192
+ scheduler = change_scheduler(pipe, args.scheduler)
193
+ pipe.scheduler = scheduler
194
+
195
+ if args.attention_type == "all":
196
+ args.attention_type = ["raw", "cutlass", "flash"]
197
+ else:
198
+ args.attention_type = [args.attention_type]
199
+
200
+ for attention_type in args.attention_type:
201
+ if attention_type == "raw":
202
+ pipe.disable_xformers_memory_efficient_attention()
203
+ else:
204
+ try:
205
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
206
+ except Exception as e:
207
+ if attention_type == "flash":
208
+ warnings.warn(
209
+ "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc."
210
+ )
211
+ continue
212
+ else:
213
+ raise ValueError(e)
214
+
215
+ if not args.use_fp16 and attention_type == "flash":
216
+ print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!")
217
+ continue
218
+
219
+ width = args.width
220
+ height = args.height
221
+ pipe.set_progress_bar_config(disable=False)
222
+
223
+ folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
224
+ os.makedirs(folder, exist_ok=True)
225
+ if args.task_name in ["text2img", "all"]:
226
+ init_image = load_image(
227
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
228
+ )
229
+ # text2img
230
+ prompt = "bird"
231
+ time_costs = []
232
+ # warmup
233
+ pipe(
234
+ prompt,
235
+ num_inference_steps=10,
236
+ height=height,
237
+ width=width,
238
+ )
239
+ print("==> Test text2img performance.")
240
+ for step in trange(args.benchmark_steps):
241
+ start = time.time()
242
+ paddle.seed(seed)
243
+ images = pipe(
244
+ prompt,
245
+ num_inference_steps=args.inference_steps,
246
+ height=height,
247
+ width=width,
248
+ ).images
249
+ latency = time.time() - start
250
+ time_costs += [latency]
251
+ # print(f"No {step:3d} time cost: {latency:2f} s")
252
+ print(
253
+ f"Attention type: {attention_type}, "
254
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
255
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
256
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
257
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
258
+ )
259
+ images[0].save(f"{folder}/text2img.png")
260
+
261
+
262
+ if __name__ == "__main__":
263
+ args = parse_arguments()
264
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py ADDED
@@ -0,0 +1,325 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ import torch
20
+
21
+ # torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
22
+ # delattr(torch.nn.functional, "scaled_dot_product_attention")
23
+
24
+ import cv2
25
+ import numpy as np
26
+ from diffusers import (
27
+ FlowMatchEulerDiscreteScheduler,
28
+ DDIMScheduler,
29
+ DDPMScheduler,
30
+ DEISMultistepScheduler,
31
+ DPMSolverMultistepScheduler,
32
+ DPMSolverSinglestepScheduler,
33
+ EulerAncestralDiscreteScheduler,
34
+ EulerDiscreteScheduler,
35
+ HeunDiscreteScheduler,
36
+ KDPM2AncestralDiscreteScheduler,
37
+ KDPM2DiscreteScheduler,
38
+ LMSDiscreteScheduler,
39
+ PNDMScheduler,
40
+ StableDiffusion3Pipeline,
41
+ UniPCMultistepScheduler,
42
+ )
43
+ from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
44
+ from diffusers.utils import load_image
45
+ from PIL import Image
46
+ from tqdm.auto import trange
47
+
48
+
49
+
50
+ def strtobool(v):
51
+ if isinstance(v, bool):
52
+ return v
53
+ if v.lower() in ("yes", "true", "t", "y", "1"):
54
+ return True
55
+ elif v.lower() in ("no", "false", "f", "n", "0"):
56
+ return False
57
+ else:
58
+ raise ValueError(
59
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
60
+ )
61
+
62
+
63
+ def change_scheduler(self, scheduler_type="ddim"):
64
+ self.orginal_scheduler_config = self.scheduler.config
65
+ scheduler_type = scheduler_type.lower()
66
+ if scheduler_type == "flow":
67
+ scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
68
+ elif scheduler_type == "pndm":
69
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
70
+ elif scheduler_type == "lms":
71
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
72
+ elif scheduler_type == "heun":
73
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
74
+ elif scheduler_type == "euler":
75
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
76
+ elif scheduler_type == "euler-ancestral":
77
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
78
+ elif scheduler_type == "dpm-multi":
79
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
80
+ elif scheduler_type == "dpm-single":
81
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
82
+ elif scheduler_type == "kdpm2-ancestral":
83
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
84
+ elif scheduler_type == "kdpm2":
85
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
86
+ elif scheduler_type == "unipc-multi":
87
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
88
+ elif scheduler_type == "ddim":
89
+ scheduler = DDIMScheduler.from_config(
90
+ self.orginal_scheduler_config,
91
+ steps_offset=1,
92
+ clip_sample=False,
93
+ set_alpha_to_one=False,
94
+ )
95
+ elif scheduler_type == "ddpm":
96
+ scheduler = DDPMScheduler.from_config(
97
+ self.orginal_scheduler_config,
98
+ )
99
+ elif scheduler_type == "deis-multi":
100
+ scheduler = DEISMultistepScheduler.from_config(
101
+ self.orginal_scheduler_config,
102
+ )
103
+ else:
104
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
105
+ return scheduler
106
+
107
+
108
+ def parse_arguments():
109
+
110
+ parser = argparse.ArgumentParser()
111
+ parser.add_argument(
112
+ "--pretrained_model_name_or_path",
113
+ type=str,
114
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
115
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
116
+ )
117
+ parser.add_argument(
118
+ "--inference_steps",
119
+ type=int,
120
+ default=50,
121
+ help="The number of unet inference steps.",
122
+ )
123
+ parser.add_argument(
124
+ "--benchmark_steps",
125
+ type=int,
126
+ default=10,
127
+ help="The number of performance benchmark steps.",
128
+ )
129
+ parser.add_argument(
130
+ "--task_name",
131
+ type=str,
132
+ default="all",
133
+ choices=[
134
+ "text2img",
135
+ "img2img",
136
+ "inpaint_legacy",
137
+ "all",
138
+ ],
139
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
140
+ )
141
+ parser.add_argument(
142
+ "--parse_prompt_type",
143
+ type=str,
144
+ default="raw",
145
+ choices=[
146
+ "raw",
147
+ "lpw",
148
+ ],
149
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
150
+ )
151
+ parser.add_argument(
152
+ "--channels_last",
153
+ type=strtobool,
154
+ default=False,
155
+ help="Wheter to use channels_last",
156
+ )
157
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
158
+ parser.add_argument("--tf32", type=strtobool, default=True, help="tf32")
159
+ parser.add_argument("--compile", type=strtobool, default=False, help="compile")
160
+ parser.add_argument(
161
+ "--attention_type",
162
+ type=str,
163
+ default="sdp",
164
+ choices=[
165
+ "raw",
166
+ "sdp",
167
+ ],
168
+ help="attention_type.",
169
+ )
170
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
171
+ parser.add_argument(
172
+ "--scheduler",
173
+ type=str,
174
+ default="euler-ancestral",
175
+ choices=[
176
+ "flow",
177
+ "pndm",
178
+ "lms",
179
+ "euler",
180
+ "euler-ancestral",
181
+ "dpm-multi",
182
+ "dpm-single",
183
+ "unipc-multi",
184
+ "ddim",
185
+ "ddpm",
186
+ "deis-multi",
187
+ "heun",
188
+ "kdpm2-ancestral",
189
+ "kdpm2",
190
+ ],
191
+ help="The scheduler type of stable diffusion.",
192
+ )
193
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
194
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
195
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
196
+ return parser.parse_args()
197
+
198
+
199
+ def attn_processors(self):
200
+ processors = {}
201
+
202
+ def fn_recursive_add_processors(name: str, module, processors):
203
+ if hasattr(module, "set_processor"):
204
+ processors[f"{name}.processor"] = module.processor
205
+
206
+ for sub_name, child in module.named_children():
207
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
208
+
209
+ return processors
210
+
211
+ for name, module in self.named_children():
212
+ fn_recursive_add_processors(name, module, processors)
213
+
214
+ return processors
215
+
216
+
217
+ def set_attn_processor(self, processor):
218
+ count = len(attn_processors(self).keys())
219
+
220
+ if isinstance(processor, dict) and len(processor) != count:
221
+ raise ValueError(
222
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
223
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
224
+ )
225
+
226
+ def fn_recursive_attn_processor(name: str, module, processor):
227
+ if hasattr(module, "set_processor"):
228
+ if not isinstance(processor, dict):
229
+ module.set_processor(processor)
230
+ else:
231
+ module.set_processor(processor.pop(f"{name}.processor"))
232
+
233
+ for sub_name, child in module.named_children():
234
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
235
+
236
+ for name, module in self.named_children():
237
+ fn_recursive_attn_processor(name, module, processor)
238
+
239
+
240
+ def main(args):
241
+ if args.tf32:
242
+ torch.backends.cuda.matmul.allow_tf32 = True
243
+ else:
244
+ torch.backends.cuda.matmul.allow_tf32 = False
245
+
246
+ seed = 1024
247
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
248
+ pipe = StableDiffusion3Pipeline.from_pretrained(
249
+ args.pretrained_model_name_or_path,
250
+ safety_checker=None,
251
+ feature_extractor=None,
252
+ requires_safety_checker=False,
253
+ torch_dtype=torch_dtype,
254
+ )
255
+ scheduler = change_scheduler(pipe, args.scheduler)
256
+ pipe.scheduler = scheduler
257
+ if args.device_id >= 0:
258
+ pipe.to(f"cuda:{args.device_id}")
259
+
260
+ if args.attention_type == "all":
261
+ args.attention_type = ["raw", "sdp"]
262
+ else:
263
+ args.attention_type = [args.attention_type]
264
+
265
+ for attention_type in args.attention_type:
266
+ # attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
267
+ # if attention_type == "sdp":
268
+ # torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
269
+ # set_attn_processor(pipe.transformer, attn_prrocessor_cls())
270
+ # set_attn_processor(pipe.vae, attn_prrocessor_cls())
271
+
272
+ # if args.channels_last:
273
+ # pipe.transformer.to(memory_format=torch.channels_last)
274
+
275
+ # if args.compile:
276
+ # print("Run torch compile")
277
+ # pipe.unet = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
278
+
279
+ width = args.width
280
+ height = args.height
281
+ pipe.set_progress_bar_config(disable=False)
282
+
283
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
284
+ os.makedirs(folder, exist_ok=True)
285
+ if args.task_name in ["text2img", "all"]:
286
+ init_image = load_image(
287
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
288
+ )
289
+ # text2img
290
+ prompt = "bird"
291
+ time_costs = []
292
+ # warmup
293
+ pipe(
294
+ prompt,
295
+ num_inference_steps=10,
296
+ height=height,
297
+ width=width,
298
+ )
299
+ print("==> Test text2img performance.")
300
+ for step in trange(args.benchmark_steps):
301
+ start = time.time()
302
+ torch.cuda.manual_seed(seed)
303
+ images = pipe(
304
+ prompt,
305
+ num_inference_steps=args.inference_steps,
306
+ height=height,
307
+ width=width,
308
+ ).images
309
+ latency = time.time() - start
310
+ time_costs += [latency]
311
+ # print(f"No {step:3d} time cost: {latency:2f} s")
312
+ print(
313
+ f"Attention type: {attention_type}, "
314
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
315
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
316
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
317
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
318
+ )
319
+ images[0].save(f"{folder}/text2img.png")
320
+
321
+
322
+
323
+ if __name__ == "__main__":
324
+ args = parse_arguments()
325
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # attention raw fp16
16
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
17
+
18
+ # attention cutlass fp16
19
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
20
+
21
+ # attention flash fp16
22
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
23
+
24
+
25
+ # attention raw fp32
26
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
27
+
28
+ # attention cutlass fp32
29
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
30
+
31
+ # attention flash fp32
32
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # sd3 does not support attention raw
16
+
17
+ # attention sdp
18
+ python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
19
+
20
+ # attention sdp fp32
21
+ python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import os
16
+
17
+ import paddle
18
+
19
+
20
+ def parse_args():
21
+ parser = argparse.ArgumentParser(
22
+ description=" Use PaddleMIX to accelerate the Stable Diffusion3 image generation model."
23
+ )
24
+ parser.add_argument(
25
+ "--benchmark",
26
+ type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
27
+ default=False,
28
+ help="if set to True, measure inference performance",
29
+ )
30
+ parser.add_argument(
31
+ "--inference_optimize",
32
+ type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
33
+ default=False,
34
+ help="If set to True, all optimizations except Triton are enabled.",
35
+ )
36
+
37
+ parser.add_argument("--height", type=int, default=512, help="Height of the generated image.")
38
+ parser.add_argument("--width", type=int, default=512, help="Width of the generated image.")
39
+ parser.add_argument("--num-inference-steps", type=int, default=50, help="Number of inference steps.")
40
+ parser.add_argument("--dtype", type=str, default="float32", help="Inference data types.")
41
+ parser.add_argument(
42
+ "--mp_size", type=int, default=1, help="This size refers to the degree of parallelism using model parallel."
43
+ )
44
+ parser.add_argument(
45
+ "--dp_size", type=int, default=1, help="This size refers to the degree of parallelism using data parallel."
46
+ )
47
+
48
+ return parser.parse_args()
49
+
50
+
51
+ args = parse_args()
52
+
53
+ if args.inference_optimize:
54
+ os.environ["INFERENCE_OPTIMIZE"] = "True"
55
+ os.environ["INFERENCE_OPTIMIZE_TRITON"] = "True"
56
+ os.environ["INFERENCE_MP_SIZE"] = str(args.mp_size)
57
+ os.environ["INFERENCE_DP_SIZE"] = str(args.dp_size)
58
+ if args.dtype == "float32":
59
+ inference_dtype = paddle.float32
60
+ elif args.dtype == "float16":
61
+ inference_dtype = paddle.float16
62
+
63
+
64
+ import paddle.distributed as dist
65
+ import paddle.distributed.fleet as fleet
66
+
67
+ if args.mp_size > 1 or args.dp_size > 1:
68
+ strategy = fleet.DistributedStrategy()
69
+ model_parallel_size = args.mp_size
70
+ data_parallel_size = args.dp_size
71
+ strategy.hybrid_configs = {"dp_degree": data_parallel_size, "mp_degree": model_parallel_size, "pp_degree": 1}
72
+ fleet.init(is_collective=True, strategy=strategy)
73
+ hcg = fleet.get_hybrid_communicate_group()
74
+ mp_id = hcg.get_model_parallel_rank()
75
+ dp_id = hcg.get_data_parallel_rank()
76
+ rank_id = dist.get_rank()
77
+ mp_degree = hcg.get_model_parallel_world_size()
78
+ dp_degree = hcg.get_data_parallel_world_size()
79
+ assert mp_degree == args.mp_size
80
+ assert dp_degree == args.dp_size
81
+
82
+ # this is for triton kernel cache for dynamic graph
83
+ # os.environ["TRITON_KERNEL_CACHE_DIR"] = f"./tmp/sd3_parallel/{rank_id}"
84
+
85
+ import datetime
86
+
87
+ from ppdiffusers import StableDiffusion3Pipeline
88
+
89
+ pipe = StableDiffusion3Pipeline.from_pretrained(
90
+ "stabilityai/stable-diffusion-3-medium-diffusers",
91
+ paddle_dtype=inference_dtype,
92
+ )
93
+
94
+ pipe.transformer = paddle.incubate.jit.inference(
95
+ pipe.transformer,
96
+ save_model_dir="./tmp/sd3",
97
+ enable_new_ir=True,
98
+ cache_static_model=True,
99
+ exp_enable_use_cutlass=True,
100
+ delete_pass_lists=["add_norm_fuse_pass"],
101
+ )
102
+
103
+ generator = paddle.Generator().manual_seed(42)
104
+ prompt = "A cat holding a sign that says hello world"
105
+
106
+
107
+ image = pipe(
108
+ prompt, num_inference_steps=args.num_inference_steps, width=args.width, height=args.height, generator=generator
109
+ ).images[0]
110
+
111
+ if args.benchmark:
112
+ # warmup
113
+ for i in range(3):
114
+ image = pipe(
115
+ prompt,
116
+ num_inference_steps=args.num_inference_steps,
117
+ width=args.width,
118
+ height=args.height,
119
+ generator=generator,
120
+ ).images[0]
121
+
122
+ repeat_times = 10
123
+ sumtime = 0.0
124
+ for i in range(repeat_times):
125
+ paddle.device.synchronize()
126
+ starttime = datetime.datetime.now()
127
+ image = pipe(
128
+ prompt,
129
+ num_inference_steps=args.num_inference_steps,
130
+ width=args.width,
131
+ height=args.height,
132
+ generator=generator,
133
+ ).images[0]
134
+ paddle.device.synchronize()
135
+ endtime = datetime.datetime.now()
136
+ duringtime = endtime - starttime
137
+ duringtime = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0
138
+ sumtime += duringtime
139
+ print("SD3 end to end time : ", duringtime, "ms")
140
+
141
+ print("SD3 ave end to end time : ", sumtime / repeat_times, "ms")
142
+
143
+ cuda_mem_after_used = paddle.device.cuda.max_memory_allocated() / (1024**3)
144
+ print(f"Max used CUDA memory : {cuda_mem_after_used:.3f} GiB")
145
+
146
+
147
+ rank_id = dist.get_rank()
148
+ if rank_id == 0:
149
+ image.save("text_to_image_generation-stable_diffusion_3-result.png")
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # High-Performance Deployment of Stable Diffusion XL with PaddleInfer
2
+
3
+ **Table of Contents**
4
+ * [Requirements](#requirements)
5
+ * [Quick Start](#quick-start)
6
+ * [Text-to-Image Generation](#text-to-image-generation)
7
+ * [Image-to-Image Text-Guided Generation](#image-to-image-text-guided-generation)
8
+ * [Text-Guided Image Inpainting](#text-guided-image-inpainting)
9
+
10
+ ⚡️ [PaddleInfer] is an all-scenario, flexible, and highly efficient AI inference deployment tool that gives developers multi-hardware, multi-backend deployment capabilities; switching hardware or inference-engine backends takes a single line of code. This example shows how to use PaddleInfer to deploy a Stable Diffusion XL model trained with our PPDiffusers for high-performance inference across multiple hardware platforms and inference-engine backends.
11
+
12
+ <a name="环境依赖"></a>
13
+
14
+ ## Requirements
15
+
16
+ This example uses PaddleInfer; run the following command to install the dependencies.
17
+
18
+ ```shell
19
+ python -m pip install paddlepaddle-gpu==2.6.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
20
+ ```
21
+
22
+ <a name="快速体验"></a>
23
+
24
+ ## Static Graph Model Export
25
+ ```
26
+ export USE_PPXFORMERS=False
27
+ python export_model.py --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --output_path static_model/stable-diffusion-xl-base-1.0
28
+ ```
29
+ The exported model is saved under the static_model/stable-diffusion-xl-base-1.0 directory.
30
+
31
+ ### Text-to-Image Generation
32
+ ```
33
+ python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name text2img
34
+ ```
35
+
36
+ ### Image-to-Image Text-Guided Generation
37
+ ```
38
+ python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name img2img
39
+ ```
40
+
41
+ ### Text-Guided Image Inpainting
42
+ ```
43
+ python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name inpaint
44
+ ```
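
The steps above can also be chained from a single Python script. Below is a hedged sketch, not shipped with this example, that reuses exactly the commands and flags listed in this README.

```python
import os
import subprocess

env = dict(os.environ, USE_PPXFORMERS="False")
model_dir = "static_model/stable-diffusion-xl-base-1.0"

# static graph export, as in the "Static Graph Model Export" section
subprocess.run(
    [
        "python", "export_model.py",
        "--pretrained_model_name_or_path", "stabilityai/stable-diffusion-xl-base-1.0",
        "--output_path", model_dir,
    ],
    env=env, check=True,
)

# run the three inference tasks documented above
for task in ["text2img", "img2img", "inpaint"]:
    subprocess.run(
        [
            "python", "infer.py",
            "--model_dir", model_dir,
            "--scheduler", "preconfig-euler-ancestral",
            "--backend", "paddle",
            "--device", "gpu",
            "--task_name", task,
        ],
        check=True,
    )
```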
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .value_guided_sampling import ValueGuidedRLPipeline
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+
18
+ from ...models.unet_1d import UNet1DModel
19
+ from ...pipelines import DiffusionPipeline
20
+ from ...utils.dummy_paddle_objects import DDPMScheduler
21
+ from ...utils.paddle_utils import randn_tensor
22
+
23
+
24
+ class ValueGuidedRLPipeline(DiffusionPipeline):
25
+ r"""
26
+ Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states.
27
+
28
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
29
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
30
+
31
+ Parameters:
32
+ value_function ([`UNet1DModel`]):
33
+ A specialized UNet for fine-tuning trajectories based on reward.
34
+ unet ([`UNet1DModel`]):
35
+ UNet architecture to denoise the encoded trajectories.
36
+ scheduler ([`SchedulerMixin`]):
37
+ A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
38
+ application is [`DDPMScheduler`].
39
+ env ():
40
+ An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ value_function: UNet1DModel,
46
+ unet: UNet1DModel,
47
+ scheduler: DDPMScheduler,
48
+ env,
49
+ ):
50
+ super().__init__()
51
+ self.value_function = value_function
52
+ self.unet = unet
53
+ self.scheduler = scheduler
54
+ self.env = env
55
+ self.data = env.get_dataset()
56
+ self.means = {}
57
+ for key in self.data.keys():
58
+ try:
59
+ self.means[key] = self.data[key].mean()
60
+ except Exception:
61
+ pass
62
+ self.stds = {}
63
+ for key in self.data.keys():
64
+ try:
65
+ self.stds[key] = self.data[key].std()
66
+ except Exception:
67
+ pass
68
+ self.state_dim = env.observation_space.shape[0]
69
+ self.action_dim = env.action_space.shape[0]
70
+
71
+ def normalize(self, x_in, key):
72
+ return (x_in - self.means[key]) / self.stds[key]
73
+
74
+ def de_normalize(self, x_in, key):
75
+ return x_in * self.stds[key] + self.means[key]
76
+
77
+ def to_paddle(self, x_in):
78
+ if isinstance(x_in, dict):
79
+ return {k: self.to_paddle(v) for k, v in x_in.items()}
80
+ elif paddle.is_tensor(x_in):
81
+ return x_in
82
+ return paddle.to_tensor(x_in)
83
+
84
+ def reset_x0(self, x_in, cond, act_dim):
85
+ for key, val in cond.items():
86
+ x_in[:, key, act_dim:] = val.clone()
87
+ return x_in
88
+
89
+ def run_diffusion(self, x, conditions, n_guide_steps, scale):
90
+ batch_size = x.shape[0]
91
+ y = None
92
+ for i in self.progress_bar(self.scheduler.timesteps):
93
+ # create batch of timesteps to pass into model
94
+ timesteps = paddle.full((batch_size,), i, dtype=paddle.int64)
95
+ for _ in range(n_guide_steps):
96
+ with paddle.set_grad_enabled(True):
97
+ x.stop_gradient = False
98
+
99
+ # permute to match dimension for pre-trained models
100
+ y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample
101
+ grad = paddle.autograd.grad([y.sum()], [x])[0]
102
+
103
+ posterior_variance = self.scheduler._get_variance(i)
104
+ model_std = paddle.exp(0.5 * posterior_variance)
105
+ grad = model_std * grad
106
+
107
+ grad[timesteps < 2] = 0
108
+ x = x.detach()
109
+ x = x + scale * grad
110
+ x = self.reset_x0(x, conditions, self.action_dim)
111
+
112
+ prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1])
113
+
114
+ # TODO: verify deprecation of this kwarg
115
+ x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
116
+
117
+ # apply conditions to the trajectory (set the initial state)
118
+ x = self.reset_x0(x, conditions, self.action_dim)
119
+ x = self.to_paddle(x)
120
+ return x, y
121
+
122
+ def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
123
+ # normalize the observations and create batch dimension
124
+ obs = self.normalize(obs, "observations")
125
+ obs = obs[None].repeat(batch_size, axis=0)
126
+
127
+ conditions = {0: self.to_paddle(obs)}
128
+ shape = (batch_size, planning_horizon, self.state_dim + self.action_dim)
129
+
130
+ # generate initial noise and apply our conditions (to make the trajectories start at current state)
131
+ x1 = randn_tensor(shape, dtype=self.unet.dtype)
132
+ x = self.reset_x0(x1, conditions, self.action_dim)
133
+ x = self.to_paddle(x)
134
+
135
+ # run the diffusion process
136
+ x, y = self.run_diffusion(x, conditions, n_guide_steps, scale)
137
+
138
+ # sort output trajectories by value
139
+ sorted_idx = paddle.argsort(y, 0, descending=True).squeeze()
140
+ sorted_values = x[sorted_idx]
141
+ actions = sorted_values[:, :, : self.action_dim]
142
+ actions = actions.detach().cpu().numpy()
143
+ denorm_actions = self.de_normalize(actions, key="actions")
144
+
145
+ # select the action with the highest value
146
+ if y is not None:
147
+ selected_index = 0
148
+ else:
149
+ # if we didn't run value guiding, select a random action
150
+ selected_index = np.random.randint(0, batch_size)
151
+
152
+ denorm_actions = denorm_actions[selected_index, 0]
153
+ return denorm_actions
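
A hedged usage sketch of the pipeline above: `__call__` takes a raw environment observation and returns a single de-normalized action, so it slots directly into a rollout loop. The environment id and the pre-loaded components below are placeholders, not artifacts provided by this module.

```python
import gym  # assumption: a D4RL-style environment exposing get_dataset()

from ppdiffusers import DDPMScheduler, UNet1DModel
from ppdiffusers.experimental.rl import ValueGuidedRLPipeline

env = gym.make("hopper-medium-v2")  # hypothetical environment id

# hypothetical checkpoint paths; the pipeline only needs already-instantiated components
value_function = UNet1DModel.from_pretrained("path/to/value_function")
unet = UNet1DModel.from_pretrained("path/to/diffusion_unet")
scheduler = DDPMScheduler()

pipeline = ValueGuidedRLPipeline(value_function=value_function, unet=unet, scheduler=scheduler, env=env)

obs = env.reset()
for _ in range(100):
    # plan a trajectory, guide it with the value function, and return the first action
    action = pipeline(obs, planning_horizon=32, n_guide_steps=2, scale=0.1)
    obs, reward, done, _ = env.step(action)
    if done:
        break
```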
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Tuple
16
+
17
+ import paddle
18
+
19
+ from ppdiffusers.models.animate_anyone.motion_module import zero_module
20
+ from ppdiffusers.models.animate_anyone.resnet import InflatedConv3d
21
+ from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin
22
+
23
+
24
+ class PoseGuider(ModelMixin):
25
+ def __init__(
26
+ self,
27
+ conditioning_embedding_channels: int,
28
+ conditioning_channels: int = 3,
29
+ block_out_channels: Tuple[int] = (16, 32, 64, 128),
30
+ weight_dtype=None,
31
+ ):
32
+ super().__init__()
33
+
34
+ init_contexts = []
35
+ if weight_dtype is not None:
36
+ init_contexts.append(paddle.dtype_guard(weight_dtype))
37
+
38
+ with ContextManagers(init_contexts):
39
+ self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
40
+
41
+ self.blocks = paddle.nn.LayerList(sublayers=[])
42
+
43
+ for i in range(len(block_out_channels) - 1):
44
+ channel_in = block_out_channels[i]
45
+ channel_out = block_out_channels[i + 1]
46
+ self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1))
47
+ self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
48
+
49
+ self.conv_out = zero_module(
50
+ InflatedConv3d(
51
+ block_out_channels[-1],
52
+ conditioning_embedding_channels,
53
+ kernel_size=3,
54
+ padding=1,
55
+ )
56
+ )
57
+
58
+ def forward(self, conditioning):
59
+ embedding = self.conv_in(conditioning)
60
+ embedding = paddle.nn.functional.silu(x=embedding)
61
+
62
+ for block in self.blocks:
63
+ embedding = block(embedding)
64
+ embedding = paddle.nn.functional.silu(x=embedding)
65
+
66
+ embedding = self.conv_out(embedding)
67
+
68
+ return embedding
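
A minimal sketch, assuming the package from this diff is importable, of the shapes `PoseGuider` works with: it consumes a 5D pose video `(batch, channels, frames, height, width)` and emits a feature map whose spatial size is reduced by the three stride-2 blocks. The embedding width of 320 is an example value, not a fixed default.

```python
import paddle

from ppdiffusers.models.animate_anyone.pose_guider import PoseGuider

# defaults: 3 conditioning channels, block_out_channels (16, 32, 64, 128)
guider = PoseGuider(conditioning_embedding_channels=320)

pose = paddle.randn([1, 3, 8, 64, 64])   # one clip: RGB pose maps, 8 frames, 64x64
feature = guider(pose)

# three stride-2 InflatedConv3d blocks divide H and W by 8; conv_out maps to 320 channels
print(feature.shape)                     # [1, 320, 8, 8, 8]
```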
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/resnet.py
16
+
17
+ import paddle
18
+ from einops import rearrange
19
+
20
+
21
+ class InflatedConv3d(paddle.nn.Conv2D):
22
+ def forward(self, x):
23
+ video_length = x.shape[2]
24
+ x = rearrange(x, "b c f h w -> (b f) c h w")
25
+ x = super().forward(x)
26
+
27
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
28
+
29
+ return x
30
+
31
+
32
+ class InflatedGroupNorm(paddle.nn.GroupNorm):
33
+ def forward(self, x):
34
+ video_length = x.shape[2]
35
+
36
+ x = rearrange(x, "b c f h w -> (b f) c h w")
37
+ x = super().forward(x)
38
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
39
+
40
+ return x
41
+
42
+
43
+ class Upsample3D(paddle.nn.Layer):
44
+ def __init__(
45
+ self,
46
+ channels,
47
+ use_conv=False,
48
+ use_conv_transpose=False,
49
+ out_channels=None,
50
+ name="conv",
51
+ ):
52
+ super().__init__()
53
+ self.channels = channels
54
+ self.out_channels = out_channels or channels
55
+ self.use_conv = use_conv
56
+ self.use_conv_transpose = use_conv_transpose
57
+ self.name = name
58
+
59
+ if use_conv_transpose:
60
+ raise NotImplementedError
61
+ elif use_conv:
62
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
63
+
64
+ def forward(self, hidden_states, output_size=None):
65
+ assert hidden_states.shape[1] == self.channels
66
+
67
+ if self.use_conv_transpose:
68
+ raise NotImplementedError
69
+
70
+ # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
71
+ dtype = hidden_states.dtype
72
+ if dtype == "bfloat16":
73
+ hidden_states = hidden_states.to("float32")
74
+
75
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/ppdiffusers/issues/984
76
+ if hidden_states.shape[0] >= 64:
77
+ hidden_states = hidden_states.contiguous()
78
+
79
+ if output_size is None:
80
+ hidden_states = paddle.nn.functional.interpolate(
81
+ x=hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest", data_format="NCDHW"
82
+ )
83
+ else:
84
+ hidden_states = paddle.nn.functional.interpolate(
85
+ x=hidden_states, size=output_size, mode="nearest", data_format="NCDHW"
86
+ )
87
+
88
+ # If the input is bfloat16, we cast back to bfloat16
89
+ if dtype == "bfloat16":
90
+ hidden_states = hidden_states.to(dtype)
91
+
92
+ hidden_states = self.conv(hidden_states)
93
+
94
+ return hidden_states
95
+
96
+
97
+ class Downsample3D(paddle.nn.Layer):
98
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
99
+ super().__init__()
100
+ self.channels = channels
101
+ self.out_channels = out_channels or channels
102
+ self.use_conv = use_conv
103
+ self.padding = padding
104
+ stride = 2
105
+ self.name = name
106
+
107
+ if use_conv:
108
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
109
+ else:
110
+ raise NotImplementedError
111
+
112
+ def forward(self, hidden_states):
113
+ assert hidden_states.shape[1] == self.channels
114
+ if self.use_conv and self.padding == 0:
115
+ raise NotImplementedError
116
+
117
+ assert hidden_states.shape[1] == self.channels
118
+ hidden_states = self.conv(hidden_states)
119
+
120
+ return hidden_states
121
+
122
+
123
+ class ResnetBlock3D(paddle.nn.Layer):
124
+ def __init__(
125
+ self,
126
+ *,
127
+ in_channels,
128
+ out_channels=None,
129
+ conv_shortcut=False,
130
+ dropout=0.0,
131
+ temb_channels=512,
132
+ groups=32,
133
+ groups_out=None,
134
+ pre_norm=True,
135
+ eps=1e-6,
136
+ non_linearity="swish",
137
+ time_embedding_norm="default",
138
+ output_scale_factor=1.0,
139
+ use_in_shortcut=None,
140
+ use_inflated_groupnorm=None,
141
+ ):
142
+ super().__init__()
143
+ self.pre_norm = pre_norm
144
+ self.pre_norm = True
145
+ self.in_channels = in_channels
146
+ out_channels = in_channels if out_channels is None else out_channels
147
+ self.out_channels = out_channels
148
+ self.use_conv_shortcut = conv_shortcut
149
+ self.time_embedding_norm = time_embedding_norm
150
+ self.output_scale_factor = output_scale_factor
151
+
152
+ if groups_out is None:
153
+ groups_out = groups
154
+
155
+ assert use_inflated_groupnorm is not None
156
+ if use_inflated_groupnorm:
157
+ self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps)
158
+ else:
159
+
160
+ self.norm1 = paddle.nn.GroupNorm(
161
+ num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True
162
+ )
163
+
164
+ self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
165
+
166
+ if temb_channels is not None:
167
+ if self.time_embedding_norm == "default":
168
+ time_emb_proj_out_channels = out_channels
169
+ elif self.time_embedding_norm == "scale_shift":
170
+ time_emb_proj_out_channels = out_channels * 2
171
+ else:
172
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
173
+
174
+ self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels)
175
+ else:
176
+ self.time_emb_proj = None
177
+
178
+ if use_inflated_groupnorm:
179
+ self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps)
180
+ else:
181
+ self.norm2 = paddle.nn.GroupNorm(
182
+ num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True
183
+ )
184
+ self.dropout = paddle.nn.Dropout(p=dropout)
185
+ self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
186
+
187
+ if non_linearity == "swish":
188
+ self.nonlinearity = lambda x: paddle.nn.functional.silu(x=x)
189
+ elif non_linearity == "mish":
190
+ self.nonlinearity = Mish()
191
+ elif non_linearity == "silu":
192
+ self.nonlinearity = paddle.nn.Silu()
193
+
194
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
195
+
196
+ self.conv_shortcut = None
197
+ if self.use_in_shortcut:
198
+ self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
199
+
200
+ def forward(self, input_tensor, temb):
201
+ hidden_states = input_tensor
202
+
203
+ hidden_states = self.norm1(hidden_states)
204
+ hidden_states = self.nonlinearity(hidden_states)
205
+
206
+ hidden_states = self.conv1(hidden_states)
207
+
208
+ if temb is not None:
209
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
210
+
211
+ if temb is not None and self.time_embedding_norm == "default":
212
+ hidden_states = hidden_states + temb
213
+
214
+ hidden_states = self.norm2(hidden_states)
215
+
216
+ if temb is not None and self.time_embedding_norm == "scale_shift":
217
+ scale, shift = paddle.chunk(x=temb, chunks=2, axis=1)
218
+ hidden_states = hidden_states * (1 + scale) + shift
219
+
220
+ hidden_states = self.nonlinearity(hidden_states)
221
+
222
+ hidden_states = self.dropout(hidden_states)
223
+ hidden_states = self.conv2(hidden_states)
224
+
225
+ if self.conv_shortcut is not None:
226
+ input_tensor = self.conv_shortcut(input_tensor)
227
+
228
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
229
+
230
+ return output_tensor
231
+
232
+
233
+ class Mish(paddle.nn.Layer):
234
+ def forward(self, hidden_states):
235
+ return hidden_states * paddle.nn.functional.tanh(x=paddle.nn.functional.softplus(x=hidden_states))
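
A minimal sketch of the inflation trick used throughout this file: `InflatedConv3d` folds the frame axis into the batch, applies an ordinary 2D convolution, and unfolds again, so every frame is convolved independently with shared weights.

```python
import paddle

from ppdiffusers.models.animate_anyone.resnet import InflatedConv3d

conv = InflatedConv3d(4, 8, kernel_size=3, padding=1)   # constructed like a plain paddle.nn.Conv2D

video = paddle.randn([2, 4, 16, 32, 32])                # (batch, channels, frames, height, width)
out = conv(video)

print(out.shape)  # [2, 8, 16, 32, 32]: per-frame 2D convolution, no temporal mixing
```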
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Optional
17
+
18
+ import paddle
19
+ from einops import rearrange, repeat
20
+
21
+ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from ppdiffusers.models import ModelMixin
23
+ from ppdiffusers.utils import BaseOutput
24
+
25
+ from .attention import TemporalBasicTransformerBlock
26
+
27
+
28
+ @dataclass
29
+ class Transformer3DModelOutput(BaseOutput):
30
+ sample: paddle.Tensor
31
+
32
+
33
+ class Transformer3DModel(ModelMixin, ConfigMixin):
34
+ _supports_gradient_checkpointing = True
35
+
36
+ @register_to_config
37
+ def __init__(
38
+ self,
39
+ num_attention_heads: int = 16,
40
+ attention_head_dim: int = 88,
41
+ in_channels: Optional[int] = None,
42
+ num_layers: int = 1,
43
+ dropout: float = 0.0,
44
+ norm_num_groups: int = 32,
45
+ cross_attention_dim: Optional[int] = None,
46
+ attention_bias: bool = False,
47
+ activation_fn: str = "geglu",
48
+ num_embeds_ada_norm: Optional[int] = None,
49
+ use_linear_projection: bool = False,
50
+ only_cross_attention: bool = False,
51
+ upcast_attention: bool = False,
52
+ unet_use_cross_frame_attention=None,
53
+ unet_use_temporal_attention=None,
54
+ ):
55
+ super().__init__()
56
+ self.use_linear_projection = use_linear_projection
57
+ self.num_attention_heads = num_attention_heads
58
+ self.attention_head_dim = attention_head_dim
59
+ inner_dim = num_attention_heads * attention_head_dim
60
+
61
+ # Define input layers
62
+ self.in_channels = in_channels
63
+
64
+ self.norm = paddle.nn.GroupNorm(
65
+ num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06, weight_attr=True, bias_attr=True
66
+ )
67
+ if use_linear_projection:
68
+ self.proj_in = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)
69
+ else:
70
+ self.proj_in = paddle.nn.Conv2D(
71
+ in_channels=in_channels, out_channels=inner_dim, kernel_size=1, stride=1, padding=0
72
+ )
73
+ self.transformer_blocks = paddle.nn.LayerList(
74
+ sublayers=[
75
+ TemporalBasicTransformerBlock(
76
+ inner_dim,
77
+ num_attention_heads,
78
+ attention_head_dim,
79
+ dropout=dropout,
80
+ cross_attention_dim=cross_attention_dim,
81
+ activation_fn=activation_fn,
82
+ num_embeds_ada_norm=num_embeds_ada_norm,
83
+ attention_bias=attention_bias,
84
+ only_cross_attention=only_cross_attention,
85
+ upcast_attention=upcast_attention,
86
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
87
+ unet_use_temporal_attention=unet_use_temporal_attention,
88
+ )
89
+ for d in range(num_layers)
90
+ ]
91
+ )
92
+ if use_linear_projection:
93
+ self.proj_out = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)
94
+ else:
95
+ self.proj_out = paddle.nn.Conv2D(
96
+ in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, padding=0
97
+ )
98
+
99
+ self.gradient_checkpointing = False
100
+
101
+ def _set_gradient_checkpointing(self, module, value=False):
102
+ if hasattr(module, "gradient_checkpointing"):
103
+ module.gradient_checkpointing = value
104
+
105
+ def forward(
106
+ self,
107
+ hidden_states,
108
+ encoder_hidden_states=None,
109
+ timestep=None,
110
+ return_dict: bool = True,
111
+ ):
112
+ # Input
113
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
114
+ video_length = hidden_states.shape[2]
115
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
116
+ if encoder_hidden_states.shape[0] != hidden_states.shape[0]:
117
+ encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)
118
+
119
+ batch, channel, height, weight = hidden_states.shape
120
+ residual = hidden_states
121
+
122
+ hidden_states = self.norm(hidden_states)
123
+ if not self.use_linear_projection:
124
+ hidden_states = self.proj_in(hidden_states)
125
+ inner_dim = hidden_states.shape[1]
126
+ hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim))
127
+ else:
128
+ inner_dim = hidden_states.shape[1]
129
+ hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim))
130
+ hidden_states = self.proj_in(hidden_states)
131
+
132
+ # Blocks
133
+ for i, block in enumerate(self.transformer_blocks):
134
+ hidden_states = block(
135
+ hidden_states,
136
+ encoder_hidden_states=encoder_hidden_states,
137
+ timestep=timestep,
138
+ video_length=video_length,
139
+ )
140
+
141
+ # Output
142
+ if not self.use_linear_projection:
143
+ hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2])
144
+ hidden_states = self.proj_out(hidden_states)
145
+ else:
146
+ hidden_states = self.proj_out(hidden_states)
147
+ hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2])
148
+
149
+ output = hidden_states + residual
150
+
151
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
152
+ if not return_dict:
153
+ return (output,)
154
+
155
+ return Transformer3DModelOutput(sample=output)
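
A minimal sketch of the rearrange/repeat pattern `Transformer3DModel.forward` relies on: frames are folded into the batch so the 2D transformer blocks run per frame, and the text conditioning is repeated once per frame to match. The tensor sizes are illustrative.

```python
import paddle
from einops import rearrange, repeat

video_length = 8
hidden_states = paddle.randn([2, 320, video_length, 32, 32])   # b c f h w
encoder_hidden_states = paddle.randn([2, 77, 768])              # one text embedding per video

flat = rearrange(hidden_states, "b c f h w -> (b f) c h w")                    # frames into the batch
context = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)  # one copy per frame
print(flat.shape, context.shape)        # [16, 320, 32, 32] [16, 77, 768]

restored = rearrange(flat, "(b f) c h w -> b c f h w", f=video_length)
print(restored.shape)                   # back to [2, 320, 8, 32, 32]
```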
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet_blocks.py
16
+
17
+ from dataclasses import dataclass
18
+ from os import PathLike
19
+ from pathlib import Path
20
+ from typing import Dict, List, Optional, Tuple, Union
21
+
22
+ import paddle
23
+
24
+ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from ppdiffusers.models.attention_processor import AttentionProcessor
26
+ from ppdiffusers.models.embeddings import TimestepEmbedding, Timesteps
27
+ from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin
28
+ from ppdiffusers.utils import BaseOutput, logging
29
+
30
+ from .resnet import InflatedConv3d, InflatedGroupNorm
31
+ from .unet_3d_blocks import UNetMidBlock3DCrossAttn, get_down_block, get_up_block
32
+
33
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
34
+
35
+
36
+ @dataclass
37
+ class UNet3DConditionOutput(BaseOutput):
38
+ sample: paddle.Tensor
39
+
40
+
41
+ class UNet3DConditionModel(ModelMixin, ConfigMixin):
42
+ _supports_gradient_checkpointing = True
43
+
44
+ @register_to_config
45
+ def __init__(
46
+ self,
47
+ sample_size: Optional[int] = None,
48
+ in_channels: int = 4,
49
+ out_channels: int = 4,
50
+ center_input_sample: bool = False,
51
+ flip_sin_to_cos: bool = True,
52
+ freq_shift: int = 0,
53
+ down_block_types: Tuple[str] = (
54
+ "CrossAttnDownBlock3D",
55
+ "CrossAttnDownBlock3D",
56
+ "CrossAttnDownBlock3D",
57
+ "DownBlock3D",
58
+ ),
59
+ mid_block_type: str = "UNetMidBlock3DCrossAttn",
60
+ up_block_types: Tuple[str] = (
61
+ "UpBlock3D",
62
+ "CrossAttnUpBlock3D",
63
+ "CrossAttnUpBlock3D",
64
+ "CrossAttnUpBlock3D",
65
+ ),
66
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
67
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
68
+ layers_per_block: int = 2,
69
+ downsample_padding: int = 1,
70
+ mid_block_scale_factor: float = 1,
71
+ act_fn: str = "silu",
72
+ norm_num_groups: int = 32,
73
+ norm_eps: float = 1e-5,
74
+ cross_attention_dim: int = 1280,
75
+ attention_head_dim: Union[int, Tuple[int]] = 8,
76
+ dual_cross_attention: bool = False,
77
+ use_linear_projection: bool = False,
78
+ class_embed_type: Optional[str] = None,
79
+ num_class_embeds: Optional[int] = None,
80
+ upcast_attention: bool = False,
81
+ resnet_time_scale_shift: str = "default",
82
+ use_inflated_groupnorm=False,
83
+ # Additional
84
+ use_motion_module=False,
85
+ motion_module_resolutions=(1, 2, 4, 8),
86
+ motion_module_mid_block=False,
87
+ motion_module_decoder_only=False,
88
+ motion_module_type=None,
89
+ motion_module_kwargs={},
90
+ unet_use_cross_frame_attention=None,
91
+ unet_use_temporal_attention=None,
92
+ ):
93
+ super().__init__()
94
+
95
+ self.sample_size = sample_size
96
+ time_embed_dim = block_out_channels[0] * 4
97
+
98
+ # input
99
+ self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
100
+
101
+ # time
102
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
103
+ timestep_input_dim = block_out_channels[0]
104
+
105
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
106
+
107
+ # class embedding
108
+ if class_embed_type is None and num_class_embeds is not None:
109
+ self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
110
+ elif class_embed_type == "timestep":
111
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
112
+ elif class_embed_type == "identity":
113
+ self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim)
114
+ else:
115
+ self.class_embedding = None
116
+
117
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
118
+ self.mid_block = None
119
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
120
+
121
+ if isinstance(only_cross_attention, bool):
122
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
123
+
124
+ if isinstance(attention_head_dim, int):
125
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
126
+
127
+ # down
128
+ output_channel = block_out_channels[0]
129
+ for i, down_block_type in enumerate(down_block_types):
130
+ res = 2**i
131
+ input_channel = output_channel
132
+ output_channel = block_out_channels[i]
133
+ is_final_block = i == len(block_out_channels) - 1
134
+
135
+ down_block = get_down_block(
136
+ down_block_type,
137
+ num_layers=layers_per_block,
138
+ in_channels=input_channel,
139
+ out_channels=output_channel,
140
+ temb_channels=time_embed_dim,
141
+ add_downsample=not is_final_block,
142
+ resnet_eps=norm_eps,
143
+ resnet_act_fn=act_fn,
144
+ resnet_groups=norm_num_groups,
145
+ cross_attention_dim=cross_attention_dim,
146
+ attn_num_head_channels=attention_head_dim[i],
147
+ downsample_padding=downsample_padding,
148
+ dual_cross_attention=dual_cross_attention,
149
+ use_linear_projection=use_linear_projection,
150
+ only_cross_attention=only_cross_attention[i],
151
+ upcast_attention=upcast_attention,
152
+ resnet_time_scale_shift=resnet_time_scale_shift,
153
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
154
+ unet_use_temporal_attention=unet_use_temporal_attention,
155
+ use_inflated_groupnorm=use_inflated_groupnorm,
156
+ use_motion_module=use_motion_module
157
+ and (res in motion_module_resolutions)
158
+ and (not motion_module_decoder_only),
159
+ motion_module_type=motion_module_type,
160
+ motion_module_kwargs=motion_module_kwargs,
161
+ )
162
+ self.down_blocks.append(down_block)
163
+
164
+ # mid
165
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
166
+ self.mid_block = UNetMidBlock3DCrossAttn(
167
+ in_channels=block_out_channels[-1],
168
+ temb_channels=time_embed_dim,
169
+ resnet_eps=norm_eps,
170
+ resnet_act_fn=act_fn,
171
+ output_scale_factor=mid_block_scale_factor,
172
+ resnet_time_scale_shift=resnet_time_scale_shift,
173
+ cross_attention_dim=cross_attention_dim,
174
+ attn_num_head_channels=attention_head_dim[-1],
175
+ resnet_groups=norm_num_groups,
176
+ dual_cross_attention=dual_cross_attention,
177
+ use_linear_projection=use_linear_projection,
178
+ upcast_attention=upcast_attention,
179
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
180
+ unet_use_temporal_attention=unet_use_temporal_attention,
181
+ use_inflated_groupnorm=use_inflated_groupnorm,
182
+ use_motion_module=use_motion_module and motion_module_mid_block,
183
+ motion_module_type=motion_module_type,
184
+ motion_module_kwargs=motion_module_kwargs,
185
+ )
186
+ else:
187
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
188
+
189
+ # count how many layers upsample the videos
190
+ self.num_upsamplers = 0
191
+
192
+ # up
193
+ reversed_block_out_channels = list(reversed(block_out_channels))
194
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
195
+ only_cross_attention = list(reversed(only_cross_attention))
196
+ output_channel = reversed_block_out_channels[0]
197
+ for i, up_block_type in enumerate(up_block_types):
198
+ res = 2 ** (3 - i)
199
+ is_final_block = i == len(block_out_channels) - 1
200
+
201
+ prev_output_channel = output_channel
202
+ output_channel = reversed_block_out_channels[i]
203
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
204
+
205
+ # add upsample block for all BUT final layer
206
+ if not is_final_block:
207
+ add_upsample = True
208
+ self.num_upsamplers += 1
209
+ else:
210
+ add_upsample = False
211
+
212
+ up_block = get_up_block(
213
+ up_block_type,
214
+ num_layers=layers_per_block + 1,
215
+ in_channels=input_channel,
216
+ out_channels=output_channel,
217
+ prev_output_channel=prev_output_channel,
218
+ temb_channels=time_embed_dim,
219
+ add_upsample=add_upsample,
220
+ resnet_eps=norm_eps,
221
+ resnet_act_fn=act_fn,
222
+ resnet_groups=norm_num_groups,
223
+ cross_attention_dim=cross_attention_dim,
224
+ attn_num_head_channels=reversed_attention_head_dim[i],
225
+ dual_cross_attention=dual_cross_attention,
226
+ use_linear_projection=use_linear_projection,
227
+ only_cross_attention=only_cross_attention[i],
228
+ upcast_attention=upcast_attention,
229
+ resnet_time_scale_shift=resnet_time_scale_shift,
230
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
231
+ unet_use_temporal_attention=unet_use_temporal_attention,
232
+ use_inflated_groupnorm=use_inflated_groupnorm,
233
+ use_motion_module=use_motion_module and (res in motion_module_resolutions),
234
+ motion_module_type=motion_module_type,
235
+ motion_module_kwargs=motion_module_kwargs,
236
+ )
237
+ self.up_blocks.append(up_block)
238
+ prev_output_channel = output_channel
239
+
240
+ # out
241
+ if use_inflated_groupnorm:
242
+ self.conv_norm_out = InflatedGroupNorm(
243
+ num_channels=block_out_channels[0],
244
+ num_groups=norm_num_groups,
245
+ epsilon=norm_eps,
246
+ )
247
+ else:
248
+
249
+ self.conv_norm_out = paddle.nn.GroupNorm(
250
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
251
+ )
252
+ self.conv_act = paddle.nn.Silu()
253
+ self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
254
+
255
+ @property
256
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
257
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
258
+ r"""
259
+ Returns:
260
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
261
+ indexed by its weight name.
262
+ """
263
+ # set recursively
264
+ processors = {}
265
+
266
+ def fn_recursive_add_processors(
267
+ name: str,
268
+ module: paddle.nn.Layer,
269
+ processors: Dict[str, AttentionProcessor],
270
+ ):
271
+ if hasattr(module, "set_processor"):
272
+ processors[f"{name}.processor"] = module.processor
273
+
274
+ for sub_name, child in module.named_children():
275
+ if "temporal_transformer" not in sub_name:
276
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
277
+
278
+ return processors
279
+
280
+ for name, module in self.named_children():
281
+ if "temporal_transformer" not in name:
282
+ fn_recursive_add_processors(name, module, processors)
283
+
284
+ return processors
285
+
286
+ def set_attention_slice(self, slice_size):
287
+ r"""
288
+ Enable sliced attention computation.
289
+
290
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
291
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
292
+
293
+ Args:
294
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
295
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
296
+ `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
297
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
298
+ must be a multiple of `slice_size`.
299
+ """
300
+ sliceable_head_dims = []
301
+
302
+ def fn_recursive_retrieve_slicable_dims(module: paddle.nn.Layer):
303
+ if hasattr(module, "set_attention_slice"):
304
+ sliceable_head_dims.append(module.sliceable_head_dim)
305
+
306
+ for child in module.children():
307
+ fn_recursive_retrieve_slicable_dims(child)
308
+
309
+ # retrieve number of attention layers
310
+ for module in self.children():
311
+ fn_recursive_retrieve_slicable_dims(module)
312
+
313
+ num_slicable_layers = len(sliceable_head_dims)
314
+
315
+ if slice_size == "auto":
316
+ # half the attention head size is usually a good trade-off between
317
+ # speed and memory
318
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
319
+ elif slice_size == "max":
320
+ # make smallest slice possible
321
+ slice_size = num_slicable_layers * [1]
322
+
323
+ slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
324
+
325
+ if len(slice_size) != len(sliceable_head_dims):
326
+ raise ValueError(
327
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
328
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
329
+ )
330
+
331
+ for i in range(len(slice_size)):
332
+ size = slice_size[i]
333
+ dim = sliceable_head_dims[i]
334
+ if size is not None and size > dim:
335
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
336
+
337
+ # Recursively walk through all the children.
338
+ # Any children which exposes the set_attention_slice method
339
+ # gets the message
340
+ def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]):
341
+ if hasattr(module, "set_attention_slice"):
342
+ module.set_attention_slice(slice_size.pop())
343
+
344
+ for child in module.children():
345
+ fn_recursive_set_attention_slice(child, slice_size)
346
+
347
+ reversed_slice_size = list(reversed(slice_size))
348
+ for module in self.children():
349
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
350
+
351
+ def _set_gradient_checkpointing(self, module, value=False):
352
+ if hasattr(module, "gradient_checkpointing"):
353
+ module.gradient_checkpointing = value
354
+
355
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
356
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
357
+ r"""
358
+ Sets the attention processor to use to compute attention.
359
+
360
+ Parameters:
361
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
362
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
363
+ for **all** `Attention` layers.
364
+
365
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
366
+ processor. This is strongly recommended when setting trainable attention processors.
367
+
368
+ """
369
+ count = len(self.attn_processors.keys())
370
+
371
+ if isinstance(processor, dict) and len(processor) != count:
372
+ raise ValueError(
373
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
374
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
375
+ )
376
+
377
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
378
+ if hasattr(module, "set_processor"):
379
+ if not isinstance(processor, dict):
380
+ module.set_processor(processor)
381
+ else:
382
+ module.set_processor(processor.pop(f"{name}.processor"))
383
+
384
+ for sub_name, child in module.named_children():
385
+ if "temporal_transformer" not in sub_name:
386
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
387
+
388
+ for name, module in self.named_children():
389
+ if "temporal_transformer" not in name:
390
+ fn_recursive_attn_processor(name, module, processor)
391
+
392
+ def forward(
393
+ self,
394
+ sample: paddle.Tensor,
395
+ timestep: Union[paddle.Tensor, float, int],
396
+ encoder_hidden_states: paddle.Tensor,
397
+ class_labels: Optional[paddle.Tensor] = None,
398
+ pose_cond_fea: Optional[paddle.Tensor] = None,
399
+ attention_mask: Optional[paddle.Tensor] = None,
400
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
401
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
402
+ return_dict: bool = True,
403
+ ) -> Union[UNet3DConditionOutput, Tuple]:
404
+ r"""
405
+ Args:
406
+ sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor
407
+ timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps
408
+ encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states
409
+ return_dict (`bool`, *optional*, defaults to `True`):
410
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
411
+
412
+ Returns:
413
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
414
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
415
+ returning a tuple, the first element is the sample tensor.
416
+ """
417
+ # By default samples have to be at least a multiple of the overall upsampling factor.
418
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
419
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
420
+ # on the fly if necessary.
421
+ default_overall_up_factor = 2**self.num_upsamplers
422
+
423
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
424
+ forward_upsample_size = False
425
+ upsample_size = None
426
+
427
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
428
+ logger.info("Forward upsample size to force interpolation output size.")
429
+ forward_upsample_size = True
430
+
431
+ # prepare attention_mask
432
+ if attention_mask is not None:
433
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
434
+ attention_mask = attention_mask.unsqueeze(1)
435
+
436
+ # center input if necessary
437
+ if self.config.center_input_sample:
438
+ sample = 2 * sample - 1.0
439
+
440
+ # time
441
+ timesteps = timestep
442
+ if not paddle.is_tensor(timesteps):
443
+ # This would be a good case for the `match` statement (Python 3.10+)
444
+ # Paddle tensors have no torch-style `device.type`, so the MPS special case does not apply here.
445
+ if isinstance(timestep, float):
446
+ dtype = "float64"
447
+ else:
448
+ dtype = "int64"
449
+ timesteps = paddle.to_tensor([timesteps], dtype=dtype)
450
+ elif len(timesteps.shape) == 0:
451
+ timesteps = timesteps[None]
452
+
453
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
454
+ timesteps = timesteps.expand(sample.shape[0])
455
+
456
+ t_emb = self.time_proj(timesteps)
457
+
458
+ # timesteps does not contain any weights and will always return f32 tensors
459
+ # but time_embedding might actually be running in fp16. so we need to cast here.
460
+ # there might be better ways to encapsulate this.
461
+ t_emb = t_emb.to(dtype=self.dtype)
462
+ emb = self.time_embedding(t_emb)
463
+
464
+ if self.class_embedding is not None:
465
+ if class_labels is None:
466
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
467
+
468
+ if self.config.class_embed_type == "timestep":
469
+ class_labels = self.time_proj(class_labels)
470
+
471
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
472
+ emb = emb + class_emb
473
+
474
+ # pre-process
475
+
476
+         sample = self.conv_in(sample)
+
+         if pose_cond_fea is not None:
+             sample = sample + pose_cond_fea
+
+         # down
+         down_block_res_samples = (sample,)
+         for downsample_block in self.down_blocks:
+             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+
+                 sample, res_samples = downsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     encoder_hidden_states=encoder_hidden_states,
+                     attention_mask=attention_mask,
+                 )
+
+             else:
+                 sample, res_samples = downsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     encoder_hidden_states=encoder_hidden_states,
+                 )
+
+             down_block_res_samples += res_samples
+
+         if down_block_additional_residuals is not None:
+             new_down_block_res_samples = ()
+
+             for down_block_res_sample, down_block_additional_residual in zip(
+                 down_block_res_samples, down_block_additional_residuals
+             ):
+                 down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                 new_down_block_res_samples += (down_block_res_sample,)
+
+             down_block_res_samples = new_down_block_res_samples
+
+         # mid
+         sample = self.mid_block(
+             sample,
+             emb,
+             encoder_hidden_states=encoder_hidden_states,
+             attention_mask=attention_mask,
+         )
+
+         if mid_block_additional_residual is not None:
+             sample = sample + mid_block_additional_residual
+
+         # up
+         for i, upsample_block in enumerate(self.up_blocks):
+             is_final_block = i == len(self.up_blocks) - 1
+
+             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+             # if we have not reached the final block and need to forward the
+             # upsample size, we do it here
+             if not is_final_block and forward_upsample_size:
+                 upsample_size = down_block_res_samples[-1].shape[2:]
+
+             if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                 sample = upsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     res_hidden_states_tuple=res_samples,
+                     encoder_hidden_states=encoder_hidden_states,
+                     upsample_size=upsample_size,
+                     attention_mask=attention_mask,
+                 )
+             else:
+                 sample = upsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     res_hidden_states_tuple=res_samples,
+                     upsample_size=upsample_size,
+                     encoder_hidden_states=encoder_hidden_states,
+                 )
+
+         # post-process
+         sample = self.conv_norm_out(sample)
+         sample = self.conv_act(sample)
+         sample = self.conv_out(sample)
+
+         if not return_dict:
+             return (sample,)
+
+         return UNet3DConditionOutput(sample=sample)
+
+     @classmethod
+     def from_pretrained_2d(
+         cls,
+         denoising_unet_config_path: Optional[Union[str, PathLike]],
+         base_model_path: Optional[Union[str, PathLike]] = None,
+         motion_module_path: Optional[Union[str, PathLike]] = None,
+         weight_dtype=None,
+         unet_additional_kwargs=None,
+     ):
+
+         config_file = denoising_unet_config_path
+         if not (Path(config_file).exists() and Path(config_file).is_file()):
+             raise RuntimeError(f"{config_file} does not exist or is not a file")
+
+         unet_config = cls.load_config(config_file)
+         unet_config["_class_name"] = cls.__name__
+         unet_config["down_block_types"] = [
+             "CrossAttnDownBlock3D",
+             "CrossAttnDownBlock3D",
+             "CrossAttnDownBlock3D",
+             "DownBlock3D",
+         ]
+         unet_config["up_block_types"] = [
+             "UpBlock3D",
+             "CrossAttnUpBlock3D",
+             "CrossAttnUpBlock3D",
+             "CrossAttnUpBlock3D",
+         ]
+         unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
+
+         init_contexts = []
+         if weight_dtype is not None:
+             init_contexts.append(paddle.dtype_guard(weight_dtype))
+
+         with ContextManagers(init_contexts):
+             model = cls.from_config(unet_config, **unet_additional_kwargs)
+
+         state_dict = paddle.load(base_model_path)
+
+         # motion module updating
+         if motion_module_path is not None:
+             motion_state_dict = paddle.load(motion_module_path)
+             state_dict.update(motion_state_dict)
+
+         if weight_dtype is not None:
+             for k in state_dict.keys():
+                 state_dict[k] = state_dict[k].astype(weight_dtype)
+
+         m, u = model.set_state_dict(state_dict)
+         print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
+
+         return model
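For orientation, a minimal usage sketch of `from_pretrained_2d` as defined above. It assumes the enclosing class is this module's `UNet3DConditionModel` and that the module path mirrors this file's location in the diff; the checkpoint paths are hypothetical placeholders, and an empty dict is passed for `unet_additional_kwargs` because the method expands that argument with `**`.

from ppdiffusers.models.animate_anyone.unet_3d import UNet3DConditionModel  # module path assumed from this diff

# Hypothetical local files; replace with real AnimateAnyone assets.
denoising_unet = UNet3DConditionModel.from_pretrained_2d(
    denoising_unet_config_path="./pretrained_weights/denoising_unet/config.json",
    base_model_path="./pretrained_weights/denoising_unet.pdparams",    # 2D UNet weights
    motion_module_path="./pretrained_weights/motion_module.pdparams",  # merged on top of the 2D weights
    unet_additional_kwargs={},  # must be dict-like, since it is expanded with **
)
denoising_unet.eval()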
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py ADDED
@@ -0,0 +1,28 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass
+ from typing import Union
+
+ import numpy as np
+ import paddle
+
+ import ppdiffusers
+
+ from .unet import UNet3DConditionModel  # noqa: *
+
+
+ @dataclass
+ class HotshotPipelineXLOutput(ppdiffusers.utils.BaseOutput):
+     videos: Union[paddle.Tensor, np.ndarray]
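A short sketch of the output container defined above; the video tensor shape is only an illustrative assumption.

import paddle

from ppdiffusers.models.hotshot_xl import HotshotPipelineXLOutput  # module path assumed from this diff

dummy_videos = paddle.rand([1, 3, 8, 64, 64])  # assumed (batch, channels, frames, height, width)
out = HotshotPipelineXLOutput(videos=dummy_videos)
print(type(out.videos), out.videos.shape)  # BaseOutput also supports dict-style access: out["videos"]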
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py ADDED
@@ -0,0 +1,124 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import paddle
+ from einops import rearrange
+
+ import ppdiffusers
+ from ppdiffusers.models import resnet
+
+
+ class Upsample3D(resnet.Upsample2D):
+     def forward(self, hidden_states, output_size=None, scale: float = 1.0):
+         f = tuple(hidden_states.shape)[2]
+         hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+         hidden_states = super(Upsample3D, self).forward(hidden_states, output_size, scale)
+         return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+ class Downsample3D(ppdiffusers.models.resnet.Downsample2D):
+     def forward(self, hidden_states, scale: float = 1.0):
+         f = tuple(hidden_states.shape)[2]
+         hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+         hidden_states = super(Downsample3D, self).forward(hidden_states, scale)
+         return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+ class Conv3d(ppdiffusers.models.resnet.LoRACompatibleConv):
+     def forward(self, hidden_states, scale: float = 1.0):
+         f = tuple(hidden_states.shape)[2]
+         hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+         hidden_states = super().forward(hidden_states, scale)
+         return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+ class ResnetBlock3D(paddle.nn.Layer):
+     def __init__(
+         self,
+         *,
+         in_channels,
+         out_channels=None,
+         conv_shortcut=False,
+         dropout=0.0,
+         temb_channels=512,
+         groups=32,
+         groups_out=None,
+         pre_norm=True,
+         eps=1e-06,
+         non_linearity="silu",
+         time_embedding_norm="default",
+         output_scale_factor=1.0,
+         use_in_shortcut=None,
+         conv_shortcut_bias: bool = True
+     ):
+         super().__init__()
+         self.pre_norm = pre_norm
+         self.pre_norm = True
+         self.in_channels = in_channels
+         out_channels = in_channels if out_channels is None else out_channels
+         self.out_channels = out_channels
+         self.use_conv_shortcut = conv_shortcut
+         self.time_embedding_norm = time_embedding_norm
+         self.output_scale_factor = output_scale_factor
+         if groups_out is None:
+             groups_out = groups
+         self.norm1 = paddle.nn.GroupNorm(
+             num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True
+         )
+         self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+         if temb_channels is not None:
+             if self.time_embedding_norm == "default":
+                 time_emb_proj_out_channels = out_channels
+             elif self.time_embedding_norm == "scale_shift":
+                 time_emb_proj_out_channels = out_channels * 2
+             else:
+                 raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+             self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels)
+         else:
+             self.time_emb_proj = None
+         self.norm2 = paddle.nn.GroupNorm(
+             num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True
+         )
+         self.dropout = paddle.nn.Dropout(p=dropout)
+         self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+         assert non_linearity == "silu"
+         self.nonlinearity = paddle.nn.Silu()
+         self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
+         self.conv_shortcut = None
+         if self.use_in_shortcut:
+             self.conv_shortcut = Conv3d(
+                 in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias
+             )
+
+     def forward(self, input_tensor, temb):
+         hidden_states = input_tensor
+         hidden_states = self.norm1(hidden_states)
+         hidden_states = self.nonlinearity(hidden_states)
+         hidden_states = self.conv1(hidden_states)
+         if temb is not None:
+             temb = self.nonlinearity(temb)
+             temb = self.time_emb_proj(temb)[:, :, None, None, None]
+         if temb is not None and self.time_embedding_norm == "default":
+             hidden_states = hidden_states + temb
+         hidden_states = self.norm2(hidden_states)
+         if temb is not None and self.time_embedding_norm == "scale_shift":
+             scale, shift = paddle.chunk(x=temb, chunks=2, axis=1)
+             hidden_states = hidden_states * (1 + scale) + shift
+         hidden_states = self.nonlinearity(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         hidden_states = self.conv2(hidden_states)
+         if self.conv_shortcut is not None:
+             input_tensor = self.conv_shortcut(input_tensor)
+         output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+         return output_tensor
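All of the pseudo-3D layers above rely on the same trick: fold the frame axis into the batch axis, run the inherited 2D operator, then unfold. A standalone sketch of that pattern with a plain `Conv2D` (sizes are illustrative, and it assumes einops with the Paddle backend, as used throughout these files):

import paddle
from einops import rearrange

b, c, f, h, w = 2, 8, 16, 32, 32      # illustrative sizes
video = paddle.rand([b, c, f, h, w])  # (batch, channels, frames, height, width)

conv2d = paddle.nn.Conv2D(in_channels=c, out_channels=c, kernel_size=3, padding=1)

x = rearrange(video, "b c f h w -> (b f) c h w")   # frames become extra batch items
x = conv2d(x)                                      # any 2D layer can be reused unchanged
x = rearrange(x, "(b f) c h w -> b c f h w", f=f)  # restore the frame axis
print(x.shape)  # [2, 8, 16, 32, 32]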
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py ADDED
@@ -0,0 +1,77 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, Optional
+
+ import paddle
+ from einops import rearrange, repeat
+
+ import ppdiffusers
+
+
+ @dataclass
+ class Transformer3DModelOutput(ppdiffusers.utils.BaseOutput):
+     """
+     The output of [`Transformer3DModel`].
+
+     Args:
+         sample (`paddle.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+             The hidden states output conditioned on the `encoder_hidden_states` input.
+     """
+
+     sample: paddle.Tensor
+
+
+ class Transformer3DModel(ppdiffusers.models.transformer_2d.Transformer2DModel):
+     def __init__(self, *args, **kwargs):
+         super(Transformer3DModel, self).__init__(*args, **kwargs)
+         init_Constant = paddle.nn.initializer.Constant(value=0.0)
+         init_Constant(self.proj_out.weight.data)
+         init_Constant = paddle.nn.initializer.Constant(value=0.0)
+         init_Constant(self.proj_out.bias.data)
+
+     def forward(
+         self,
+         hidden_states: paddle.Tensor,
+         encoder_hidden_states: Optional[paddle.Tensor] = None,
+         timestep: Optional[int] = None,
+         class_labels: Optional[int] = None,
+         cross_attention_kwargs: Dict[str, Any] = None,
+         attention_mask: Optional[paddle.Tensor] = None,
+         encoder_attention_mask: Optional[paddle.Tensor] = None,
+         enable_temporal_layers: bool = True,
+         positional_embedding: Optional[paddle.Tensor] = None,
+         return_dict: bool = True,
+     ):
+         is_video = len(tuple(hidden_states.shape)) == 5
+         if is_video:
+             f = tuple(hidden_states.shape)[2]
+             hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+             encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=f)
+         hidden_states = super(Transformer3DModel, self).forward(
+             hidden_states,
+             encoder_hidden_states,
+             timestep,
+             class_labels,
+             cross_attention_kwargs,
+             attention_mask,
+             encoder_attention_mask,
+             return_dict=False,
+         )[0]
+         if is_video:
+             hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+         if not return_dict:
+             return (hidden_states,)
+         return Transformer3DModelOutput(sample=hidden_states)
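Zero-initialising a projection that feeds a residual connection, as the constructor above does for `proj_out`, is a common trick for newly added layers: the block starts out as an identity map, so behaviour is unchanged until the new weights are trained or overwritten by a loaded checkpoint. A toy sketch of the idea (names and sizes are illustrative):

import paddle


class ZeroInitResidual(paddle.nn.Layer):
    """Toy residual block whose learned branch starts as a no-op."""

    def __init__(self, dim: int):
        super().__init__()
        self.proj = paddle.nn.Linear(dim, dim)
        # Zero the projection so that forward(x) == x at initialisation.
        self.proj.weight.set_value(paddle.zeros_like(self.proj.weight))
        self.proj.bias.set_value(paddle.zeros_like(self.proj.bias))

    def forward(self, x):
        return x + self.proj(x)


x = paddle.rand([4, 32])
block = ZeroInitResidual(32)
print(bool(paddle.allclose(block(x), x)))  # True until proj is trained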
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py ADDED
@@ -0,0 +1,778 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+
21
+ import ppdiffusers
22
+ from ppdiffusers import loaders, transformers # noqa: *
23
+
24
+ from .resnet import Conv3d
25
+ from .unet_blocks import (
26
+ CrossAttnDownBlock3D,
27
+ CrossAttnUpBlock3D,
28
+ DownBlock3D,
29
+ UNetMidBlock3DCrossAttn,
30
+ UpBlock3D,
31
+ get_down_block,
32
+ get_up_block,
33
+ )
34
+
35
+ logger = ppdiffusers.utils.logging.get_logger(__name__)
36
+
37
+
38
+ @dataclass
39
+ class UNet3DConditionOutput(ppdiffusers.utils.BaseOutput):
40
+ """
41
+ The output of [`UNet3DConditionModel`].
42
+
43
+ Args:
44
+ sample (`paddle.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
45
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
46
+ """
47
+
48
+ sample: paddle.Tensor = None
49
+
50
+
51
+ class UNet3DConditionModel(
52
+ ppdiffusers.models.modeling_utils.ModelMixin,
53
+ ppdiffusers.configuration_utils.ConfigMixin,
54
+ loaders.UNet2DConditionLoadersMixin,
55
+ ):
56
+ _supports_gradient_checkpointing = True
57
+
58
+ @ppdiffusers.configuration_utils.register_to_config
59
+ def __init__(
60
+ self,
61
+ sample_size: Optional[int] = None,
62
+ in_channels: int = 4,
63
+ out_channels: int = 4,
64
+ center_input_sample: bool = False,
65
+ flip_sin_to_cos: bool = True,
66
+ freq_shift: int = 0,
67
+ down_block_types: Tuple[str] = ("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
68
+ mid_block_type: Optional[str] = "UNetMidBlock3DCrossAttn",
69
+ up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
70
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
71
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
72
+ layers_per_block: Union[int, Tuple[int]] = 2,
73
+ downsample_padding: int = 1,
74
+ mid_block_scale_factor: float = 1,
75
+ act_fn: str = "silu",
76
+ norm_num_groups: Optional[int] = 32,
77
+ norm_eps: float = 1e-05,
78
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
79
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
80
+ encoder_hid_dim: Optional[int] = None,
81
+ encoder_hid_dim_type: Optional[str] = None,
82
+ attention_head_dim: Union[int, Tuple[int]] = 8,
83
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
84
+ dual_cross_attention: bool = False,
85
+ use_linear_projection: bool = False,
86
+ class_embed_type: Optional[str] = None,
87
+ addition_embed_type: Optional[str] = None,
88
+ addition_time_embed_dim: Optional[int] = None,
89
+ num_class_embeds: Optional[int] = None,
90
+ upcast_attention: bool = False,
91
+ resnet_time_scale_shift: str = "default",
92
+ resnet_skip_time_act: bool = False,
93
+ resnet_out_scale_factor: int = 1.0,
94
+ time_embedding_type: str = "positional",
95
+ time_embedding_dim: Optional[int] = None,
96
+ time_embedding_act_fn: Optional[str] = None,
97
+ timestep_post_act: Optional[str] = None,
98
+ time_cond_proj_dim: Optional[int] = None,
99
+ conv_in_kernel: int = 3,
100
+ conv_out_kernel: int = 3,
101
+ projection_class_embeddings_input_dim: Optional[int] = None,
102
+ class_embeddings_concat: bool = False,
103
+ mid_block_only_cross_attention: Optional[bool] = None,
104
+ cross_attention_norm: Optional[str] = None,
105
+ addition_embed_type_num_heads=64,
106
+ ):
107
+ super().__init__()
108
+ self.sample_size = sample_size
109
+ if num_attention_heads is not None:
110
+ raise ValueError(
111
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
112
+ )
113
+ num_attention_heads = num_attention_heads or attention_head_dim
114
+ if len(down_block_types) != len(up_block_types):
115
+ raise ValueError(
116
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
117
+ )
118
+ if len(block_out_channels) != len(down_block_types):
119
+ raise ValueError(
120
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
121
+ )
122
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
123
+ raise ValueError(
124
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
125
+ )
126
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
127
+ raise ValueError(
128
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
129
+ )
130
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
131
+ raise ValueError(
132
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
133
+ )
134
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
135
+ raise ValueError(
136
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
137
+ )
138
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
139
+ raise ValueError(
140
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
141
+ )
142
+ conv_in_padding = (conv_in_kernel - 1) // 2
143
+ self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)
144
+ if time_embedding_type == "fourier":
145
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
146
+ if time_embed_dim % 2 != 0:
147
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
148
+ self.time_proj = ppdiffusers.models.embeddings.GaussianFourierProjection(
149
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
150
+ )
151
+ timestep_input_dim = time_embed_dim
152
+ elif time_embedding_type == "positional":
153
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
154
+ self.time_proj = ppdiffusers.models.embeddings.Timesteps(
155
+ block_out_channels[0], flip_sin_to_cos, freq_shift
156
+ )
157
+ timestep_input_dim = block_out_channels[0]
158
+ else:
159
+ raise ValueError(
160
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
161
+ )
162
+ self.time_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
163
+ timestep_input_dim,
164
+ time_embed_dim,
165
+ act_fn=act_fn,
166
+ post_act_fn=timestep_post_act,
167
+ cond_proj_dim=time_cond_proj_dim,
168
+ )
169
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
170
+ encoder_hid_dim_type = "text_proj"
171
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
172
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
173
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
174
+ raise ValueError(
175
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
176
+ )
177
+ if encoder_hid_dim_type == "text_proj":
178
+ self.encoder_hid_proj = paddle.nn.Linear(in_features=encoder_hid_dim, out_features=cross_attention_dim)
179
+ elif encoder_hid_dim_type == "text_image_proj":
180
+ self.encoder_hid_proj = ppdiffusers.models.embeddings.TextImageProjection(
181
+ text_embed_dim=encoder_hid_dim,
182
+ image_embed_dim=cross_attention_dim,
183
+ cross_attention_dim=cross_attention_dim,
184
+ )
185
+ elif encoder_hid_dim_type == "image_proj":
186
+ self.encoder_hid_proj = ppdiffusers.models.embeddings.ImageProjection(
187
+ image_embed_dim=encoder_hid_dim, cross_attention_dim=cross_attention_dim
188
+ )
189
+ elif encoder_hid_dim_type is not None:
190
+ raise ValueError(
191
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
192
+ )
193
+ else:
194
+ self.encoder_hid_proj = None
195
+ if class_embed_type is None and num_class_embeds is not None:
196
+ self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
197
+ elif class_embed_type == "timestep":
198
+ self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
199
+ timestep_input_dim, time_embed_dim, act_fn=act_fn
200
+ )
201
+ elif class_embed_type == "identity":
202
+ self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim)
203
+ elif class_embed_type == "projection":
204
+ if projection_class_embeddings_input_dim is None:
205
+ raise ValueError(
206
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
207
+ )
208
+ self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
209
+ projection_class_embeddings_input_dim, time_embed_dim
210
+ )
211
+ elif class_embed_type == "simple_projection":
212
+ if projection_class_embeddings_input_dim is None:
213
+ raise ValueError(
214
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
215
+ )
216
+ self.class_embedding = paddle.nn.Linear(
217
+ in_features=projection_class_embeddings_input_dim, out_features=time_embed_dim
218
+ )
219
+ else:
220
+ self.class_embedding = None
221
+ if addition_embed_type == "text":
222
+ if encoder_hid_dim is not None:
223
+ text_time_embedding_from_dim = encoder_hid_dim
224
+ else:
225
+ text_time_embedding_from_dim = cross_attention_dim
226
+ self.add_embedding = ppdiffusers.models.embeddings.TextTimeEmbedding(
227
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
228
+ )
229
+ elif addition_embed_type == "text_image":
230
+ self.add_embedding = ppdiffusers.models.embeddings.TextImageTimeEmbedding(
231
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
232
+ )
233
+ elif addition_embed_type == "text_time":
234
+ self.add_time_proj = ppdiffusers.models.embeddings.Timesteps(
235
+ addition_time_embed_dim, flip_sin_to_cos, freq_shift
236
+ )
237
+ self.add_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
238
+ projection_class_embeddings_input_dim, time_embed_dim
239
+ )
240
+ elif addition_embed_type == "image":
241
+ self.add_embedding = ppdiffusers.models.embeddings.ImageTimeEmbedding(
242
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
243
+ )
244
+ elif addition_embed_type == "image_hint":
245
+ self.add_embedding = ppdiffusers.models.embeddings.ImageHintTimeEmbedding(
246
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
247
+ )
248
+ elif addition_embed_type is not None:
249
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
250
+ if time_embedding_act_fn is None:
251
+ self.time_embed_act = None
252
+ else:
253
+ self.time_embed_act = ppdiffusers.models.activations.get_activation(time_embedding_act_fn)
254
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
255
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
256
+ if isinstance(only_cross_attention, bool):
257
+ if mid_block_only_cross_attention is None:
258
+ mid_block_only_cross_attention = only_cross_attention
259
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
260
+ if mid_block_only_cross_attention is None:
261
+ mid_block_only_cross_attention = False
262
+ if isinstance(num_attention_heads, int):
263
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
264
+ if isinstance(attention_head_dim, int):
265
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
266
+ if isinstance(cross_attention_dim, int):
267
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
268
+ if isinstance(layers_per_block, int):
269
+ layers_per_block = [layers_per_block] * len(down_block_types)
270
+ if isinstance(transformer_layers_per_block, int):
271
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
272
+ if class_embeddings_concat:
273
+ blocks_time_embed_dim = time_embed_dim * 2
274
+ else:
275
+ blocks_time_embed_dim = time_embed_dim
276
+ output_channel = block_out_channels[0]
277
+ for i, down_block_type in enumerate(down_block_types):
278
+ res = 2**i
279
+ input_channel = output_channel
280
+ output_channel = block_out_channels[i]
281
+ is_final_block = i == len(block_out_channels) - 1
282
+ down_block = get_down_block(
283
+ down_block_type,
284
+ num_layers=layers_per_block[i],
285
+ transformer_layers_per_block=transformer_layers_per_block[i],
286
+ in_channels=input_channel,
287
+ out_channels=output_channel,
288
+ temb_channels=blocks_time_embed_dim,
289
+ add_downsample=not is_final_block,
290
+ resnet_eps=norm_eps,
291
+ resnet_act_fn=act_fn,
292
+ resnet_groups=norm_num_groups,
293
+ cross_attention_dim=cross_attention_dim[i],
294
+ num_attention_heads=num_attention_heads[i],
295
+ downsample_padding=downsample_padding,
296
+ dual_cross_attention=dual_cross_attention,
297
+ use_linear_projection=use_linear_projection,
298
+ only_cross_attention=only_cross_attention[i],
299
+ upcast_attention=upcast_attention,
300
+ resnet_time_scale_shift=resnet_time_scale_shift,
301
+ resnet_skip_time_act=resnet_skip_time_act,
302
+ resnet_out_scale_factor=resnet_out_scale_factor,
303
+ cross_attention_norm=cross_attention_norm,
304
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
305
+ )
306
+ self.down_blocks.append(down_block)
307
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
308
+ self.mid_block = UNetMidBlock3DCrossAttn(
309
+ transformer_layers_per_block=transformer_layers_per_block[-1],
310
+ in_channels=block_out_channels[-1],
311
+ temb_channels=blocks_time_embed_dim,
312
+ resnet_eps=norm_eps,
313
+ resnet_act_fn=act_fn,
314
+ output_scale_factor=mid_block_scale_factor,
315
+ resnet_time_scale_shift=resnet_time_scale_shift,
316
+ cross_attention_dim=cross_attention_dim[-1],
317
+ num_attention_heads=num_attention_heads[-1],
318
+ resnet_groups=norm_num_groups,
319
+ dual_cross_attention=dual_cross_attention,
320
+ use_linear_projection=use_linear_projection,
321
+ upcast_attention=upcast_attention,
322
+ )
323
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
324
+ raise ValueError("UNetMidBlock2DSimpleCrossAttn not supported")
325
+ elif mid_block_type is None:
326
+ self.mid_block = None
327
+ else:
328
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
329
+ self.num_upsamplers = 0
330
+ reversed_block_out_channels = list(reversed(block_out_channels))
331
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
332
+ reversed_layers_per_block = list(reversed(layers_per_block))
333
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
334
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
335
+ only_cross_attention = list(reversed(only_cross_attention))
336
+ output_channel = reversed_block_out_channels[0]
337
+ for i, up_block_type in enumerate(up_block_types):
338
+ res = 2 ** (len(up_block_types) - 1 - i) # noqa: *
339
+ is_final_block = i == len(block_out_channels) - 1
340
+ prev_output_channel = output_channel
341
+ output_channel = reversed_block_out_channels[i]
342
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
343
+ if not is_final_block:
344
+ add_upsample = True
345
+ self.num_upsamplers += 1
346
+ else:
347
+ add_upsample = False
348
+ up_block = get_up_block(
349
+ up_block_type,
350
+ num_layers=reversed_layers_per_block[i] + 1,
351
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
352
+ in_channels=input_channel,
353
+ out_channels=output_channel,
354
+ prev_output_channel=prev_output_channel,
355
+ temb_channels=blocks_time_embed_dim,
356
+ add_upsample=add_upsample,
357
+ resnet_eps=norm_eps,
358
+ resnet_act_fn=act_fn,
359
+ resnet_groups=norm_num_groups,
360
+ cross_attention_dim=reversed_cross_attention_dim[i],
361
+ num_attention_heads=reversed_num_attention_heads[i],
362
+ dual_cross_attention=dual_cross_attention,
363
+ use_linear_projection=use_linear_projection,
364
+ only_cross_attention=only_cross_attention[i],
365
+ upcast_attention=upcast_attention,
366
+ resnet_time_scale_shift=resnet_time_scale_shift,
367
+ resnet_skip_time_act=resnet_skip_time_act,
368
+ resnet_out_scale_factor=resnet_out_scale_factor,
369
+ cross_attention_norm=cross_attention_norm,
370
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
371
+ )
372
+ self.up_blocks.append(up_block)
373
+ prev_output_channel = output_channel
374
+ if norm_num_groups is not None:
375
+ self.conv_norm_out = paddle.nn.GroupNorm(
376
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
377
+ )
378
+ self.conv_act = ppdiffusers.models.activations.get_activation(act_fn)
379
+ else:
380
+ self.conv_norm_out = None
381
+ self.conv_act = None
382
+ conv_out_padding = (conv_out_kernel - 1) // 2
383
+ self.conv_out = Conv3d(
384
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
385
+ )
386
+
387
+ def temporal_parameters(self) -> list:
388
+ output = []
389
+ all_blocks = list(self.down_blocks) + list(self.up_blocks) + [self.mid_block]
390
+ for block in all_blocks:
391
+ output.extend(block.temporal_parameters())
392
+ return output
393
+
394
+ @property
395
+ def attn_processors(self) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]:
396
+ return self.get_attn_processors(include_temporal_layers=False)
397
+
398
+ def get_attn_processors(
399
+ self, include_temporal_layers=True
400
+ ) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]:
401
+ """
402
+ Returns:
403
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
+ indexed by their weight names.
405
+ """
406
+ processors = {}
407
+
408
+ def fn_recursive_add_processors(
409
+ name: str,
410
+ module: paddle.nn.Layer,
411
+ processors: Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor],
412
+ ):
413
+ if not include_temporal_layers:
414
+ if "temporal" in name:
415
+ return processors
416
+ if hasattr(module, "set_processor"):
417
+ processors[f"{name}.processor"] = module.processor
418
+ for sub_name, child in module.named_children():
419
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
420
+ return processors
421
+
422
+ for name, module in self.named_children():
423
+ fn_recursive_add_processors(name, module, processors)
424
+ return processors
425
+
426
+ def set_attn_processor(
427
+ self,
428
+ processor: Union[
429
+ ppdiffusers.models.attention_processor.AttentionProcessor,
430
+ Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor],
431
+ ],
432
+ include_temporal_layers=False,
433
+ ):
434
+ """
435
+ Sets the attention processor to use to compute attention.
436
+
437
+ Parameters:
438
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
439
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
440
+ for **all** `Attention` layers.
441
+
442
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
443
+ processor. This is strongly recommended when setting trainable attention processors.
444
+
445
+ """
446
+ count = len(self.get_attn_processors(include_temporal_layers=include_temporal_layers).keys())
447
+ if isinstance(processor, dict) and len(processor) != count:
448
+ raise ValueError(
449
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the number of attention layers: {count}. Please make sure to pass {count} processor classes."
450
+ )
451
+
452
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
453
+ if not include_temporal_layers:
454
+ if "temporal" in name:
455
+ return
456
+ if hasattr(module, "set_processor"):
457
+ if not isinstance(processor, dict):
458
+ module.set_processor(processor)
459
+ else:
460
+ module.set_processor(processor.pop(f"{name}.processor"))
461
+ for sub_name, child in module.named_children():
462
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
463
+
464
+ for name, module in self.named_children():
465
+ fn_recursive_attn_processor(name, module, processor)
466
+
467
+ def set_default_attn_processor(self):
468
+ """
469
+ Disables custom attention processors and sets the default attention implementation.
470
+ """
471
+ self.set_attn_processor(ppdiffusers.models.attention_processor.AttnProcessor())
472
+
473
+ def set_attention_slice(self, slice_size):
474
+ """
475
+ Enable sliced attention computation.
476
+
477
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
478
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
479
+
480
+ Args:
481
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
482
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
483
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
484
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
485
+ must be a multiple of `slice_size`.
486
+ """
487
+ sliceable_head_dims = []
488
+
489
+ def fn_recursive_retrieve_sliceable_dims(module: paddle.nn.Layer):
490
+ if hasattr(module, "set_attention_slice"):
491
+ sliceable_head_dims.append(module.sliceable_head_dim)
492
+ for child in module.children():
493
+ fn_recursive_retrieve_sliceable_dims(child)
494
+
495
+ for module in self.children():
496
+ fn_recursive_retrieve_sliceable_dims(module)
497
+ num_sliceable_layers = len(sliceable_head_dims)
498
+ if slice_size == "auto":
499
+ slice_size = [(dim // 2) for dim in sliceable_head_dims]
500
+ elif slice_size == "max":
501
+ slice_size = num_sliceable_layers * [1]
502
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
503
+ if len(slice_size) != len(sliceable_head_dims):
504
+ raise ValueError(
505
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
506
+ )
507
+ for i in range(len(slice_size)):
508
+ size = slice_size[i]
509
+ dim = sliceable_head_dims[i]
510
+ if size is not None and size > dim:
511
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
512
+
513
+ def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]):
514
+ if hasattr(module, "set_attention_slice"):
515
+ module.set_attention_slice(slice_size.pop())
516
+ for child in module.children():
517
+ fn_recursive_set_attention_slice(child, slice_size)
518
+
519
+ reversed_slice_size = list(reversed(slice_size))
520
+ for module in self.children():
521
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
522
+
523
+ def _set_gradient_checkpointing(self, module, value=False):
524
+ if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
525
+ module.gradient_checkpointing = value
526
+
527
+ def forward(
528
+ self,
529
+ sample: paddle.Tensor,
530
+ timestep: Union[paddle.Tensor, float, int],
531
+ encoder_hidden_states: paddle.Tensor,
532
+ class_labels: Optional[paddle.Tensor] = None,
533
+ timestep_cond: Optional[paddle.Tensor] = None,
534
+ attention_mask: Optional[paddle.Tensor] = None,
535
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
536
+ added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
537
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
538
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
539
+ encoder_attention_mask: Optional[paddle.Tensor] = None,
540
+ return_dict: bool = True,
541
+ enable_temporal_attentions: bool = True,
542
+ ) -> Union[UNet3DConditionOutput, Tuple]:
543
+ """
544
+ The [`UNet3DConditionModel`] forward method.
545
+
546
+ Args:
547
+ sample (`paddle.FloatTensor`):
548
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
549
+ timestep (`paddle.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
550
+ encoder_hidden_states (`paddle.FloatTensor`):
551
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
552
+ encoder_attention_mask (`paddle.Tensor`):
553
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
554
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
555
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
556
+ return_dict (`bool`, *optional*, defaults to `True`):
557
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
558
+ tuple.
559
+ cross_attention_kwargs (`dict`, *optional*):
560
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
561
+ added_cond_kwargs: (`dict`, *optional*):
562
+ A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
563
+ are passed along to the UNet blocks.
564
+
565
+ Returns:
566
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
567
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
568
+ a `tuple` is returned where the first element is the sample tensor.
569
+ """
570
+ default_overall_up_factor = 2**self.num_upsamplers
571
+ forward_upsample_size = False
572
+ upsample_size = None
573
+ if any(s % default_overall_up_factor != 0 for s in tuple(sample.shape)[-2:]):
574
+ logger.info("Forward upsample size to force interpolation output size.")
575
+ forward_upsample_size = True
576
+ if attention_mask is not None:
577
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
578
+ attention_mask = attention_mask.unsqueeze(axis=1)
579
+ if encoder_attention_mask is not None:
580
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
581
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(axis=1)
582
+ if self.config.center_input_sample:
583
+ sample = 2 * sample - 1.0
584
+ timesteps = timestep
585
+ if not paddle.is_tensor(x=timesteps):
+ # paddle has no MPS backend, so pick the dtype from the Python scalar type directly
+ dtype = "float64" if isinstance(timestep, float) else "int64"
+ timesteps = paddle.to_tensor(data=[timesteps], dtype=dtype, place=sample.place)
592
+ elif len(tuple(timesteps.shape)) == 0:
593
+ timesteps = timesteps[None].to(sample.place)
594
+ timesteps = timesteps.expand(shape=tuple(sample.shape)[0])
595
+ t_emb = self.time_proj(timesteps)
596
+ t_emb = t_emb.to(dtype=sample.dtype)
597
+ emb = self.time_embedding(t_emb, timestep_cond)
598
+ aug_emb = None
599
+ if self.class_embedding is not None:
600
+ if class_labels is None:
601
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
602
+ if self.config.class_embed_type == "timestep":
603
+ class_labels = self.time_proj(class_labels)
604
+ class_labels = class_labels.to(dtype=sample.dtype)
605
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
606
+ if self.config.class_embeddings_concat:
607
+ emb = paddle.concat(x=[emb, class_emb], axis=-1)
608
+ else:
609
+ emb = emb + class_emb
610
+ if self.config.addition_embed_type == "text":
611
+ aug_emb = self.add_embedding(encoder_hidden_states)
612
+ elif self.config.addition_embed_type == "text_image":
613
+ if "image_embeds" not in added_cond_kwargs:
614
+ raise ValueError(
615
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
616
+ )
617
+ image_embs = added_cond_kwargs.get("image_embeds")
618
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
619
+ aug_emb = self.add_embedding(text_embs, image_embs)
620
+ elif self.config.addition_embed_type == "text_time":
621
+ if "text_embeds" not in added_cond_kwargs:
622
+ raise ValueError(
623
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
624
+ )
625
+ text_embeds = added_cond_kwargs.get("text_embeds")
626
+ if "time_ids" not in added_cond_kwargs:
627
+ raise ValueError(
628
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
629
+ )
630
+ time_ids = added_cond_kwargs.get("time_ids")
631
+ time_embeds = self.add_time_proj(time_ids.flatten())
632
+ time_embeds = time_embeds.reshape((tuple(text_embeds.shape)[0], -1))
633
+ add_embeds = paddle.concat(x=[text_embeds, time_embeds], axis=-1)
634
+ add_embeds = add_embeds.to(emb.dtype)
635
+ aug_emb = self.add_embedding(add_embeds)
636
+ elif self.config.addition_embed_type == "image":
637
+ if "image_embeds" not in added_cond_kwargs:
638
+ raise ValueError(
639
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
640
+ )
641
+ image_embs = added_cond_kwargs.get("image_embeds")
642
+ aug_emb = self.add_embedding(image_embs)
643
+ elif self.config.addition_embed_type == "image_hint":
644
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
645
+ raise ValueError(
646
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
647
+ )
648
+ image_embs = added_cond_kwargs.get("image_embeds")
649
+ hint = added_cond_kwargs.get("hint")
650
+ aug_emb, hint = self.add_embedding(image_embs, hint)
651
+ sample = paddle.concat(x=[sample, hint], axis=1)
652
+ emb = emb + aug_emb if aug_emb is not None else emb
653
+ if self.time_embed_act is not None:
654
+ emb = self.time_embed_act(emb)
655
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
656
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
657
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
658
+ if "image_embeds" not in added_cond_kwargs:
659
+ raise ValueError(
660
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
661
+ )
662
+ image_embeds = added_cond_kwargs.get("image_embeds")
663
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
664
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
665
+ if "image_embeds" not in added_cond_kwargs:
666
+ raise ValueError(
667
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
668
+ )
669
+ image_embeds = added_cond_kwargs.get("image_embeds")
670
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
671
+ sample = self.conv_in(sample)
672
+ down_block_res_samples = (sample,)
673
+ for downsample_block in self.down_blocks:
674
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
675
+ sample, res_samples = downsample_block(
676
+ hidden_states=sample,
677
+ temb=emb,
678
+ encoder_hidden_states=encoder_hidden_states,
679
+ attention_mask=attention_mask,
680
+ cross_attention_kwargs=cross_attention_kwargs,
681
+ enable_temporal_attentions=enable_temporal_attentions,
682
+ )
683
+ else:
684
+ sample, res_samples = downsample_block(
685
+ hidden_states=sample,
686
+ temb=emb,
687
+ encoder_hidden_states=encoder_hidden_states,
688
+ enable_temporal_attentions=enable_temporal_attentions,
689
+ )
690
+ down_block_res_samples += res_samples
691
+ if down_block_additional_residuals is not None:
692
+ new_down_block_res_samples = ()
693
+ for down_block_res_sample, down_block_additional_residual in zip(
694
+ down_block_res_samples, down_block_additional_residuals
695
+ ):
696
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
697
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
698
+ down_block_res_samples = new_down_block_res_samples
699
+ if self.mid_block is not None:
700
+ sample = self.mid_block(
701
+ sample,
702
+ emb,
703
+ encoder_hidden_states=encoder_hidden_states,
704
+ attention_mask=attention_mask,
705
+ cross_attention_kwargs=cross_attention_kwargs,
706
+ enable_temporal_attentions=enable_temporal_attentions,
707
+ )
708
+ if mid_block_additional_residual is not None:
709
+ sample = sample + mid_block_additional_residual
710
+ for i, upsample_block in enumerate(self.up_blocks):
711
+ is_final_block = i == len(self.up_blocks) - 1
712
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
713
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
714
+ if not is_final_block and forward_upsample_size:
715
+ upsample_size = tuple(down_block_res_samples[-1].shape)[2:]
716
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
717
+ sample = upsample_block(
718
+ hidden_states=sample,
719
+ temb=emb,
720
+ res_hidden_states_tuple=res_samples,
721
+ encoder_hidden_states=encoder_hidden_states,
722
+ cross_attention_kwargs=cross_attention_kwargs,
723
+ upsample_size=upsample_size,
724
+ attention_mask=attention_mask,
725
+ enable_temporal_attentions=enable_temporal_attentions,
726
+ )
727
+ else:
728
+ sample = upsample_block(
729
+ hidden_states=sample,
730
+ temb=emb,
731
+ res_hidden_states_tuple=res_samples,
732
+ upsample_size=upsample_size,
733
+ encoder_hidden_states=encoder_hidden_states,
734
+ enable_temporal_attentions=enable_temporal_attentions,
735
+ )
736
+ if self.conv_norm_out:
737
+ sample = self.conv_norm_out(sample)
738
+ sample = self.conv_act(sample)
739
+ sample = self.conv_out(sample)
740
+ if not return_dict:
741
+ return (sample,)
742
+ return UNet3DConditionOutput(sample=sample)
743
+
744
+ @classmethod
745
+ def from_pretrained_spatial(cls, pretrained_model_path, subfolder=None):
746
+ import json
747
+
748
+ if subfolder is not None:
749
+ pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
750
+ config_file = os.path.join(pretrained_model_path, "config.json")
751
+ with open(config_file, "r") as f:
752
+ config = json.load(f)
753
+ config["_class_name"] = "UNet3DConditionModel"
754
+ config["down_block_types"] = ["DownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D"]
755
+ config["up_block_types"] = ["CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "UpBlock3D"]
756
+ config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
757
+ model = cls.from_config(config)
758
+ model_files = [
759
+ os.path.join(pretrained_model_path, "diffusion_paddle_model.bin"),
760
+ os.path.join(pretrained_model_path, "diffusion_paddle_model.safetensors"),
761
+ ]
762
+ model_file = None
763
+ for fp in model_files:
764
+ if os.path.exists(fp):
765
+ model_file = fp
766
+ if not model_file:
767
+ raise RuntimeError(f"{model_file} does not exist")
768
+ if model_file.split(".")[-1] == "safetensors":
769
+ from safetensors import safe_open
770
+
771
+ state_dict = {}
772
+ with safe_open(model_file, framework="pt", device="cuda") as f:
773
+ for key in f.keys():
774
+ state_dict[key] = f.get_tensor(key)
775
+ else:
776
+ state_dict = paddle.load(path=model_file)
777
+ model.set_state_dict(state_dict=state_dict, use_structured_name=False)
778
+ return model
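A minimal sketch of loading spatial (2D) weights into this 3D UNet through `from_pretrained_spatial`. The checkpoint directory is a hypothetical placeholder; per the method above it must contain a `config.json` plus either `diffusion_paddle_model.bin` or `diffusion_paddle_model.safetensors`.

from ppdiffusers.models.hotshot_xl.unet import UNet3DConditionModel  # module path assumed from this diff

# Hypothetical local checkpoint laid out like an SDXL UNet export for Paddle.
unet3d = UNet3DConditionModel.from_pretrained_spatial(
    "./hotshot-xl-checkpoint",  # placeholder directory
    subfolder="unet",
)
unet3d.eval()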
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py ADDED
@@ -0,0 +1,717 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ from paddle.distributed.fleet.utils import recompute
17
+
18
+ from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
19
+ from .transformer_3d import Transformer3DModel
20
+ from .transformer_temporal import TransformerTemporal
21
+
22
+
23
+ def get_down_block(
24
+ down_block_type,
25
+ num_layers,
26
+ in_channels,
27
+ out_channels,
28
+ temb_channels,
29
+ add_downsample,
30
+ resnet_eps,
31
+ resnet_act_fn,
32
+ transformer_layers_per_block=1,
33
+ num_attention_heads=None,
34
+ resnet_groups=None,
35
+ cross_attention_dim=None,
36
+ downsample_padding=None,
37
+ dual_cross_attention=False,
38
+ use_linear_projection=False,
39
+ only_cross_attention=False,
40
+ upcast_attention=False,
41
+ resnet_time_scale_shift="default",
42
+ resnet_skip_time_act=False,
43
+ resnet_out_scale_factor=1.0,
44
+ cross_attention_norm=None,
45
+ attention_head_dim=None,
46
+ downsample_type=None,
47
+ ):
48
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
49
+ if down_block_type == "DownBlock3D":
50
+ return DownBlock3D(
51
+ num_layers=num_layers,
52
+ in_channels=in_channels,
53
+ out_channels=out_channels,
54
+ temb_channels=temb_channels,
55
+ add_downsample=add_downsample,
56
+ resnet_eps=resnet_eps,
57
+ resnet_act_fn=resnet_act_fn,
58
+ resnet_groups=resnet_groups,
59
+ downsample_padding=downsample_padding,
60
+ resnet_time_scale_shift=resnet_time_scale_shift,
61
+ )
62
+ elif down_block_type == "CrossAttnDownBlock3D":
63
+ if cross_attention_dim is None:
64
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
65
+ return CrossAttnDownBlock3D(
66
+ num_layers=num_layers,
67
+ in_channels=in_channels,
68
+ out_channels=out_channels,
69
+ transformer_layers_per_block=transformer_layers_per_block,
70
+ temb_channels=temb_channels,
71
+ add_downsample=add_downsample,
72
+ resnet_eps=resnet_eps,
73
+ resnet_act_fn=resnet_act_fn,
74
+ resnet_groups=resnet_groups,
75
+ downsample_padding=downsample_padding,
76
+ cross_attention_dim=cross_attention_dim,
77
+ num_attention_heads=num_attention_heads,
78
+ dual_cross_attention=dual_cross_attention,
79
+ use_linear_projection=use_linear_projection,
80
+ only_cross_attention=only_cross_attention,
81
+ upcast_attention=upcast_attention,
82
+ resnet_time_scale_shift=resnet_time_scale_shift,
83
+ )
84
+ raise ValueError(f"{down_block_type} does not exist.")
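A small sketch of how the factory above might be called to build a single block in isolation. The channel and group sizes are illustrative, and it assumes the `DownBlock3D` defined later in this file accepts the forwarded arguments in the same way the 2D blocks do.

# Illustrative sizes only; a real UNet3DConditionModel derives these from its config.
down_block = get_down_block(
    "DownBlock3D",
    num_layers=2,
    in_channels=320,
    out_channels=320,
    temb_channels=1280,
    add_downsample=True,
    resnet_eps=1e-5,
    resnet_act_fn="silu",
    resnet_groups=32,
    downsample_padding=1,
)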
85
+
86
+
87
+ def get_up_block(
88
+ up_block_type,
89
+ num_layers,
90
+ in_channels,
91
+ out_channels,
92
+ prev_output_channel,
93
+ temb_channels,
94
+ add_upsample,
95
+ resnet_eps,
96
+ resnet_act_fn,
97
+ transformer_layers_per_block=1,
98
+ num_attention_heads=None,
99
+ resnet_groups=None,
100
+ cross_attention_dim=None,
101
+ dual_cross_attention=False,
102
+ use_linear_projection=False,
103
+ only_cross_attention=False,
104
+ upcast_attention=False,
105
+ resnet_time_scale_shift="default",
106
+ resnet_skip_time_act=False,
107
+ resnet_out_scale_factor=1.0,
108
+ cross_attention_norm=None,
109
+ attention_head_dim=None,
110
+ upsample_type=None,
111
+ ):
112
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
113
+ if up_block_type == "UpBlock3D":
114
+ return UpBlock3D(
115
+ num_layers=num_layers,
116
+ in_channels=in_channels,
117
+ out_channels=out_channels,
118
+ prev_output_channel=prev_output_channel,
119
+ temb_channels=temb_channels,
120
+ add_upsample=add_upsample,
121
+ resnet_eps=resnet_eps,
122
+ resnet_act_fn=resnet_act_fn,
123
+ resnet_groups=resnet_groups,
124
+ resnet_time_scale_shift=resnet_time_scale_shift,
125
+ )
126
+ elif up_block_type == "CrossAttnUpBlock3D":
127
+ if cross_attention_dim is None:
128
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
129
+ return CrossAttnUpBlock3D(
130
+ num_layers=num_layers,
131
+ in_channels=in_channels,
132
+ transformer_layers_per_block=transformer_layers_per_block,
133
+ out_channels=out_channels,
134
+ prev_output_channel=prev_output_channel,
135
+ temb_channels=temb_channels,
136
+ add_upsample=add_upsample,
137
+ resnet_eps=resnet_eps,
138
+ resnet_act_fn=resnet_act_fn,
139
+ resnet_groups=resnet_groups,
140
+ cross_attention_dim=cross_attention_dim,
141
+ num_attention_heads=num_attention_heads,
142
+ dual_cross_attention=dual_cross_attention,
143
+ use_linear_projection=use_linear_projection,
144
+ only_cross_attention=only_cross_attention,
145
+ upcast_attention=upcast_attention,
146
+ resnet_time_scale_shift=resnet_time_scale_shift,
147
+ )
148
+ raise ValueError(f"{up_block_type} does not exist.")
149
+
150
+
151
+ class UNetMidBlock3DCrossAttn(paddle.nn.Layer):
152
+ def __init__(
153
+ self,
154
+ in_channels: int,
155
+ temb_channels: int,
156
+ dropout: float = 0.0,
157
+ num_layers: int = 1,
158
+ transformer_layers_per_block: int = 1,
159
+ resnet_eps: float = 1e-06,
160
+ resnet_time_scale_shift: str = "default",
161
+ resnet_act_fn: str = "swish",
162
+ resnet_groups: int = 32,
163
+ resnet_pre_norm: bool = True,
164
+ num_attention_heads=1,
165
+ output_scale_factor=1.0,
166
+ cross_attention_dim=1280,
167
+ dual_cross_attention=False,
168
+ use_linear_projection=False,
169
+ upcast_attention=False,
170
+ ):
171
+ super().__init__()
172
+ self.has_cross_attention = True
173
+ self.num_attention_heads = num_attention_heads
174
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
175
+ resnets = [
176
+ ResnetBlock3D(
177
+ in_channels=in_channels,
178
+ out_channels=in_channels,
179
+ temb_channels=temb_channels,
180
+ eps=resnet_eps,
181
+ groups=resnet_groups,
182
+ dropout=dropout,
183
+ time_embedding_norm=resnet_time_scale_shift,
184
+ non_linearity=resnet_act_fn,
185
+ output_scale_factor=output_scale_factor,
186
+ pre_norm=resnet_pre_norm,
187
+ )
188
+ ]
189
+ attentions = []
190
+ for _ in range(num_layers):
191
+ if dual_cross_attention:
192
+ raise NotImplementedError
193
+ attentions.append(
194
+ Transformer3DModel(
195
+ num_attention_heads,
196
+ in_channels // num_attention_heads,
197
+ in_channels=in_channels,
198
+ num_layers=transformer_layers_per_block,
199
+ cross_attention_dim=cross_attention_dim,
200
+ norm_num_groups=resnet_groups,
201
+ use_linear_projection=use_linear_projection,
202
+ upcast_attention=upcast_attention,
203
+ )
204
+ )
205
+ resnets.append(
206
+ ResnetBlock3D(
207
+ in_channels=in_channels,
208
+ out_channels=in_channels,
209
+ temb_channels=temb_channels,
210
+ eps=resnet_eps,
211
+ groups=resnet_groups,
212
+ dropout=dropout,
213
+ time_embedding_norm=resnet_time_scale_shift,
214
+ non_linearity=resnet_act_fn,
215
+ output_scale_factor=output_scale_factor,
216
+ pre_norm=resnet_pre_norm,
217
+ )
218
+ )
219
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
220
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
221
+
222
+ def forward(
223
+ self,
224
+ hidden_states,
225
+ temb=None,
226
+ encoder_hidden_states=None,
227
+ attention_mask=None,
228
+ cross_attention_kwargs=None,
229
+ enable_temporal_attentions: bool = True,
230
+ ):
231
+ hidden_states = self.resnets[0](hidden_states, temb)
232
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
233
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
234
+ hidden_states = resnet(hidden_states, temb)
235
+ return hidden_states
236
+
237
+ def temporal_parameters(self) -> list:
238
+ return []
239
+
240
+
241
+ class CrossAttnDownBlock3D(paddle.nn.Layer):
242
+ def __init__(
243
+ self,
244
+ in_channels: int,
245
+ out_channels: int,
246
+ temb_channels: int,
247
+ dropout: float = 0.0,
248
+ num_layers: int = 1,
249
+ transformer_layers_per_block: int = 1,
250
+ resnet_eps: float = 1e-06,
251
+ resnet_time_scale_shift: str = "default",
252
+ resnet_act_fn: str = "swish",
253
+ resnet_groups: int = 32,
254
+ resnet_pre_norm: bool = True,
255
+ num_attention_heads=1,
256
+ cross_attention_dim=1280,
257
+ output_scale_factor=1.0,
258
+ downsample_padding=1,
259
+ add_downsample=True,
260
+ dual_cross_attention=False,
261
+ use_linear_projection=False,
262
+ only_cross_attention=False,
263
+ upcast_attention=False,
264
+ ):
265
+ super().__init__()
266
+ resnets = []
267
+ attentions = []
268
+ temporal_attentions = []
269
+ self.has_cross_attention = True
270
+ self.num_attention_heads = num_attention_heads
271
+ for i in range(num_layers):
272
+ in_channels = in_channels if i == 0 else out_channels
273
+ resnets.append(
274
+ ResnetBlock3D(
275
+ in_channels=in_channels,
276
+ out_channels=out_channels,
277
+ temb_channels=temb_channels,
278
+ eps=resnet_eps,
279
+ groups=resnet_groups,
280
+ dropout=dropout,
281
+ time_embedding_norm=resnet_time_scale_shift,
282
+ non_linearity=resnet_act_fn,
283
+ output_scale_factor=output_scale_factor,
284
+ pre_norm=resnet_pre_norm,
285
+ )
286
+ )
287
+ if dual_cross_attention:
288
+ raise NotImplementedError
289
+ attentions.append(
290
+ Transformer3DModel(
291
+ num_attention_heads,
292
+ out_channels // num_attention_heads,
293
+ in_channels=out_channels,
294
+ num_layers=transformer_layers_per_block,
295
+ cross_attention_dim=cross_attention_dim,
296
+ norm_num_groups=resnet_groups,
297
+ use_linear_projection=use_linear_projection,
298
+ only_cross_attention=only_cross_attention,
299
+ upcast_attention=upcast_attention,
300
+ )
301
+ )
302
+ temporal_attentions.append(
303
+ TransformerTemporal(
304
+ num_attention_heads=8,
305
+ attention_head_dim=out_channels // 8,
306
+ in_channels=out_channels,
307
+ cross_attention_dim=None,
308
+ )
309
+ )
310
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
311
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
312
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
313
+ if add_downsample:
314
+ self.downsamplers = paddle.nn.LayerList(
315
+ sublayers=[
316
+ Downsample3D(
317
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
318
+ )
319
+ ]
320
+ )
321
+ else:
322
+ self.downsamplers = None
323
+ self.gradient_checkpointing = False
324
+
325
+ def forward(
326
+ self,
327
+ hidden_states,
328
+ temb=None,
329
+ encoder_hidden_states=None,
330
+ attention_mask=None,
331
+ cross_attention_kwargs=None,
332
+ enable_temporal_attentions: bool = True,
333
+ ):
334
+ output_states = ()
335
+ for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions):
336
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
337
+
338
+ def create_custom_forward(module, return_dict=None):
339
+ def custom_forward(*inputs):
340
+ if return_dict is not None:
341
+ return module(*inputs, return_dict=return_dict)
342
+ else:
343
+ return module(*inputs)
344
+
345
+ return custom_forward
346
+
347
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
348
+ hidden_states = recompute(
349
+ create_custom_forward(attn, return_dict=False),
350
+ hidden_states,
351
+ encoder_hidden_states,
352
+ use_reentrant=False,
353
+ )[0]
354
+ if enable_temporal_attentions and temporal_attention is not None:
355
+ hidden_states = recompute(
356
+ create_custom_forward(temporal_attention),
357
+ hidden_states,
358
+ encoder_hidden_states,
359
+ use_reentrant=False,
360
+ )
361
+ else:
362
+ hidden_states = resnet(hidden_states, temb)
363
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
364
+ if temporal_attention and enable_temporal_attentions:
365
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
366
+ output_states += (hidden_states,)
367
+ if self.downsamplers is not None:
368
+ for downsampler in self.downsamplers:
369
+ hidden_states = downsampler(hidden_states)
370
+ output_states += (hidden_states,)
371
+ return hidden_states, output_states
372
+
373
+ def temporal_parameters(self) -> list:
374
+ output = []
375
+ for block in self.temporal_attentions:
376
+ if block:
377
+ output.extend(block.parameters())
378
+ return output
379
+
380
+
381
+ class DownBlock3D(paddle.nn.Layer):
382
+ def __init__(
383
+ self,
384
+ in_channels: int,
385
+ out_channels: int,
386
+ temb_channels: int,
387
+ dropout: float = 0.0,
388
+ num_layers: int = 1,
389
+ resnet_eps: float = 1e-06,
390
+ resnet_time_scale_shift: str = "default",
391
+ resnet_act_fn: str = "swish",
392
+ resnet_groups: int = 32,
393
+ resnet_pre_norm: bool = True,
394
+ output_scale_factor=1.0,
395
+ add_downsample=True,
396
+ downsample_padding=1,
397
+ ):
398
+ super().__init__()
399
+ resnets = []
400
+ temporal_attentions = []
401
+ for i in range(num_layers):
402
+ in_channels = in_channels if i == 0 else out_channels
403
+ resnets.append(
404
+ ResnetBlock3D(
405
+ in_channels=in_channels,
406
+ out_channels=out_channels,
407
+ temb_channels=temb_channels,
408
+ eps=resnet_eps,
409
+ groups=resnet_groups,
410
+ dropout=dropout,
411
+ time_embedding_norm=resnet_time_scale_shift,
412
+ non_linearity=resnet_act_fn,
413
+ output_scale_factor=output_scale_factor,
414
+ pre_norm=resnet_pre_norm,
415
+ )
416
+ )
417
+ temporal_attentions.append(
418
+ TransformerTemporal(
419
+ num_attention_heads=8,
420
+ attention_head_dim=out_channels // 8,
421
+ in_channels=out_channels,
422
+ cross_attention_dim=None,
423
+ )
424
+ )
425
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
426
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
427
+ if add_downsample:
428
+ self.downsamplers = paddle.nn.LayerList(
429
+ sublayers=[
430
+ Downsample3D(
431
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
432
+ )
433
+ ]
434
+ )
435
+ else:
436
+ self.downsamplers = None
437
+ self.gradient_checkpointing = False
438
+
439
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, enable_temporal_attentions: bool = True):
440
+ output_states = ()
441
+ for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions):
442
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
443
+
444
+ def create_custom_forward(module):
445
+ def custom_forward(*inputs):
446
+ return module(*inputs)
447
+
448
+ return custom_forward
449
+
450
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
451
+ if enable_temporal_attentions and temporal_attention is not None:
452
+ hidden_states = recompute(
453
+ create_custom_forward(temporal_attention),
454
+ hidden_states,
455
+ encoder_hidden_states,
456
+ use_reentrant=False,
457
+ )
458
+ else:
459
+ hidden_states = resnet(hidden_states, temb)
460
+ if enable_temporal_attentions and temporal_attention:
461
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
462
+ output_states += (hidden_states,)
463
+ if self.downsamplers is not None:
464
+ for downsampler in self.downsamplers:
465
+ hidden_states = downsampler(hidden_states)
466
+ output_states += (hidden_states,)
467
+ return hidden_states, output_states
468
+
469
+ def temporal_parameters(self) -> list:
470
+ output = []
471
+ for block in self.temporal_attentions:
472
+ if block:
473
+ output.extend(block.parameters())
474
+ return output
475
+
476
+
477
+ class CrossAttnUpBlock3D(paddle.nn.Layer):
478
+ def __init__(
479
+ self,
480
+ in_channels: int,
481
+ out_channels: int,
482
+ prev_output_channel: int,
483
+ temb_channels: int,
484
+ dropout: float = 0.0,
485
+ num_layers: int = 1,
486
+ transformer_layers_per_block: int = 1,
487
+ resnet_eps: float = 1e-06,
488
+ resnet_time_scale_shift: str = "default",
489
+ resnet_act_fn: str = "swish",
490
+ resnet_groups: int = 32,
491
+ resnet_pre_norm: bool = True,
492
+ num_attention_heads=1,
493
+ cross_attention_dim=1280,
494
+ output_scale_factor=1.0,
495
+ add_upsample=True,
496
+ dual_cross_attention=False,
497
+ use_linear_projection=False,
498
+ only_cross_attention=False,
499
+ upcast_attention=False,
500
+ ):
501
+ super().__init__()
502
+ resnets = []
503
+ attentions = []
504
+ temporal_attentions = []
505
+ self.has_cross_attention = True
506
+ self.num_attention_heads = num_attention_heads
507
+ for i in range(num_layers):
508
+ res_skip_channels = in_channels if i == num_layers - 1 else out_channels
509
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
510
+ resnets.append(
511
+ ResnetBlock3D(
512
+ in_channels=resnet_in_channels + res_skip_channels,
513
+ out_channels=out_channels,
514
+ temb_channels=temb_channels,
515
+ eps=resnet_eps,
516
+ groups=resnet_groups,
517
+ dropout=dropout,
518
+ time_embedding_norm=resnet_time_scale_shift,
519
+ non_linearity=resnet_act_fn,
520
+ output_scale_factor=output_scale_factor,
521
+ pre_norm=resnet_pre_norm,
522
+ )
523
+ )
524
+ if dual_cross_attention:
525
+ raise NotImplementedError
526
+ attentions.append(
527
+ Transformer3DModel(
528
+ num_attention_heads,
529
+ out_channels // num_attention_heads,
530
+ in_channels=out_channels,
531
+ num_layers=transformer_layers_per_block,
532
+ cross_attention_dim=cross_attention_dim,
533
+ norm_num_groups=resnet_groups,
534
+ use_linear_projection=use_linear_projection,
535
+ only_cross_attention=only_cross_attention,
536
+ upcast_attention=upcast_attention,
537
+ )
538
+ )
539
+ temporal_attentions.append(
540
+ TransformerTemporal(
541
+ num_attention_heads=8,
542
+ attention_head_dim=out_channels // 8,
543
+ in_channels=out_channels,
544
+ cross_attention_dim=None,
545
+ )
546
+ )
547
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
548
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
549
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
550
+ if add_upsample:
551
+ self.upsamplers = paddle.nn.LayerList(
552
+ sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
553
+ )
554
+ else:
555
+ self.upsamplers = None
556
+ self.gradient_checkpointing = False
557
+
558
+ def forward(
559
+ self,
560
+ hidden_states,
561
+ res_hidden_states_tuple,
562
+ temb=None,
563
+ encoder_hidden_states=None,
564
+ upsample_size=None,
565
+ cross_attention_kwargs=None,
566
+ attention_mask=None,
567
+ enable_temporal_attentions: bool = True,
568
+ ):
569
+ for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions):
570
+ res_hidden_states = res_hidden_states_tuple[-1]
571
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
572
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
573
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
574
+
575
+ def create_custom_forward(module, return_dict=None):
576
+ def custom_forward(*inputs):
577
+ if return_dict is not None:
578
+ return module(*inputs, return_dict=return_dict)
579
+ else:
580
+ return module(*inputs)
581
+
582
+ return custom_forward
583
+
584
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
585
+ hidden_states = recompute(
586
+ create_custom_forward(attn, return_dict=False),
587
+ hidden_states,
588
+ encoder_hidden_states,
589
+ use_reentrant=False,
590
+ )[0]
591
+ if enable_temporal_attentions and temporal_attention is not None:
592
+ hidden_states = recompute(
593
+ create_custom_forward(temporal_attention),
594
+ hidden_states,
595
+ encoder_hidden_states,
596
+ use_reentrant=False,
597
+ )
598
+ else:
599
+ hidden_states = resnet(hidden_states, temb)
600
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
601
+ if enable_temporal_attentions and temporal_attention:
602
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
603
+ if self.upsamplers is not None:
604
+ for upsampler in self.upsamplers:
605
+ hidden_states = upsampler(hidden_states, upsample_size)
606
+ return hidden_states
607
+
608
+ def temporal_parameters(self) -> list:
609
+ output = []
610
+ for block in self.temporal_attentions:
611
+ if block:
612
+ output.extend(block.parameters())
613
+ return output
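
The up block consumes the skip connections produced by the down blocks in reverse order: it pops the last entry of res_hidden_states_tuple and concatenates it on the channel axis before each resnet. A standalone sketch of just that plumbing with toy 5-D (batch, channels, frames, height, width) tensors; the channel sizes and the slicing stand-in for the resnet's channel projection are illustrative:

import paddle

skips = tuple(paddle.randn(shape=[1, 320, 2, 8, 8]) for _ in range(3))  # from the down path
hidden_states = paddle.randn(shape=[1, 320, 2, 8, 8])                   # from the previous block

res_tuple = skips
for _ in range(3):  # num_layers of the up block
    res_hidden_states = res_tuple[-1]
    res_tuple = res_tuple[:-1]
    hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)  # 640 channels
    hidden_states = hidden_states[:, :320]  # placeholder for ResnetBlock3D's projection back to 320
print(hidden_states.shape)  # [1, 320, 2, 8, 8]
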
614
+
615
+
616
+ class UpBlock3D(paddle.nn.Layer):
617
+ def __init__(
618
+ self,
619
+ in_channels: int,
620
+ prev_output_channel: int,
621
+ out_channels: int,
622
+ temb_channels: int,
623
+ dropout: float = 0.0,
624
+ num_layers: int = 1,
625
+ resnet_eps: float = 1e-06,
626
+ resnet_time_scale_shift: str = "default",
627
+ resnet_act_fn: str = "swish",
628
+ resnet_groups: int = 32,
629
+ resnet_pre_norm: bool = True,
630
+ output_scale_factor=1.0,
631
+ add_upsample=True,
632
+ ):
633
+ super().__init__()
634
+ resnets = []
635
+ temporal_attentions = []
636
+ for i in range(num_layers):
637
+ res_skip_channels = in_channels if i == num_layers - 1 else out_channels
638
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
639
+ resnets.append(
640
+ ResnetBlock3D(
641
+ in_channels=resnet_in_channels + res_skip_channels,
642
+ out_channels=out_channels,
643
+ temb_channels=temb_channels,
644
+ eps=resnet_eps,
645
+ groups=resnet_groups,
646
+ dropout=dropout,
647
+ time_embedding_norm=resnet_time_scale_shift,
648
+ non_linearity=resnet_act_fn,
649
+ output_scale_factor=output_scale_factor,
650
+ pre_norm=resnet_pre_norm,
651
+ )
652
+ )
653
+ temporal_attentions.append(
654
+ TransformerTemporal(
655
+ num_attention_heads=8,
656
+ attention_head_dim=out_channels // 8,
657
+ in_channels=out_channels,
658
+ cross_attention_dim=None,
659
+ )
660
+ )
661
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
662
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
663
+ if add_upsample:
664
+ self.upsamplers = paddle.nn.LayerList(
665
+ sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
666
+ )
667
+ else:
668
+ self.upsamplers = None
669
+ self.gradient_checkpointing = False
670
+
671
+ def forward(
672
+ self,
673
+ hidden_states,
674
+ res_hidden_states_tuple,
675
+ temb=None,
676
+ upsample_size=None,
677
+ encoder_hidden_states=None,
678
+ enable_temporal_attentions: bool = True,
679
+ ):
680
+ for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions):
681
+ res_hidden_states = res_hidden_states_tuple[-1]
682
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
683
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
684
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
685
+
686
+ def create_custom_forward(module):
687
+ def custom_forward(*inputs):
688
+ return module(*inputs)
689
+
690
+ return custom_forward
691
+
692
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
693
+ if enable_temporal_attentions and temporal_attention is not None:
694
+ hidden_states = recompute(
695
+ create_custom_forward(temporal_attention),
696
+ hidden_states,
697
+ encoder_hidden_states,
698
+ use_reentrant=False,
699
+ )
700
+ else:
701
+ hidden_states = resnet(hidden_states, temb)
702
+ hidden_states = (
703
+ temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
704
+ if enable_temporal_attentions and temporal_attention is not None
705
+ else hidden_states
706
+ )
707
+ if self.upsamplers is not None:
708
+ for upsampler in self.upsamplers:
709
+ hidden_states = upsampler(hidden_states, upsample_size)
710
+ return hidden_states
711
+
712
+ def temporal_parameters(self) -> list:
713
+ output = []
714
+ for block in self.temporal_attentions:
715
+ if block:
716
+ output.extend(block.parameters())
717
+ return output
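
Every block above exposes temporal_parameters(), which suggests the usual recipe of freezing the pretrained spatial weights and training only the temporal attention layers. A hypothetical sketch; the down_blocks/mid_block/up_blocks attribute names are assumptions about the surrounding UNet, not taken from this diff:

import paddle

def collect_temporal_parameters(unet):
    # Gather the parameters of every TransformerTemporal module in the UNet.
    params = []
    for block in list(unet.down_blocks) + [unet.mid_block] + list(unet.up_blocks):
        if hasattr(block, "temporal_parameters"):
            params.extend(block.temporal_parameters())
    return params

# Usage (commented out because it needs a constructed UNet):
# for p in unet.parameters():
#     p.stop_gradient = True            # freeze everything ...
# temporal_params = collect_temporal_parameters(unet)
# for p in temporal_params:
#     p.stop_gradient = False           # ... except the temporal layers
# opt = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=temporal_params)
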
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import sys
16
+
17
+ import paddle
18
+ import paddle_aux
19
+
20
+ import ppdiffusers
21
+
22
+ from .loss_weights import *
23
+ from .noise_conditions import *
24
+ from .samplers import *
25
+ from .scalers import *
26
+ from .schedulers import *
27
+ from .targets import *
28
+
29
+
30
+ class GDF:
31
+ def __init__(self, schedule, input_scaler, target, noise_cond, loss_weight, offset_noise=0):
32
+ self.schedule = schedule
33
+ self.input_scaler = input_scaler
34
+ self.target = target
35
+ self.noise_cond = noise_cond
36
+ self.loss_weight = loss_weight
37
+ self.offset_noise = offset_noise
38
+
39
+ def setup_limits(self, stretch_max=True, stretch_min=True, shift=1):
40
+ stretched_limits = self.input_scaler.setup_limits(
41
+ self.schedule, self.input_scaler, stretch_max, stretch_min, shift
42
+ )
43
+ return stretched_limits
44
+
45
+ def diffuse(self, x0, epsilon=None, t=None, shift=1, loss_shift=1, offset=None):
46
+ if epsilon is None:
47
+ epsilon = paddle.randn(shape=x0.shape, dtype=x0.dtype)
48
+
49
+ if self.offset_noise > 0:
50
+ if offset is None:
51
+ offset = paddle.randn(
52
+ shape=[x0.shape[0], x0.shape[1]] + [1] * (len(x0.shape) - 2),
53
+ )
54
+ epsilon = epsilon + offset * self.offset_noise
55
+ logSNR = self.schedule(x0.shape[0] if t is None else t, shift=shift)
56
+ a, b = self.input_scaler(logSNR)
57
+ if len(a.shape) == 1:
58
+ a, b = a.reshape([-1, *([1] * (len(x0.shape) - 1))]), b.reshape([-1, *([1] * (len(x0.shape) - 1))])
59
+ target = self.target(x0, epsilon, logSNR, a, b)
60
+ return (
61
+ x0 * a + epsilon * b,
62
+ epsilon,
63
+ target,
64
+ logSNR,
65
+ self.noise_cond(logSNR),
66
+ self.loss_weight(logSNR, shift=loss_shift),
67
+ )
68
+
69
+ def undiffuse(self, x, logSNR, pred):
70
+ a, b = self.input_scaler(logSNR)
71
+ if len(a.shape) == 1:
72
+ a, b = a.reshape([-1, *([1] * (len(x.shape) - 1))]), b.reshape([-1, *([1] * (len(x.shape) - 1))])
73
+ return self.target.x0(x, pred, logSNR, a, b), self.target.epsilon(x, pred, logSNR, a, b)
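
diffuse() builds the noised sample as x0 * a + epsilon * b, and undiffuse() inverts that mix through the chosen target parameterization. A standalone numeric check of the identity for the epsilon target, without using the GDF class; the a/b pair below is an arbitrary VP-style example with a**2 + b**2 == 1:

import paddle

a, b = paddle.to_tensor(0.8), paddle.to_tensor(0.6)
x0 = paddle.randn(shape=[2, 4])
epsilon = paddle.randn(shape=[2, 4])

x_t = x0 * a + epsilon * b                 # what diffuse() returns as the noised input
x0_recovered = (x_t - epsilon * b) / a     # what EpsilonTarget.x0 computes inside undiffuse()
print(float((x0 - x0_recovered).abs().max()))  # ~0 up to float32 rounding
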
74
+
75
+ def sample(
76
+ self,
77
+ model,
78
+ model_inputs,
79
+ shape,
80
+ unconditional_inputs=None,
81
+ sampler=None,
82
+ schedule=None,
83
+ t_start=1.0,
84
+ t_end=0.0,
85
+ timesteps=20,
86
+ x_init=None,
87
+ cfg=3.0,
88
+ cfg_t_stop=None,
89
+ cfg_t_start=None,
90
+ cfg_rho=0.7,
91
+ sampler_params=None,
92
+ shift=1,
93
+ device="cpu",
94
+ ):
95
+ sampler_params = {} if sampler_params is None else sampler_params
96
+ if sampler is None:
97
+ sampler = DDPMSampler(self) # noqa
98
+ r_range = paddle.linspace(start=t_start, stop=t_end, num=timesteps + 1)
99
+ schedule = self.schedule if schedule is None else schedule
100
+ logSNR_range = (
101
+ schedule(r_range, shift=shift)[:, None]
102
+ .expand(shape=[-1, shape[0] if x_init is None else x_init.shape[0]])
103
+ .to(device)
104
+ )
105
+ x = sampler.init_x(shape).to(device) if x_init is None else x_init.clone()
106
+ if cfg is not None:
107
+ if unconditional_inputs is None:
108
+ unconditional_inputs = {k: paddle.zeros_like(x=v) for k, v in model_inputs.items()}
109
+ model_inputs = {
110
+ k: (
111
+ paddle.concat(x=[v, v_u], axis=0)
112
+ if isinstance(v, paddle.Tensor)
113
+ else [
114
+ (
115
+ paddle.concat(x=[vi, vi_u], axis=0)
116
+ if isinstance(vi, paddle.Tensor) and isinstance(vi_u, paddle.Tensor)
117
+ else None
118
+ )
119
+ for vi, vi_u in zip(v, v_u)
120
+ ]
121
+ if isinstance(v, list)
122
+ else {vk: paddle.concat(x=[v[vk], v_u.get(vk, paddle.zeros_like(x=v[vk]))], axis=0) for vk in v}
123
+ if isinstance(v, dict)
124
+ else None
125
+ )
126
+ for (k, v), (k_u, v_u) in zip(model_inputs.items(), unconditional_inputs.items())
127
+ }
128
+ for i in range(0, timesteps):
129
+ noise_cond = self.noise_cond(logSNR_range[i])
130
+ if (
131
+ cfg is not None
132
+ and (cfg_t_stop is None or r_range[i].item() >= cfg_t_stop)
133
+ and (cfg_t_start is None or r_range[i].item() <= cfg_t_start)
134
+ ):
135
+ cfg_val = cfg
136
+ if isinstance(cfg_val, (list, tuple)):
137
+ assert len(cfg_val) == 2, "cfg must be a float or a list/tuple of length 2"
138
+ cfg_val = cfg_val[0] * r_range[i].item() + cfg_val[1] * (1 - r_range[i].item())
139
+
140
+ pred, pred_unconditional = model(
141
+ paddle.concat(x=[x, x], axis=0), noise_cond.repeat(2), **model_inputs
142
+ ).chunk(chunks=2)
143
+
144
+ pred_cfg = paddle.lerp(pred_unconditional, pred, paddle.to_tensor(cfg_val, dtype=paddle.float32))
145
+ if cfg_rho > 0:
146
+ std_pos, std_cfg = pred.std(), pred_cfg.std()
147
+ pred = cfg_rho * (pred_cfg * std_pos / (std_cfg + 1e-9)) + pred_cfg * (1 - cfg_rho)
148
+ else:
149
+ pred = pred_cfg
150
+ else:
151
+ pred = model(x, noise_cond, **model_inputs)
152
+
153
+ x0, epsilon = self.undiffuse(x, logSNR_range[i], pred)
154
+ x = sampler(x, x0, epsilon, logSNR_range[i], logSNR_range[i + 1], **sampler_params)
155
+ altered_vars = yield x0, x, pred
156
+ if altered_vars is not None:
157
+ cfg = altered_vars.get("cfg", cfg)
158
+ cfg_rho = altered_vars.get("cfg_rho", cfg_rho)
159
+ sampler = altered_vars.get("sampler", sampler)
160
+ model_inputs = altered_vars.get("model_inputs", model_inputs)
161
+ x = altered_vars.get("x", x)
162
+ x_init = altered_vars.get("x_init", x_init)
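
Note that sample() is a generator: it yields (x0, x, pred) after every step and can be steered mid-run through .send(), whose value lands in altered_vars above. A tiny stand-in showing just that yield/send protocol (it does not call the real sampler or model):

def sampling_protocol(steps):
    cfg = 4.0
    for i in range(steps):
        altered = yield i, cfg          # GDF.sample yields (x0, x, pred) at this point
        if altered is not None:
            cfg = altered.get("cfg", cfg)

loop = sampling_protocol(3)
print(next(loop))                # (0, 4.0)
print(loop.send({"cfg": 2.0}))   # (1, 2.0) -- guidance scale changed between steps
print(next(loop))                # (2, 2.0)
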
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+ import paddle_aux # noqa
18
+
19
+
20
+ class BaseLossWeight:
21
+ def weight(self, logSNR):
22
+ raise NotImplementedError("this method needs to be overridden")
23
+
24
+ def __call__(self, logSNR, *args, shift=1, clamp_range=None, **kwargs):
25
+ clamp_range = [-1000000000.0, 1000000000.0] if clamp_range is None else clamp_range
26
+ if shift != 1:
27
+ logSNR = logSNR.clone() + 2 * np.log(shift)
28
+ return self.weight(logSNR, *args, **kwargs).clip(*clamp_range)
29
+
30
+
31
+ class ComposedLossWeight(BaseLossWeight):
32
+ def __init__(self, div, mul):
33
+ self.mul = [mul] if isinstance(mul, BaseLossWeight) else mul
34
+ self.div = [div] if isinstance(div, BaseLossWeight) else div
35
+
36
+ def weight(self, logSNR):
37
+ prod, div = 1, 1
38
+ for m in self.mul:
39
+ prod *= m.weight(logSNR)
40
+ for d in self.div:
41
+ div *= d.weight(logSNR)
42
+ return prod / div
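
ComposedLossWeight multiplies the weights listed in mul and divides by those in div. A standalone numeric sketch of what, for example, ComposedLossWeight(div=SNRLossWeight(), mul=P2LossWeight()) evaluates to, computed directly from logSNR using the same formulas defined below in this file:

import paddle

logSNR = paddle.to_tensor([-4.0, 0.0, 4.0])
snr = logSNR.exp()                 # SNRLossWeight
p2 = (1.0 + snr) ** -1.0           # P2LossWeight with k=1, gamma=1, s=1
composed = p2 / snr                # mul=[P2] divided by div=[SNR]
print([float(v) for v in composed])
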
43
+
44
+
45
+ class ConstantLossWeight(BaseLossWeight):
46
+ def __init__(self, v=1):
47
+ self.v = v
48
+
49
+ def weight(self, logSNR):
50
+ return paddle.ones_like(x=logSNR) * self.v
51
+
52
+
53
+ class SNRLossWeight(BaseLossWeight):
54
+ def weight(self, logSNR):
55
+ return logSNR.exp()
56
+
57
+
58
+ class P2LossWeight(BaseLossWeight):
59
+ def __init__(self, k=1.0, gamma=1.0, s=1.0):
60
+ self.k, self.gamma, self.s = k, gamma, s
61
+
62
+ def weight(self, logSNR):
63
+ return (self.k + (logSNR * self.s).exp()) ** -self.gamma
64
+
65
+
66
+ class SNRPlusOneLossWeight(BaseLossWeight):
67
+ def weight(self, logSNR):
68
+ return logSNR.exp() + 1
69
+
70
+
71
+ class MinSNRLossWeight(BaseLossWeight):
72
+ def __init__(self, max_snr=5):
73
+ self.max_snr = max_snr
74
+
75
+ def weight(self, logSNR):
76
+ return logSNR.exp().clip(max=self.max_snr)
77
+
78
+
79
+ class MinSNRPlusOneLossWeight(BaseLossWeight):
80
+ def __init__(self, max_snr=5):
81
+ self.max_snr = max_snr
82
+
83
+ def weight(self, logSNR):
84
+ return (logSNR.exp() + 1).clip(max=self.max_snr)
85
+
86
+
87
+ class TruncatedSNRLossWeight(BaseLossWeight):
88
+ def __init__(self, min_snr=1):
89
+ self.min_snr = min_snr
90
+
91
+ def weight(self, logSNR):
92
+ return logSNR.exp().clip(min=self.min_snr)
93
+
94
+
95
+ class SechLossWeight(BaseLossWeight):
96
+ def __init__(self, div=2):
97
+ self.div = div
98
+
99
+ def weight(self, logSNR):
100
+ return 1 / (logSNR / self.div).cosh()
101
+
102
+
103
+ class DebiasedLossWeight(BaseLossWeight):
104
+ def weight(self, logSNR):
105
+ return 1 / logSNR.exp().sqrt()
106
+
107
+
108
+ class SigmoidLossWeight(BaseLossWeight):
109
+ def __init__(self, s=1):
110
+ self.s = s
111
+
112
+ def weight(self, logSNR):
113
+ return (logSNR * self.s).sigmoid()
114
+
115
+
116
+ class AdaptiveLossWeight(BaseLossWeight):
117
+ def __init__(self, logsnr_range=[-10, 10], buckets=300, weight_range=[1e-07, 10000000.0]):
118
+ self.bucket_ranges = paddle.linspace(start=logsnr_range[0], stop=logsnr_range[1], num=buckets - 1)
119
+ self.bucket_losses = paddle.ones(shape=buckets)
120
+ self.weight_range = weight_range
121
+
122
+ def weight(self, logSNR):
123
+ indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR)
124
+ return (1 / self.bucket_losses.to(logSNR.place)[indices]).clip(*self.weight_range)
125
+
126
+ def update_buckets(self, logSNR, loss, beta=0.99):
127
+ indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR).cpu()
128
+ self.bucket_losses[indices] = self.bucket_losses[indices] * beta + loss.detach().cpu() * (1 - beta)
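
AdaptiveLossWeight keeps a running, bucketed estimate of the loss as a function of logSNR and weights new samples by the inverse of that estimate. A hedged usage sketch inside a hypothetical training step; gdf, model and latents are placeholders for objects built elsewhere in the pipeline:

# weighter = AdaptiveLossWeight(logsnr_range=[-10, 10], buckets=300)
# noised, epsilon, target, logSNR, noise_cond, _ = gdf.diffuse(latents, shift=1)
# pred = model(noised, noise_cond)
# per_sample_loss = ((pred - target) ** 2).mean(axis=[1, 2, 3])
# loss = (per_sample_loss * weighter(logSNR)).mean()
# weighter.update_buckets(logSNR, per_sample_loss)   # EMA update of the bucketed loss estimate
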
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+
17
+
18
+ class BaseScaler:
19
+ def __init__(self):
20
+ self.stretched_limits = None
21
+
22
+ def setup_limits(self, schedule, input_scaler, stretch_max=True, stretch_min=True, shift=1):
23
+ min_logSNR = schedule(paddle.ones(shape=[1]), shift=shift)
24
+ max_logSNR = schedule(paddle.zeros(shape=[1]), shift=shift)
25
+ min_a, max_b = [v.item() for v in input_scaler(min_logSNR)] if stretch_max else [0, 1]
26
+ max_a, min_b = [v.item() for v in input_scaler(max_logSNR)] if stretch_min else [1, 0]
27
+ self.stretched_limits = [min_a, max_a, min_b, max_b]
28
+ return self.stretched_limits
29
+
30
+ def stretch_limits(self, a, b):
31
+ min_a, max_a, min_b, max_b = self.stretched_limits
32
+ return (a - min_a) / (max_a - min_a), (b - min_b) / (max_b - min_b)
33
+
34
+ def scalers(self, logSNR):
35
+ raise NotImplementedError("this method needs to be overridden")
36
+
37
+ def __call__(self, logSNR):
38
+ a, b = self.scalers(logSNR)
39
+ if self.stretched_limits is not None:
40
+ a, b = self.stretch_limits(a, b)
41
+ return a, b
42
+
43
+
44
+ class VPScaler(BaseScaler):
45
+ def scalers(self, logSNR):
46
+ a_squared = logSNR.sigmoid()
47
+ a = a_squared.sqrt()
48
+ b = (1 - a_squared).sqrt()
49
+ return a, b
50
+
51
+
52
+ class LERPScaler(BaseScaler):
53
+ def scalers(self, logSNR):
54
+ _a = logSNR.exp() - 1
55
+ _a[_a == 0] = 0.001
56
+ a = 1 + (2 - (2**2 + 4 * _a) ** 0.5) / (2 * _a)
57
+ b = 1 - a
58
+ return a, b
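
For VPScaler the two coefficients satisfy a**2 + b**2 == 1 (the variance-preserving condition), while LERPScaler instead satisfies a + b == 1 by construction. A quick standalone check using the same formulas as the classes above:

import paddle

logSNR = paddle.to_tensor([-6.0, 0.0, 6.0])

a_sq = logSNR.sigmoid()                        # VPScaler
a_vp, b_vp = a_sq.sqrt(), (1 - a_sq).sqrt()
print(a_vp**2 + b_vp**2)                       # ~[1., 1., 1.]

_a = logSNR.exp() - 1                          # LERPScaler
_a[_a == 0] = 0.001
a_lerp = 1 + (2 - (2**2 + 4 * _a) ** 0.5) / (2 * _a)
b_lerp = 1 - a_lerp
print(a_lerp + b_lerp)                         # exactly [1., 1., 1.]
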
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ class EpsilonTarget:
17
+ def __call__(self, x0, epsilon, logSNR, a, b):
18
+ return epsilon
19
+
20
+ def x0(self, noised, pred, logSNR, a, b):
21
+ return (noised - pred * b) / a
22
+
23
+ def epsilon(self, noised, pred, logSNR, a, b):
24
+ return pred
25
+
26
+
27
+ class X0Target:
28
+ def __call__(self, x0, epsilon, logSNR, a, b):
29
+ return x0
30
+
31
+ def x0(self, noised, pred, logSNR, a, b):
32
+ return pred
33
+
34
+ def epsilon(self, noised, pred, logSNR, a, b):
35
+ return (noised - pred * a) / b
36
+
37
+
38
+ class VTarget:
39
+ def __call__(self, x0, epsilon, logSNR, a, b):
40
+ return a * epsilon - b * x0
41
+
42
+ def x0(self, noised, pred, logSNR, a, b):
43
+ squared_sum = a**2 + b**2
44
+ return a / squared_sum * noised - b / squared_sum * pred
45
+
46
+ def epsilon(self, noised, pred, logSNR, a, b):
47
+ squared_sum = a**2 + b**2
48
+ return b / squared_sum * noised + a / squared_sum * pred
49
+
50
+
51
+ class RectifiedFlowsTarget:
52
+ def __call__(self, x0, epsilon, logSNR, a, b):
53
+ return epsilon - x0
54
+
55
+ def x0(self, noised, pred, logSNR, a, b):
56
+ return noised - pred * b
57
+
58
+ def epsilon(self, noised, pred, logSNR, a, b):
59
+ return noised + pred * a
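
Each target class defines the quantity the network predicts together with the inverse maps back to x0 and epsilon. A standalone round-trip check for VTarget (v-prediction): build pred from known x0/epsilon, then recover both; a and b are chosen with a**2 + b**2 == 1, as VPScaler would produce:

import paddle

a, b = paddle.to_tensor(0.8), paddle.to_tensor(0.6)
x0 = paddle.randn(shape=[2, 4])
epsilon = paddle.randn(shape=[2, 4])

noised = x0 * a + epsilon * b
pred = a * epsilon - b * x0                    # VTarget.__call__
s = a**2 + b**2
x0_rec = a / s * noised - b / s * pred         # VTarget.x0
eps_rec = b / s * noised + a / s * pred        # VTarget.epsilon
print(float((x0 - x0_rec).abs().max()), float((epsilon - eps_rec).abs().max()))  # both ~0
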
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .effnet import EfficientNetEncoder
16
+ from .previewer import Previewer
17
+ from .stage_c import AttnBlock, FeedForwardBlock, ResBlock, StageC, TimestepBlock
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+ import paddle.nn as nn
18
+
19
+
20
+ def load(path="../x.npy"):
21
+ return paddle.to_tensor(np.load(path))
22
+
23
+
24
+ def diff(a, b):
25
+ return (a - b).abs().mean()
26
+
27
+
28
+ class Linear(nn.Linear):
29
+ def reset_parameters(self):
30
+ return None
31
+
32
+
33
+ class Conv2d(nn.Conv2D):
34
+ def reset_parameters(self):
35
+ return None
36
+
37
+
38
+ class Attention2D(nn.Layer):
39
+ def __init__(self, c, nhead, dropout=0.0):
40
+ super().__init__()
41
+ self.attn = nn.MultiHeadAttention(c, nhead, dropout=dropout)
42
+
43
+ def forward(self, x, kv, self_attn=False):
44
+ orig_shape = x.shape
45
+ x = x.reshape([x.shape[0], x.shape[1], -1]).transpose([0, 2, 1])
46
+ if self_attn:
47
+ kv = paddle.concat([x, kv], axis=1)
48
+ x = self.attn(x, kv, kv)
49
+ x = x.transpose([0, 2, 1]).reshape(orig_shape)
50
+ return x
51
+
52
+
53
+ class LayerNorm2d(nn.LayerNorm):
54
+ def __init__(self, *args, **kwargs):
55
+ super().__init__(*args, **kwargs)
56
+
57
+ def forward(self, x):
58
+ return super().forward(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
59
+
60
+
61
+ class GlobalResponseNorm(nn.Layer):
62
+ def __init__(self, dim):
63
+ super(GlobalResponseNorm, self).__init__()
64
+ self.gamma = self.create_parameter(
65
+ shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0)
66
+ )
67
+ self.beta = self.create_parameter(
68
+ shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0)
69
+ )
70
+ self.gamma.stop_gradient = False
71
+ self.beta.stop_gradient = False
72
+
73
+ def forward(self, x):
74
+ Gx = paddle.norm(x, p=2, axis=(1, 2), keepdim=True)
75
+ Nx = Gx / (paddle.mean(Gx, axis=-1, keepdim=True) + 1e-6)
76
+ x = self.gamma * (x * Nx) + self.beta + x
77
+ return x
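
GlobalResponseNorm follows the ConvNeXt-V2 GRN formulation: per-channel spatial L2 norms are divided by their mean across channels, and the result gates the input with a learned residual scale. A standalone numeric sketch of the normalization itself, in the channels-last layout used when ResBlock below calls it:

import paddle

x = paddle.randn(shape=[1, 8, 8, 16])                       # NHWC, 16 channels
Gx = paddle.norm(x, p=2, axis=(1, 2), keepdim=True)         # per-channel spatial L2 norm
Nx = Gx / (paddle.mean(Gx, axis=-1, keepdim=True) + 1e-6)   # relative to the channel mean
print(Gx.shape, float(Nx.mean()))                           # [1, 1, 1, 16], mean ~1.0
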
78
+
79
+
80
+ class ResBlock(nn.Layer):
81
+ def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
82
+ super().__init__()
83
+ self.depthwise = Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
84
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
85
+ self.channelwise = nn.Sequential(
86
+ Linear(c + c_skip, c * 4),
87
+ nn.GELU(),
88
+ GlobalResponseNorm(c * 4),
89
+ nn.Dropout(p=dropout),
90
+ Linear(c * 4, c),
91
+ )
92
+
93
+ def forward(self, x, x_skip=None):
94
+ x_res = x
95
+ x = self.depthwise(x)
96
+ x = self.norm(x)
97
+ if x_skip is not None:
98
+ x = paddle.concat(x=[x, x_skip], axis=1)
99
+
100
+ x = self.channelwise(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
101
+ return x + x_res
102
+
103
+
104
+ class AttnBlock(nn.Layer):
105
+ def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
106
+ super().__init__()
107
+ self.self_attn = self_attn
108
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
109
+ self.attention = Attention2D(c, nhead, dropout)
110
+ self.kv_mapper = nn.Sequential(nn.Silu(), Linear(c_cond, c))
111
+
112
+ def forward(self, x, kv):
113
+ kv = self.kv_mapper(kv)
114
+ x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn)
115
+ return x
116
+
117
+
118
+ class FeedForwardBlock(nn.Layer):
119
+ def __init__(self, c, dropout=0.0):
120
+ super().__init__()
121
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
122
+ self.channelwise = nn.Sequential(
123
+ Linear(c, c * 4),
124
+ nn.GELU(),
125
+ GlobalResponseNorm(c * 4),
126
+ nn.Dropout(p=dropout),
127
+ Linear(c * 4, c),
128
+ )
129
+
130
+ def forward(self, x):
131
+ x = x + self.channelwise(self.norm(x).transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
132
+ return x
133
+
134
+
135
+ class TimestepBlock(nn.Layer):
136
+ def __init__(self, c, c_timestep, conds=["sca"], trainable=True):
137
+ super(TimestepBlock, self).__init__()
138
+ self.mapper = nn.Linear(c_timestep, c * 2, bias_attr=trainable)
139
+ self.conds = conds
140
+ for cname in conds:
141
+ setattr(self, f"mapper_{cname}", nn.Linear(c_timestep, c * 2, bias_attr=trainable))
142
+
143
+ def forward(self, x, t):
144
+ t = paddle.split(t, num_or_sections=len(self.conds) + 1, axis=1)
145
+ a_b = self.mapper(t[0])
146
+ a, b = a_b[:, : a_b.shape[1] // 2, None, None], a_b[:, a_b.shape[1] // 2 :, None, None]
147
+ for i, c in enumerate(self.conds):
148
+ ac_bc = getattr(self, f"mapper_{c}")(t[i + 1])
149
+ ac, bc = ac_bc[:, : ac_bc.shape[1] // 2, None, None], ac_bc[:, ac_bc.shape[1] // 2 :, None, None]
150
+ a, b = a + ac, b + bc
151
+ return x * (1 + a) + b
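
TimestepBlock splits the conditioning vector into one chunk per condition (the timestep embedding plus each extra cond such as "sca"), maps every chunk to a scale/shift pair, sums the pairs, and applies x * (1 + a) + b. A standalone sketch of that modulation with plain tensors; the sizes and the random stand-in for the per-cond Linear mapper are illustrative:

import paddle

c, c_timestep, n_conds = 8, 4, 2                  # feature channels, embed size, number of chunks
x = paddle.randn(shape=[1, c, 4, 4])
t = paddle.randn(shape=[1, c_timestep * n_conds]) # concatenated embeddings, split inside forward()

chunks = paddle.split(t, num_or_sections=n_conds, axis=1)
a = paddle.zeros(shape=[1, c, 1, 1])
b = paddle.zeros(shape=[1, c, 1, 1])
for chunk in chunks:
    ab = paddle.randn(shape=[1, 2 * c])           # stand-in for mapper / mapper_<cond>(chunk)
    a = a + ab[:, :c, None, None]
    b = b + ab[:, c:, None, None]
out = x * (1 + a) + b
print(out.shape)                                  # [1, 8, 4, 4]
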
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py ADDED
@@ -0,0 +1,561 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import math
17
+ from dataclasses import dataclass
18
+ from functools import partial
19
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
20
+
21
+ import paddle
22
+ import paddle.nn as nn
23
+ from paddle import Tensor
24
+ from paddle.nn import (
25
+ AdaptiveAvgPool2D,
26
+ BatchNorm,
27
+ BatchNorm2D,
28
+ Conv2D,
29
+ Dropout,
30
+ GroupNorm,
31
+ Layer,
32
+ Linear,
33
+ ReLU,
34
+ Sequential,
35
+ Sigmoid,
36
+ Silu,
37
+ )
38
+ from paddle.nn.initializer import Constant, KaimingNormal, Uniform
39
+ from paddle.utils.download import get_weights_path_from_url
40
+
41
+ __all__ = ["EfficientNet", "EfficientNet_V2_S_Weights", "efficientnet_v2_s"]
42
+
43
+
44
+ class SqueezeExcitation(paddle.nn.Layer):
45
+ """
46
+ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
47
+ Parameters ``activation`` and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
48
+
49
+ Args:
50
+ input_channels (int): Number of channels in the input feature maps
51
+ squeeze_channels (int): Number of squeeze channels
52
+ activation (Callable[[Tensor], Tensor], optional): ``delta`` activation. Default: ReLU
53
+ scale_activation (Callable[[Tensor], Tensor], optional): ``sigma`` activation. Default: Sigmoid
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ input_channels: int,
59
+ squeeze_channels: int,
60
+ activation: Callable[[Tensor], Tensor] = ReLU(),
61
+ scale_activation: Callable[[Tensor], Tensor] = Sigmoid(),
62
+ ) -> None:
63
+ super(SqueezeExcitation, self).__init__()
64
+ self.avgpool = AdaptiveAvgPool2D(1)
65
+ self.fc1 = Conv2D(in_channels=input_channels, out_channels=squeeze_channels, kernel_size=1)
66
+ self.fc2 = Conv2D(in_channels=squeeze_channels, out_channels=input_channels, kernel_size=1)
67
+ self.activation = activation
68
+ self.scale_activation = scale_activation
69
+
70
+ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
71
+ scale = self.avgpool(input)
72
+ scale = self.fc1(scale)
73
+ scale = self.activation(scale)
74
+ scale = self.fc2(scale)
75
+ scale = self.scale_activation(scale)
76
+ return scale * input
77
+
78
+
79
+ def stochastic_depth(input, p, mode, training=True):
80
+ """
81
+ Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth"
82
+ <https://arxiv.org/abs/1603.09382>`_ used for randomly dropping residual
83
+ branches of residual architectures.
84
+
85
+ Args:
86
+ input (paddle.Tensor): The input tensor or arbitrary dimensions with the first one
87
+ being its batch i.e. a batch with ``N`` rows.
88
+ p (float): probability of the input to be zeroed.
89
+ mode (str): ``"batch"`` or ``"row"``.
90
+ ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes
91
+ randomly selected rows from the batch.
92
+ training (bool): apply stochastic depth if is ``True``. Default: ``True``
93
+
94
+ Returns:
95
+ paddle.Tensor: The randomly zeroed tensor.
96
+ """
97
+ if p < 0.0 or p > 1.0:
98
+ raise ValueError(f"drop probability has to be between 0 and 1, but got {p}")
99
+ if mode not in ["batch", "row"]:
100
+ raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}")
101
+ if not training or p == 0.0:
102
+ return input
103
+
104
+ survival_rate = 1.0 - p
105
+ if mode == "row":
106
+ size = [input.shape[0]] + [1] * (input.ndim - 1)
107
+ else:
108
+ size = [1] * input.ndim
109
+ noise = paddle.empty(size, dtype=input.dtype)
110
+ survival_rate = paddle.to_tensor(survival_rate, dtype=input.dtype)
111
+ paddle.assign(paddle.bernoulli(paddle.broadcast_to(survival_rate, noise.shape)), noise)
112
+ if survival_rate > 0.0:
113
+ noise /= survival_rate
114
+ return input * noise
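
In "row" mode each sample's residual branch is kept with probability 1 - p and, when kept, rescaled by 1 / (1 - p), so the expected value of the branch is unchanged. A quick standalone check of that expectation, using an all-ones tensor so the mean is easy to read:

import paddle

p = 0.2
x = paddle.ones(shape=[10000, 4])
keep = paddle.bernoulli(paddle.full(shape=[10000, 1], fill_value=1.0 - p))
out = x * keep / (1.0 - p)                # same scaling stochastic_depth applies per row
print(float(out.mean()))                  # ~1.0: the expectation is preserved
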
115
+
116
+
117
+ class StochasticDepth(Layer):
118
+ """
119
+ See :func:`stochastic_depth`.
120
+ """
121
+
122
+ def __init__(self, p: float, mode: str) -> None:
123
+ super(StochasticDepth, self).__init__()
124
+ self.p = p
125
+ self.mode = mode
126
+
127
+ def forward(self, input):
128
+ return stochastic_depth(input, self.p, self.mode, self.training)
129
+
130
+ def __repr__(self):
131
+ s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})"
132
+ return s
133
+
134
+
135
+ def _make_ntuple(value, n):
136
+ """Helper function to create a tuple of size n with the given value."""
137
+ if isinstance(value, int):
138
+ return (value,) * n
139
+ return value
140
+
141
+
142
+ class ConvNormActivation(Sequential):
143
+ def __init__(
144
+ self,
145
+ in_channels: int,
146
+ out_channels: int,
147
+ kernel_size: Union[int, Sequence[int]] = 3,
148
+ stride: Union[int, Sequence[int]] = 1,
149
+ padding: Optional[Union[int, Sequence[int], str]] = None,
150
+ groups: int = 1,
151
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm,
152
+ activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU,
153
+ dilation: Union[int, Sequence[int]] = 1,
154
+ inplace: Optional[bool] = True,
155
+ bias: Optional[bool] = None,
156
+ conv_layer: Callable[..., Conv2D] = Conv2D,
157
+ ) -> None:
158
+ if padding is None:
159
+ padding = (kernel_size - 1) // 2 * dilation
160
+ else:
161
+ padding = _make_ntuple(padding, len(kernel_size))
162
+
163
+ layers = [
164
+ conv_layer(
165
+ in_channels,
166
+ out_channels,
167
+ kernel_size,
168
+ stride,
169
+ padding,
170
+ dilation=dilation,
171
+ groups=groups,
172
+ bias_attr=False if bias is None else bias,
173
+ )
174
+ ]
175
+
176
+ if norm_layer is not None:
177
+ norm_layer_instance = norm_layer(out_channels, use_global_stats=True)
178
+ layers.append(norm_layer_instance)
179
+
180
+ if activation_layer is not None:
181
+ layers.append(activation_layer)
182
+
183
+ super(ConvNormActivation, self).__init__(*layers)
184
+ self.out_channels = out_channels
185
+
186
+
187
+ class Conv2DNormActivation(ConvNormActivation):
188
+ def __init__(
189
+ self,
190
+ in_channels: int,
191
+ out_channels: int,
192
+ kernel_size: Union[int, Tuple[int, int]] = 3,
193
+ stride: Union[int, Tuple[int, int]] = 1,
194
+ padding: Optional[Union[int, Tuple[int, int], str]] = None,
195
+ groups: int = 1,
196
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm,
197
+ activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU,
198
+ dilation: Union[int, Tuple[int, int]] = 1,
199
+ inplace: Optional[bool] = True,
200
+ bias: Optional[bool] = None,
201
+ ) -> None:
202
+ super().__init__(
203
+ in_channels,
204
+ out_channels,
205
+ kernel_size,
206
+ stride,
207
+ padding,
208
+ groups,
209
+ norm_layer,
210
+ activation_layer,
211
+ dilation,
212
+ inplace,
213
+ bias,
214
+ Conv2D,
215
+ )
216
+
217
+
218
+ class EfficientNet_V2_S_Weights:
219
+ IMAGENET1K_V1 = "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth"
220
+
221
+ def __init__(self, url: str, transforms: Callable[..., Any], meta: Dict[str, Any]) -> None:
222
+ self.url = url
223
+ self.transforms = transforms
224
+ self.meta = meta
225
+
226
+ def state_dict(self, progress: bool = True, check_hash: bool = False) -> Dict[str, Any]:
227
+ path = get_weights_path_from_url(self.url, progress=progress, check_hash=check_hash)
228
+ return paddle.load(path)
229
+
230
+ @classmethod
231
+ def verify(cls, weights):
232
+ if weights is None:
233
+ return None
234
+ if not isinstance(weights, EfficientNet_V2_S_Weights):
235
+ raise ValueError(f"weights must be an instance of EfficientNet_V2_S_Weights, but got {type(weights)}")
236
+ return weights
237
+
238
+
239
+ @dataclass
240
+ class _MBConvConfig:
241
+ expand_ratio: float
242
+ kernel: int
243
+ stride: int
244
+ input_channels: int
245
+ out_channels: int
246
+ num_layers: int
247
+ block: Callable[..., paddle.nn.Layer]
248
+
249
+ @staticmethod
250
+ def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int:
251
+ return _make_divisible(channels * width_mult, 8, min_value)
252
+
253
+
254
+ class MBConvConfig(_MBConvConfig):
255
+ def __init__(
256
+ self,
257
+ expand_ratio: float,
258
+ kernel: int,
259
+ stride: int,
260
+ input_channels: int,
261
+ out_channels: int,
262
+ num_layers: int,
263
+ width_mult: float = 1.0,
264
+ depth_mult: float = 1.0,
265
+ block: Optional[Callable[..., paddle.nn.Layer]] = None,
266
+ ) -> None:
267
+ input_channels = self.adjust_channels(input_channels, width_mult)
268
+ out_channels = self.adjust_channels(out_channels, width_mult)
269
+ num_layers = self.adjust_depth(num_layers, depth_mult)
270
+ if block is None:
271
+ block = MBConv
272
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
273
+
274
+ @staticmethod
275
+ def adjust_depth(num_layers: int, depth_mult: float):
276
+ return int(math.ceil(num_layers * depth_mult))
277
+
278
+
279
+ class FusedMBConvConfig(_MBConvConfig):
280
+ def __init__(
281
+ self,
282
+ expand_ratio: float,
283
+ kernel: int,
284
+ stride: int,
285
+ input_channels: int,
286
+ out_channels: int,
287
+ num_layers: int,
288
+ block: Optional[Callable[..., paddle.nn.Layer]] = None,
289
+ ) -> None:
290
+ if block is None:
291
+ block = FusedMBConv
292
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
293
+
294
+
295
+ class MBConv(Layer):
296
+ def __init__(
297
+ self,
298
+ cnf,
299
+ stochastic_depth_prob: float,
300
+ norm_layer: Callable[..., Layer],
301
+ se_layer: Callable[..., Layer] = SqueezeExcitation,
302
+ ) -> None:
303
+ super(MBConv, self).__init__()
304
+
305
+ if not (1 <= cnf.stride <= 2):
306
+ raise ValueError("illegal stride value")
307
+
308
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
309
+
310
+ layers = []
311
+ activation_layer = nn.Silu()
312
+
313
+ # expand
314
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
315
+ if expanded_channels != cnf.input_channels:
316
+ layers.append(
317
+ Conv2DNormActivation(
318
+ cnf.input_channels,
319
+ expanded_channels,
320
+ kernel_size=1,
321
+ norm_layer=norm_layer,
322
+ activation_layer=activation_layer,
323
+ )
324
+ )
325
+
326
+ # depthwise
327
+ layers.append(
328
+ Conv2DNormActivation(
329
+ expanded_channels,
330
+ expanded_channels,
331
+ kernel_size=cnf.kernel,
332
+ stride=cnf.stride,
333
+ groups=expanded_channels,
334
+ norm_layer=norm_layer,
335
+ activation_layer=activation_layer,
336
+ )
337
+ )
338
+
339
+ # squeeze and excitation
340
+ squeeze_channels = max(1, cnf.input_channels // 4)
341
+ layers.append(se_layer(expanded_channels, squeeze_channels, activation=nn.Silu()))
342
+
343
+ # project
344
+ layers.append(
345
+ Conv2DNormActivation(
346
+ expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
347
+ )
348
+ )
349
+
350
+ self.block = Sequential(*layers)
351
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
352
+ self.out_channels = cnf.out_channels
353
+
354
+ def forward(self, input) -> paddle.Tensor:
355
+ result = self.block(input)
356
+ if self.use_res_connect:
357
+ result = self.stochastic_depth(result)
358
+ result += input
359
+ return result
360
+
361
+
362
+ class FusedMBConv(Layer):
363
+ def __init__(
364
+ self,
365
+ cnf: "FusedMBConvConfig",
366
+ stochastic_depth_prob: float,
367
+ norm_layer: Callable[..., Layer],
368
+ ) -> None:
369
+ super(FusedMBConv, self).__init__()
370
+
371
+ if not (1 <= cnf.stride <= 2):
372
+ raise ValueError("illegal stride value")
373
+
374
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
375
+
376
+ layers: List[Layer] = []
377
+ activation_layer = nn.Silu()
378
+
379
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
380
+ if expanded_channels != cnf.input_channels:
381
+ # fused expand and project
382
+ layers.append(
383
+ Conv2DNormActivation(
384
+ cnf.input_channels,
385
+ expanded_channels,
386
+ kernel_size=cnf.kernel,
387
+ stride=cnf.stride,
388
+ norm_layer=norm_layer,
389
+ activation_layer=activation_layer,
390
+ )
391
+ )
392
+ # project
393
+ layers.append(
394
+ Conv2DNormActivation(
395
+ expanded_channels,
396
+ cnf.out_channels,
397
+ kernel_size=1,
398
+ norm_layer=norm_layer,
399
+ activation_layer=None,
400
+ )
401
+ )
402
+ else:
403
+ layers.append(
404
+ Conv2DNormActivation(
405
+ cnf.input_channels,
406
+ cnf.out_channels,
407
+ kernel_size=cnf.kernel,
408
+ stride=cnf.stride,
409
+ norm_layer=norm_layer,
410
+ activation_layer=activation_layer,
411
+ )
412
+ )
413
+
414
+ self.block = Sequential(*layers)
415
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
416
+ self.out_channels = cnf.out_channels
417
+
418
+ def forward(self, input: Tensor) -> Tensor:
419
+ result = self.block(input)
420
+ if self.use_res_connect:
421
+ result = self.stochastic_depth(result)
422
+ result += input
423
+ return result
424
+
425
+
426
+ class EfficientNet(Layer):
427
+ def __init__(
428
+ self,
429
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
430
+ dropout: float,
431
+ stochastic_depth_prob: float = 0.2,
432
+ num_classes: int = 1000,
433
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = None,
434
+ last_channel: Optional[int] = None,
435
+ ) -> None:
436
+ super().__init__()
437
+ if not inverted_residual_setting:
438
+ raise ValueError("The inverted_residual_setting should not be empty")
439
+ elif not (
440
+ isinstance(inverted_residual_setting, Sequence)
441
+ and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
442
+ ):
443
+ raise TypeError("The inverted_residual_setting should be List[MBConvConfig]")
444
+ if norm_layer is None:
445
+ norm_layer = BatchNorm2D
446
+ layers: List[paddle.nn.Layer] = []
447
+ firstconv_output_channels = inverted_residual_setting[0].input_channels
448
+ layers.append(
449
+ Conv2DNormActivation(
450
+ 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=Silu()
451
+ )
452
+ )
453
+ total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
454
+ stage_block_id = 0
455
+ for cnf in inverted_residual_setting:
456
+ stage: List[paddle.nn.Layer] = []
457
+ for _ in range(cnf.num_layers):
458
+ block_cnf = copy.copy(cnf)
459
+ if stage:
460
+ block_cnf.input_channels = block_cnf.out_channels
461
+ block_cnf.stride = 1
462
+ sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
463
+ stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
464
+ stage_block_id += 1
465
+ layers.append(Sequential(*stage))
466
+ lastconv_input_channels = inverted_residual_setting[-1].out_channels
467
+ lastconv_output_channels = last_channel if last_channel is not None else 4 * lastconv_input_channels
468
+ layers.append(
469
+ Conv2DNormActivation(
470
+ lastconv_input_channels,
471
+ lastconv_output_channels,
472
+ kernel_size=1,
473
+ norm_layer=norm_layer,
474
+ activation_layer=Silu(),
475
+ )
476
+ )
477
+ self.features = Sequential(*layers)
478
+ self.avgpool = AdaptiveAvgPool2D(output_size=1)
479
+ self.classifier = Sequential(
480
+ Dropout(p=dropout), Linear(in_features=lastconv_output_channels, out_features=num_classes)
481
+ )
482
+
483
+ for m in self.sublayers():
484
+ if isinstance(m, Conv2D):
485
+ KaimingNormal()(m.weight)
486
+ if m.bias is not None:
487
+ Constant(value=0.0)(m.bias)
488
+ elif isinstance(m, (BatchNorm2D, GroupNorm)):
489
+ Constant(value=1.0)(m.weight)
490
+ Constant(value=0.0)(m.bias)
491
+ elif isinstance(m, Linear):
492
+ init_range = 1.0 / math.sqrt(m.weight.shape[1])
493
+ Uniform(low=-init_range, high=init_range)(m.weight)
494
+ Constant(value=0.0)(m.bias)
495
+
496
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
497
+ x = self.features(x)
498
+ x = self.avgpool(x)
499
+ x = paddle.flatten(x=x, start_axis=1)
500
+ x = self.classifier(x)
501
+ return x
502
+
503
+
504
+ def _make_divisible(value: float, divisor: int, min_value: Optional[int] = None) -> int:
505
+ if min_value is None:
506
+ min_value = divisor
507
+ new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
508
+ if new_value < 0.9 * value:
509
+ new_value += divisor
510
+ return new_value
511
+
512
+
513
+ def _efficientnet(
514
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
515
+ dropout: float,
516
+ last_channel: Optional[int],
517
+ weights: Optional[EfficientNet_V2_S_Weights],
518
+ progress: bool,
519
+ **kwargs: Any
520
+ ) -> EfficientNet:
521
+ if weights is not None:
522
+ kwargs["num_classes"] = len(weights.meta["categories"])
523
+ model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs)
524
+ if weights is not None:
525
+ model.set_state_dict(weights.state_dict(progress=progress, check_hash=True))
526
+ return model
527
+
528
+
529
+ def _efficientnet_conf(
530
+ arch: str, **kwargs: Any
531
+ ) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
532
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
533
+ if arch.startswith("efficientnet_v2_s"):
534
+ inverted_residual_setting = [
535
+ FusedMBConvConfig(1, 3, 1, 24, 24, 2),
536
+ FusedMBConvConfig(4, 3, 2, 24, 48, 4),
537
+ FusedMBConvConfig(4, 3, 2, 48, 64, 4),
538
+ MBConvConfig(4, 3, 2, 64, 128, 6),
539
+ MBConvConfig(6, 3, 1, 128, 160, 9),
540
+ MBConvConfig(6, 3, 2, 160, 256, 15),
541
+ ]
542
+ last_channel = 1280
543
+ else:
544
+ raise ValueError(f"Unsupported model type {arch}")
545
+ return inverted_residual_setting, last_channel
546
+
547
+
548
+ def efficientnet_v2_s(
549
+ *, weights: Optional[EfficientNet_V2_S_Weights] = None, progress: bool = True, **kwargs: Any
550
+ ) -> EfficientNet:
551
+ weights = EfficientNet_V2_S_Weights.verify(weights)
552
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s")
553
+ return _efficientnet(
554
+ inverted_residual_setting,
555
+ kwargs.pop("dropout", 0.2),
556
+ last_channel,
557
+ weights,
558
+ progress,
559
+ norm_layer=partial(BatchNorm2D, epsilon=0.001),
560
+ **kwargs,
561
+ )
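
For a quick sanity check of the EfficientNetV2-S port above, a minimal sketch that pushes a dummy batch through a randomly initialized network. The import path is an assumption that mirrors this file's location in the ppdiffusers package; adjust it to your installation.

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.efficientnet_v2_s import efficientnet_v2_s

model = efficientnet_v2_s()            # random init; num_classes defaults to 1000
model.eval()

x = paddle.randn([2, 3, 224, 224])     # dummy image batch
with paddle.no_grad():
    logits = model(x)
print(logits.shape)                    # expected: [2, 1000]
```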
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ import paddle.nn as nn
17
+
18
+ from .efficientnet_v2_s import efficientnet_v2_s
19
+
20
+
21
+ class BatchNorm2D(nn.Layer):
22
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True):
23
+ super(BatchNorm2D, self).__init__()
24
+ self.num_features = num_features
25
+ self.eps = eps
26
+ self.momentum = momentum
27
+ self.affine = affine
28
+ self.track_running_stats = track_running_stats
29
+
30
+ if self.affine:
31
+ self.weight = self.create_parameter(
32
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0)
33
+ )
34
+ self.bias = self.create_parameter(
35
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0)
36
+ )
37
+ else:
38
+ self.weight = None
39
+ self.bias = None
40
+
41
+ if self.track_running_stats:
42
+ self._mean = self.create_parameter(
43
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0), is_bias=False
44
+ )
45
+ self._variance = self.create_parameter(
46
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0), is_bias=False
47
+ )
48
+ self._mean.stop_gradient = True
49
+ self._variance.stop_gradient = True
50
+ else:
51
+ self._mean = None
52
+ self._variance = None
53
+
54
+ def forward(self, input):
55
+ mean = self._mean
56
+ variance = self._variance
57
+
58
+ output = (input - paddle.unsqueeze(mean, axis=[0, 2, 3])) / paddle.unsqueeze(
59
+ paddle.sqrt(variance + self.eps), axis=[0, 2, 3]
60
+ )
61
+ if self.affine:
62
+ output = output * paddle.unsqueeze(self.weight, axis=[0, 2, 3]) + paddle.unsqueeze(
63
+ self.bias, axis=[0, 2, 3]
64
+ )
65
+ return output
66
+
67
+
68
+ class EfficientNetEncoder(nn.Layer):
69
+ def __init__(self, c_latent=16):
70
+ super().__init__()
71
+ self.backbone = efficientnet_v2_s().features
72
+ self.backbone.eval()
73
+ self.mapper = nn.Sequential(
74
+ nn.Conv2D(1280, c_latent, kernel_size=1, bias_attr=False),
75
+ BatchNorm2D(c_latent, affine=False),
76
+ )
77
+ self.mapper.eval()
78
+
79
+ def forward(self, x):
80
+
81
+ x = self.backbone(x)
82
+ x = self.mapper(x)
83
+ return x
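
`EfficientNetEncoder` above wraps the EfficientNetV2-S feature extractor (1280 output channels, 32x total downsampling) and projects it to a `c_latent`-channel latent with a frozen, affine-free normalization. A minimal usage sketch, again assuming the import path implied by this file's location:

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.effnet import EfficientNetEncoder

encoder = EfficientNetEncoder(c_latent=16)
encoder.eval()

x = paddle.randn([1, 3, 768, 768])     # dummy image batch
with paddle.no_grad():
    latent = encoder(x)
print(latent.shape)                    # 32x downsampling -> [1, 16, 24, 24]
```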
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+
17
+
18
+ class Previewer(paddle.nn.Layer):
19
+ def __init__(self, c_in=16, c_hidden=512, c_out=3):
20
+ super().__init__()
21
+ self.blocks = paddle.nn.Sequential(
22
+ paddle.nn.Conv2D(in_channels=c_in, out_channels=c_hidden, kernel_size=1),
23
+ paddle.nn.GELU(),
24
+ paddle.nn.BatchNorm2D(num_features=c_hidden),
25
+ paddle.nn.Conv2D(in_channels=c_hidden, out_channels=c_hidden, kernel_size=3, padding=1),
26
+ paddle.nn.GELU(),
27
+ paddle.nn.BatchNorm2D(num_features=c_hidden),
28
+ paddle.nn.Conv2DTranspose(
29
+ in_channels=c_hidden,
30
+ out_channels=c_hidden // 2,
31
+ kernel_size=2,
32
+ stride=2,
33
+ ),
34
+ paddle.nn.GELU(),
35
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 2),
36
+ paddle.nn.Conv2D(
37
+ in_channels=c_hidden // 2,
38
+ out_channels=c_hidden // 2,
39
+ kernel_size=3,
40
+ padding=1,
41
+ ),
42
+ paddle.nn.GELU(),
43
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 2),
44
+ paddle.nn.Conv2DTranspose(
45
+ in_channels=c_hidden // 2,
46
+ out_channels=c_hidden // 4,
47
+ kernel_size=2,
48
+ stride=2,
49
+ ),
50
+ paddle.nn.GELU(),
51
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
52
+ paddle.nn.Conv2D(
53
+ in_channels=c_hidden // 4,
54
+ out_channels=c_hidden // 4,
55
+ kernel_size=3,
56
+ padding=1,
57
+ ),
58
+ paddle.nn.GELU(),
59
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
60
+ paddle.nn.Conv2DTranspose(
61
+ in_channels=c_hidden // 4,
62
+ out_channels=c_hidden // 4,
63
+ kernel_size=2,
64
+ stride=2,
65
+ ),
66
+ paddle.nn.GELU(),
67
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
68
+ paddle.nn.Conv2D(
69
+ in_channels=c_hidden // 4,
70
+ out_channels=c_hidden // 4,
71
+ kernel_size=3,
72
+ padding=1,
73
+ ),
74
+ paddle.nn.GELU(),
75
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
76
+ paddle.nn.Conv2D(in_channels=c_hidden // 4, out_channels=c_out, kernel_size=1),
77
+ )
78
+
79
+ def forward(self, x):
80
+ return self.blocks(x)
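
`Previewer` decodes a 16-channel latent into an RGB preview through three stride-2 transposed convolutions, i.e. 8x spatial upsampling. A minimal sketch under the same assumed import path:

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.previewer import Previewer

previewer = Previewer(c_in=16, c_hidden=512, c_out=3)
previewer.eval()

latent = paddle.randn([1, 16, 24, 24])
with paddle.no_grad():
    preview = previewer(latent)
print(preview.shape)                   # three stride-2 transposed convs -> [1, 3, 192, 192]
```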
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py ADDED
@@ -0,0 +1,206 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ from torchtools.nn import VectorQuantize
17
+
18
+
19
+ class ResBlock(paddle.nn.Layer):
20
+ def __init__(self, c, c_hidden):
21
+ super().__init__()
22
+ self.norm1 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06)
23
+ self.depthwise = paddle.nn.Sequential(
24
+ paddle.nn.Pad2D(padding=1, mode="replicate"),
25
+ paddle.nn.Conv2D(in_channels=c, out_channels=c, kernel_size=3, groups=c),
26
+ )
27
+ self.norm2 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06)
28
+ self.channelwise = paddle.nn.Sequential(
29
+ paddle.nn.Linear(in_features=c, out_features=c_hidden),
30
+ paddle.nn.GELU(),
31
+ paddle.nn.Linear(in_features=c_hidden, out_features=c),
32
+ )
33
+ out_19 = paddle.create_parameter(
34
+ shape=paddle.zeros(shape=[6]).shape,
35
+ dtype=paddle.zeros(shape=[6]).numpy().dtype,
36
+ default_initializer=paddle.nn.initializer.Assign(paddle.zeros(shape=[6])),
37
+ )
38
+ out_19.stop_gradient = False  # keep the gammas trainable
39
+ self.gammas = out_19
40
+
41
+ def _basic_init(module):
42
+ if isinstance(module, paddle.nn.Linear) or isinstance(module, paddle.nn.Conv2D):
43
+ init_XavierUniform = paddle.nn.initializer.XavierUniform()
44
+ init_XavierUniform(module.weight)
45
+ if module.bias is not None:
46
+ init_Constant = paddle.nn.initializer.Constant(value=0)
47
+ init_Constant(module.bias)
48
+
49
+ self.apply(_basic_init)
50
+
51
+ def _norm(self, x, norm):
52
+ return norm(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
53
+
54
+ def forward(self, x):
55
+ mods = self.gammas
56
+ x_temp = self._norm(x, self.norm1) * (1 + mods[0]) + mods[1]
57
+ x = x + self.depthwise(x_temp) * mods[2]
58
+ x_temp = self._norm(x, self.norm2) * (1 + mods[3]) + mods[4]
59
+ x = x + self.channelwise(x_temp.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) * mods[5]
60
+ return x
61
+
62
+
63
+ class StageA(paddle.nn.Layer):
64
+ def __init__(
65
+ self,
66
+ levels=2,
67
+ bottleneck_blocks=12,
68
+ c_hidden=384,
69
+ c_latent=4,
70
+ codebook_size=8192,
71
+ scale_factor=0.43,
72
+ ):
73
+ super().__init__()
74
+ self.c_latent = c_latent
75
+ self.scale_factor = scale_factor
76
+ c_levels = [(c_hidden // 2**i) for i in reversed(range(levels))]
77
+ self.in_block = paddle.nn.Sequential(
78
+ paddle.nn.PixelUnshuffle(downscale_factor=2),
79
+ paddle.nn.Conv2D(in_channels=3 * 4, out_channels=c_levels[0], kernel_size=1),
80
+ )
81
+ down_blocks = []
82
+ for i in range(levels):
83
+ if i > 0:
84
+ down_blocks.append(
85
+ paddle.nn.Conv2D(
86
+ in_channels=c_levels[i - 1],
87
+ out_channels=c_levels[i],
88
+ kernel_size=4,
89
+ stride=2,
90
+ padding=1,
91
+ )
92
+ )
93
+ block = ResBlock(c_levels[i], c_levels[i] * 4)
94
+ down_blocks.append(block)
95
+ down_blocks.append(
96
+ paddle.nn.Sequential(
97
+ paddle.nn.Conv2D(
98
+ in_channels=c_levels[-1],
99
+ out_channels=c_latent,
100
+ kernel_size=1,
101
+ bias_attr=False,
102
+ ),
103
+ paddle.nn.BatchNorm2D(num_features=c_latent),
104
+ )
105
+ )
106
+ self.down_blocks = paddle.nn.Sequential(*down_blocks)
107
+ self.down_blocks[0]  # no-op; indexing the Sequential here has no effect
108
+ self.codebook_size = codebook_size
109
+ self.vquantizer = VectorQuantize(c_latent, k=codebook_size)
110
+ up_blocks = [
111
+ paddle.nn.Sequential(paddle.nn.Conv2D(in_channels=c_latent, out_channels=c_levels[-1], kernel_size=1))
112
+ ]
113
+ for i in range(levels):
114
+ for j in range(bottleneck_blocks if i == 0 else 1):
115
+ block = ResBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4)
116
+ up_blocks.append(block)
117
+ if i < levels - 1:
118
+ up_blocks.append(
119
+ paddle.nn.Conv2DTranspose(
120
+ in_channels=c_levels[levels - 1 - i],
121
+ out_channels=c_levels[levels - 2 - i],
122
+ kernel_size=4,
123
+ stride=2,
124
+ padding=1,
125
+ )
126
+ )
127
+ self.up_blocks = paddle.nn.Sequential(*up_blocks)
128
+ self.out_block = paddle.nn.Sequential(
129
+ paddle.nn.Conv2D(in_channels=c_levels[0], out_channels=3 * 4, kernel_size=1),
130
+ paddle.nn.PixelShuffle(upscale_factor=2),
131
+ )
132
+
133
+ def encode(self, x, quantize=False):
134
+ x = self.in_block(x)
135
+ x = self.down_blocks(x)
136
+ if quantize:
137
+ qe, (vq_loss, commit_loss), indices = self.vquantizer.forward(x, dim=1)
138
+ return (
139
+ qe / self.scale_factor,
140
+ x / self.scale_factor,
141
+ indices,
142
+ vq_loss + commit_loss * 0.25,
143
+ )
144
+ else:
145
+ return x / self.scale_factor, None, None, None
146
+
147
+ def decode(self, x):
148
+ x = x * self.scale_factor
149
+ x = self.up_blocks(x)
150
+ x = self.out_block(x)
151
+ return x
152
+
153
+ def forward(self, x, quantize=False):
154
+ qe, x, _, vq_loss = self.encode(x, quantize)
155
+ x = self.decode(qe)
156
+ return x, vq_loss
157
+
158
+
159
+ class Discriminator(paddle.nn.Layer):
160
+ def __init__(self, c_in=3, c_cond=0, c_hidden=512, depth=6):
161
+ super().__init__()
162
+ d = max(depth - 3, 3)
163
+ layers = [
164
+ paddle.nn.utils.spectral_norm(
165
+ layer=paddle.nn.Conv2D(
166
+ in_channels=c_in,
167
+ out_channels=c_hidden // 2**d,
168
+ kernel_size=3,
169
+ stride=2,
170
+ padding=1,
171
+ )
172
+ ),
173
+ paddle.nn.LeakyReLU(negative_slope=0.2),
174
+ ]
175
+ for i in range(depth - 1):
176
+ c_in = c_hidden // 2 ** max(d - i, 0)
177
+ c_out = c_hidden // 2 ** max(d - 1 - i, 0)
178
+ layers.append(
179
+ paddle.nn.utils.spectral_norm(
180
+ layer=paddle.nn.Conv2D(
181
+ in_channels=c_in,
182
+ out_channels=c_out,
183
+ kernel_size=3,
184
+ stride=2,
185
+ padding=1,
186
+ )
187
+ )
188
+ )
189
+ layers.append(paddle.nn.InstanceNorm2D(num_features=c_out, momentum=1 - 0.1))
190
+ layers.append(paddle.nn.LeakyReLU(negative_slope=0.2))
191
+ self.encoder = paddle.nn.Sequential(*layers)
192
+ self.shuffle = paddle.nn.Conv2D(
193
+ in_channels=c_hidden + c_cond if c_cond > 0 else c_hidden,
194
+ out_channels=1,
195
+ kernel_size=1,
196
+ )
197
+ self.logits = paddle.nn.Sigmoid()
198
+
199
+ def forward(self, x, cond=None):
200
+ x = self.encoder(x)
201
+ if cond is not None:
202
+ cond = cond.reshape([cond.shape[0], cond.shape[1], 1, 1]).expand(shape=[-1, -1, x.shape[-2], x.shape[-1]])
203
+ x = paddle.concat(x=[x, cond], axis=1)
204
+ x = self.shuffle(x)
205
+ x = self.logits(x)
206
+ return x
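
`StageA` is a small VQ autoencoder: `PixelUnshuffle(2)` plus one stride-2 convolution give 4x spatial compression into a `c_latent=4` latent, and `decode` inverts it. A minimal round-trip sketch with `quantize=False`, assuming the import path implied by this file's location and that `torchtools` is installed (the module needs it for `VectorQuantize` at construction time):

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.stage_a import StageA

vae = StageA()                         # levels=2 -> 4x downsampling, c_latent=4
vae.eval()

img = paddle.randn([1, 3, 256, 256])
with paddle.no_grad():
    latent, _, _, _ = vae.encode(img, quantize=False)   # [1, 4, 64, 64]
    recon = vae.decode(latent)                          # [1, 3, 256, 256]
print(latent.shape, recon.shape)
```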