tuandunghcmut committed
Commit 64f97f7 · verified · 1 Parent(s): 7ee2e81

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. VLM2Vec/evaluation/__init__.py +0 -0
  2. VLM2Vec/evaluation/eval_blip.py +209 -0
  3. VLM2Vec/evaluation/eval_clip.py +185 -0
  4. VLM2Vec/evaluation/eval_openclip.py +185 -0
  5. VLM2Vec/evaluation/eval_siglip.py +186 -0
  6. VLM2Vec/src/dist_utils.py +92 -0
  7. VLMEvalKit_old/PaddleMIX/deploy/README.md +110 -0
  8. VLMEvalKit_old/PaddleMIX/deploy/README_en.md +108 -0
  9. VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md +44 -0
  10. VLMEvalKit_old/PaddleMIX/docs/FAQ.md +0 -0
  11. VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md +10 -0
  12. VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE +203 -0
  13. VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile +30 -0
  14. VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md +1278 -0
  15. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py +263 -0
  16. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh +32 -0
  17. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh +26 -0
  18. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh +26 -0
  19. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py +205 -0
  20. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py +408 -0
  21. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py +357 -0
  22. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py +417 -0
  23. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md +77 -0
  24. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py +264 -0
  25. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py +325 -0
  26. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh +32 -0
  27. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh +21 -0
  28. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py +149 -0
  29. VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md +44 -0
  30. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py +15 -0
  31. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py +153 -0
  32. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py +68 -0
  33. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py +235 -0
  34. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py +155 -0
  35. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py +615 -0
  36. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py +28 -0
  37. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py +124 -0
  38. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py +77 -0
  39. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py +778 -0
  40. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py +717 -0
  41. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py +162 -0
  42. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py +128 -0
  43. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py +58 -0
  44. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py +59 -0
  45. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py +17 -0
  46. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py +151 -0
  47. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py +561 -0
  48. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py +83 -0
  49. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py +80 -0
  50. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py +206 -0
VLM2Vec/evaluation/__init__.py ADDED
File without changes
VLM2Vec/evaluation/eval_blip.py ADDED
@@ -0,0 +1,209 @@
+# https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_feature_extraction.ipynb
+# https://medium.com/@enrico.randellini/image-and-text-features-extraction-with-blip-and-blip-2-how-to-build-a-multimodal-search-engine-a4ceabf51fbe
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor
+from src.dataset import EvalDataset
+from evaluation.collator import EvalCollator, BLIP2Collator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+from lavis.models import load_model_and_preprocess
+
+t2i_tasks = [
+    "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def get_pred_blip(qry_t, tgt_t, mode="multimodal2text"):
+
+    if mode == "multimodal2text":
+        # Compute the dot product between each token in qry_t (shape 32, dim) and tgt_t (shape candidate_num, dim)
+        # This results in a (32, candidate_num) array of scores
+        scores = np.dot(qry_t, tgt_t.T)  # (32, dim) dot (candidate_num, dim).T -> (32, candidate_num)
+
+        # Find the maximum score for each candidate across the 32 tokens
+        max_scores = np.max(scores, axis=0)  # Max along the 32 tokens for each candidate (shape candidate_num)
+
+        # The prediction is the index of the target with the highest maximum score
+        pred = np.argmax(max_scores)
+
+    elif mode == "text2multimodal":
+        # Compute the dot product between qry_t (shape dim) and each of the 32 tokens in the target (candidate_num, 32, dim)
+        # This results in a (candidate_num, 32) array of scores
+        scores = np.dot(tgt_t, qry_t)  # (candidate_num, 32, dim) dot (dim) -> (candidate_num, 32)
+
+        # Find the maximum score for each candidate across the 32 tokens
+        max_scores = np.max(scores, axis=1)  # Max along the 32 tokens for each candidate (shape candidate_num)
+
+        # The prediction is the index of the target with the highest maximum score
+        pred = np.argmax(max_scores)
+
+    return max_scores, pred
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+    model, vis_processors, txt_processors = load_model_and_preprocess(name=model_args.model_name, model_type=model_args.model_type, is_eval=True, device=training_args.device)
+    embedding_type = data_args.embedding_type
+    eval_collator = BLIP2Collator(
+        data_args=data_args,
+        vis_processors=vis_processors,
+        txt_processors=txt_processors
+    )
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                samples, modes = batch
+                for sample, mode in zip(samples, modes):
+                    image_features, text_features = None, None
+                    if sample["image"] is not None:
+                        sample["image"] = sample["image"].to(training_args.device)
+                        image_features = model.extract_features(sample, mode="image").image_embeds[0,0,:]  # (dim,)
+                    if sample["text_input"]:
+                        text_features = model.extract_features(sample, mode="text").text_embeds[0,0,:]  # (dim,)
+                    if embedding_type=="unimodal":
+                        if subset in t2i_tasks:
+                            features = text_features
+                        if subset in i2t_tasks:
+                            features = image_features
+                    elif embedding_type=="multimodal":
+                        if image_features is None:
+                            features = text_features
+                        elif text_features is None:
+                            features = image_features
+                        else:
+                            features = image_features + text_features
+                    encoded_tensor.append(features.cpu().detach().float().numpy())
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                samples, modes = batch
+                for sample, mode in zip(samples, modes):
+                    image_features, text_features = None, None
+                    if sample["image"] is not None:
+                        sample["image"] = sample["image"].to(training_args.device)
+                        image_features = model.extract_features(sample, mode="image").image_embeds[0,0,:]  # (dim,)
+                    if sample["text_input"]:
+                        text_features = model.extract_features(sample, mode="text").text_embeds[0,0,:]  # (dim,)
+                    if embedding_type=="unimodal":
+                        if subset in t2i_tasks:
+                            features = image_features
+                        if subset in i2t_tasks:
+                            features = text_features
+                    elif embedding_type=="multimodal":
+                        if image_features is None:
+                            features = text_features
+                        elif text_features is None:
+                            features = image_features
+                        else:
+                            features = image_features + text_features
+                    encoded_tensor.append(features.cpu().detach().float().numpy())
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
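The scoring rule in `get_pred_blip` above ranks each candidate by its best match over BLIP-2's 32 query-token embeddings. A minimal, self-contained numpy sketch of the `multimodal2text` branch (the toy shapes and random data are illustrative only, not part of the commit):

```python
import numpy as np

rng = np.random.default_rng(0)
qry_t = rng.standard_normal((32, 768))   # 32 BLIP-2 query tokens, one embedding each
tgt_t = rng.standard_normal((5, 768))    # 5 candidate embeddings

scores = qry_t @ tgt_t.T                 # (32, 5): every query token vs. every candidate
max_scores = scores.max(axis=0)          # best-matching query token per candidate
pred = int(np.argmax(max_scores))        # the eval loop counts a hit when pred == 0 (first candidate is the positive)
print(max_scores.shape, pred)
```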
VLM2Vec/evaluation/eval_clip.py ADDED
@@ -0,0 +1,185 @@
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel
+from src.dataset import EvalDataset
+from src.collator import CLIPCollator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+
+t2i_tasks = [
+    "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ", "OVEN",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+
+    model = CLIPModel.from_pretrained(model_args.model_name)
+    processor = AutoProcessor.from_pretrained(model_args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name)
+
+    embedding_type = data_args.embedding_type
+    eval_collator = CLIPCollator(
+        data_args=data_args,
+        vis_processors=processor,
+        txt_processors=tokenizer
+    )
+    model.eval()
+    model = model.to(training_args.device)
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = text_features
+                    if subset in i2t_tasks:
+                        features = image_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        try:
+                            features = image_features + text_features
+                        except:
+                            import ipdb; ipdb.set_trace()
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)

+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = image_features
+                    if subset in i2t_tasks:
+                        features = text_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        features = image_features + text_features
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
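Note that `get_pred` is imported from `evaluation.eval_utils`, which is not among the files shown in this commit. A plausible minimal sketch of what the call above expects, assuming it simply ranks candidates by (optionally L2-normalized) dot product against the query embedding:

```python
import numpy as np

def get_pred(qry_t, tgt_t, normalization=False):
    """Sketch only, not the committed implementation: score (num_candidate, dim) targets against a (dim,) query."""
    if normalization:
        qry_t = qry_t / np.linalg.norm(qry_t)
        tgt_t = tgt_t / np.linalg.norm(tgt_t, axis=-1, keepdims=True)
    scores = tgt_t @ qry_t            # (num_candidate,) cosine/dot-product scores
    return scores, int(np.argmax(scores))
```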
VLM2Vec/evaluation/eval_openclip.py ADDED
@@ -0,0 +1,185 @@
+import open_clip
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel
+from src.dataset import EvalDataset
+from src.collator import EvalCollator, BLIP2Collator, CLIPCollator, OpenCLIPCollator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+
+t2i_tasks = [
+    "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ", "OVEN",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+
+    model, processor = open_clip.create_model_from_pretrained('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K')
+    tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K')
+
+    embedding_type = data_args.embedding_type
+    eval_collator = OpenCLIPCollator(
+        data_args=data_args,
+        vis_processors=processor,
+        txt_processors=tokenizer
+    )
+    model.eval()
+    model = model.to(training_args.device)
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.encode_image(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.encode_text(batch["input_ids"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = text_features
+                    if subset in i2t_tasks:
+                        features = image_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        try:
+                            features = image_features + text_features
+                        except:
+                            import ipdb; ipdb.set_trace()
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)

+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.encode_image(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.encode_text(batch["input_ids"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = image_features
+                    if subset in i2t_tasks:
+                        features = text_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        features = image_features + text_features
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
VLM2Vec/evaluation/eval_siglip.py ADDED
@@ -0,0 +1,186 @@
+from src.arguments import ModelArguments, DataArguments, TrainingArguments
+from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel, AutoModel
+from src.dataset import EvalDataset
+from src.collator import EvalCollator, BLIP2Collator, CLIPCollator
+from torch.utils.data import DataLoader
+import torch
+from tqdm import tqdm
+import numpy as np
+import pickle
+import os
+from datasets import load_dataset
+from evaluation.eval_utils import get_pred, save_results, print_results
+
+t2i_tasks = [
+    "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i", "VisDial", "VisualNews_t2i", "WebQA", "Wiki-SS-NQ", "OVEN",  # retrieval
+]
+i2t_tasks = [
+    "MSCOCO_i2t", "VisualNews_i2t",  # retrieval
+    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211"  # classification
+]
+
+
+def main():
+    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args: ModelArguments
+    data_args: DataArguments
+    training_args: TrainingArguments
+
+    model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384")
+    all_processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384")
+    processor = all_processor.image_processor
+    tokenizer = all_processor.tokenizer
+
+    embedding_type = data_args.embedding_type
+    eval_collator = CLIPCollator(
+        data_args=data_args,
+        vis_processors=processor,
+        txt_processors=tokenizer
+    )
+    model.eval()
+    model = model.to(training_args.device)
+
+    # ToDo: This part of code is a little bit hacky. Need to refactor later.
+    for idx, subset in enumerate(data_args.subset_name):
+        print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m")
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path):
+            continue
+
+        eval_qry_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="qry_text",
+            img_path_field="qry_img_path",
+        )
+        eval_tgt_dataset = EvalDataset(
+            data_args=data_args,
+            subset=subset,
+            text_field="tgt_text",
+            img_path_field="tgt_img_path",
+        )
+
+        eval_qry_loader = DataLoader(
+            eval_qry_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+        eval_tgt_loader = DataLoader(
+            eval_tgt_dataset,
+            batch_size=training_args.per_device_eval_batch_size,
+            collate_fn=eval_collator,
+            shuffle=False,
+            drop_last=False,
+            num_workers=training_args.dataloader_num_workers,
+        )
+
+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_qry_loader, desc="Encode query"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = text_features
+                    if subset in i2t_tasks:
+                        features = image_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        try:
+                            features = image_features + text_features
+                        except:
+                            import ipdb; ipdb.set_trace()
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_qry_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f)

+        encoded_tensor = []
+        with torch.no_grad():
+            for batch in tqdm(eval_tgt_loader, desc="Encode target"):
+                batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list}
+                image_features, text_features = None, None
+                if "pixel_values" in batch:
+                    image_features = model.get_image_features(batch["pixel_values"])
+                if "input_ids" in batch:
+                    text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"])
+                if embedding_type=="unimodal":
+                    if subset in t2i_tasks:
+                        features = image_features
+                    if subset in i2t_tasks:
+                        features = text_features
+                elif embedding_type=="multimodal":
+                    if image_features is None:
+                        features = text_features
+                    elif text_features is None:
+                        features = image_features
+                    else:
+                        features = image_features + text_features
+                encoded_tensor.append(features.cpu().detach().float().numpy())
+        encoded_tensor = np.concatenate(encoded_tensor)
+        with open(encode_tgt_path, 'wb') as f:
+            pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f)
+    results = {}
+    for subset in tqdm(data_args.subset_name, desc="calculate score"):
+        encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry")
+        encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt")
+        with open(encode_qry_path, 'rb') as f:
+            qry_tensor, qry_index = pickle.load(f)
+        with open(encode_tgt_path, 'rb') as f:
+            tgt_tensor, tgt_index = pickle.load(f)
+        qry_dict, tgt_dict = {}, {}
+        for qry_t, tt in zip(qry_tensor, qry_index):
+            text, img_path = tt["text"], tt["img_path"]
+            qry_dict[(text, img_path)] = qry_t
+        for tgt_t, tt in zip(tgt_tensor, tgt_index):
+            text, img_path = tt["text"], tt["img_path"]
+            tgt_dict[(text, img_path)] = tgt_t
+
+        eval_data = load_dataset(
+            data_args.dataset_name,
+            subset,
+            split=data_args.dataset_split,
+        )
+        acc = 0
+        all_pred = []
+        for row in eval_data:
+            qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])]  # (dim,)
+            tgt_t, all_candidates = [], []
+            if row["tgt_text"] == "":
+                row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))]
+            for tt in zip(row["tgt_text"], row["tgt_img_path"]):
+                tgt_t.append(tgt_dict[tt])
+                all_candidates.append(tt)
+            try:
+                tgt_t = np.stack(tgt_t, axis=0)  # (num_candidate, dim)
+            except:
+                import ipdb; ipdb.set_trace()
+            scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize)
+            if pred == 0:
+                acc += 1
+            all_pred.append(all_candidates[pred])
+        with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f:
+            for item in all_pred:
+                f.write(f"{item}\n")
+        accuracy = acc / len(eval_data) * 100
+        results[subset] = accuracy
+        print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m")
+    save_results(results, model_args, data_args, training_args)
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
VLM2Vec/src/dist_utils.py ADDED
@@ -0,0 +1,92 @@
+# Code adapted from SimCSE (https://github.com/princeton-nlp/SimCSE) governed by MIT license.
+
+# Copyright (c) 2023, Salesforce, Inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+import torch
+import torch.distributed as dist
+
+class GatherLayer(torch.autograd.Function):
+    """
+    Gather tensors from all process, supporting backward propagation.
+    https://github.com/Spijkervet/SimCLR/blob/master/simclr/modules/gather.py
+    """
+    @staticmethod
+    def forward(ctx, input):
+        ctx.save_for_backward(input)
+        output = [torch.zeros_like(input) for _ in range(dist.get_world_size())]
+        dist.all_gather(output, input)
+        return tuple(output)
+
+    @staticmethod
+    def backward(ctx, *grads):
+        (input,) = ctx.saved_tensors
+        grad_out = torch.zeros_like(input)
+        grad_out[:] = grads[dist.get_rank()]
+        return grad_out
+
+
+def dist_gather(x: torch.tensor):
+    if not dist.is_initialized(): return x
+    if len(x.shape) == 0:
+        x = x.reshape(1)
+    x_gather = GatherLayer.apply(x)
+    x_gather = torch.cat(x_gather, dim=0)
+    return x_gather
+
+
+@torch.no_grad()
+def dist_gather_nograd(x: torch.tensor):
+    if not dist.is_initialized(): return x
+    x_gather = [torch.ones_like(x) for _ in range(get_world_size())]
+    dist.all_gather(x_gather, x, async_op=False)
+    x_gather = torch.cat(x_gather, dim=0)
+    return x_gather
+
+
+def get_rank():
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main():
+    return get_rank() == 0
+
+
+def get_world_size():
+    if not dist.is_initialized():
+        return 1
+    else:
+        return dist.get_world_size()
+
+def barrier():
+    if dist.is_initialized():
+        dist.barrier()
+
+
+@torch.no_grad()
+def varsize_gather_nograd(x: torch.Tensor):
+    """gather tensors of different sizes along the first dimension"""
+    if not dist.is_initialized():
+        return x
+
+    # determine max size
+    size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
+    allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
+    dist.all_gather(allsizes, size)
+    max_size = max([size.cpu().max() for size in allsizes])
+
+    padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device)
+    padded[: x.shape[0]] = x
+    output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())]
+    dist.all_gather(output, padded)
+
+    output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)]
+    output = torch.cat(output, dim=0)
+
+    return output
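`dist_gather` above is what lets contrastive training contrast against targets from every GPU while keeping gradients, via `GatherLayer`. A minimal usage sketch (the loss function, temperature, and tensor names here are illustrative assumptions, not code from this commit):

```python
import torch
import torch.nn.functional as F
from src.dist_utils import dist_gather, get_rank

def inbatch_contrastive_loss(qry_reps: torch.Tensor, tgt_reps: torch.Tensor, temperature: float = 0.02):
    # Gather target embeddings from all ranks; falls back to a no-op when torch.distributed is not initialized.
    all_tgt = dist_gather(tgt_reps)                      # (world_size * local_bsz, dim), gradients preserved
    scores = qry_reps @ all_tgt.T / temperature          # (local_bsz, world_size * local_bsz)
    offset = get_rank() * tgt_reps.size(0)               # this rank's slice inside the gathered batch
    labels = torch.arange(tgt_reps.size(0), device=scores.device) + offset
    return F.cross_entropy(scores, labels)
```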
VLMEvalKit_old/PaddleMIX/deploy/README.md ADDED
@@ -0,0 +1,110 @@
+# PaddleMIX Inference Deployment
+
+[[English](README_en.md)]
+
+Built on Paddle Inference, PaddleMIX provides a Python deployment solution. There are two deployment methods:
+- Through **APPflow**: set the static_mode = True variable to enable static-graph inference, optionally combined with TensorRT to accelerate inference. Some models do not support static graph or TensorRT with this method; see [Cross-modal multi-scenario applications](../applications/README.md/#跨模态多场景应用) for the specific models;
+
+- Single-model deployment
+
+
+## 1. APPflow Deployment
+
+When using PaddleMIX one-click prediction via **APPflow**, set the static_mode = True variable to enable static-graph inference, optionally combined with TensorRT to accelerate inference.
+
+### 1.1 Example
+
+```python
+>>> from paddlemix.appflow import Appflow
+>>> from PIL import Image
+
+>>> task = Appflow(app="openset_det_sam",
+                   models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
+                   static_mode=True,
+                   precision="fp32")
+>>> image_pil = Image.open("beauty.png").convert("RGB")
+>>> result = task(image=image_pil,prompt="women")
+```
+
+### 1.2 Parameters
+| Parameter | Required | Meaning |
+|-------|-------|---------------------------------------------------------------------------------------------|
+| --app | Yes | Application name |
+| --models | Yes | Model(s) to use; a single model or a combination of several |
+| --static_mode | Optional | Whether to use static-graph inference; defaults to False |
+| --precision | Optional | Used when static_mode == True; defaults to fp32, trt_fp32 and trt_fp16 are also available |
+
+Notes:
+- Some models do not support static graph or TensorRT; see [Cross-modal multi-scenario applications](../applications/README.md) for details
+- The generated static graph is placed in the folder named after the model, e.g. GroundingDino/groundingdino-swint-ogc/
+
+
+## 2. Single-Model Prediction Deployment
+
+Python prediction deployment involves two steps:
+- Export the prediction model
+- Run prediction with Python
+
+Currently supported models:
+- [blip2](./blip2/README.md)
+- [groundingdino](./groundingdino/README.md)
+- [sam](./sam/README.md)
+- [qwen_vl](./qwen_vl/README.md)
+
+The following uses groundingdino as an example.
+
+### 2.1 Export the Prediction Model
+
+```bash
+cd deploy/groundingdino
+# Export the groundingdino model
+python export.py \
+--dino_type GroundingDino/groundingdino-swint-ogc
+```
+After export, the directory contains `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`, and other files.
+
+### 2.2 Python-based Prediction
+
+```bash
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus"
+
+```
+
+## 3. Inference Benchmark
+
+> Note:
+> Test environment:
+Paddle 3.0,
+PaddleMIX release/2.0,
+PaddleNLP 2.7.2,
+single A100 80G.
+
+### 3.1 Benchmark command
+
+Append --benchmark to the run command in the corresponding model directory under `deploy`;
+for example, the benchmark command for GroundingDino is:
+
+```bash
+cd deploy/groundingdino
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus" \
+--benchmark True
+```
+
+# A100 performance data
+|Model|Image resolution|Data type|Paddle Deploy|
+|-|-|-|-|
+|qwen-vl-7b|448*448|fp16|669.8 ms|
+|llava-1.5-7b|336*336|fp16|981.2 ms|
+|llava-1.6-7b|336*336|fp16|778.7 ms|
+|groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms|
+|Sam/SamVitH-1024|1024*1024|fp32|121 ms|
VLMEvalKit_old/PaddleMIX/deploy/README_en.md ADDED
@@ -0,0 +1,108 @@
+# PaddleMIX Inference Deployment
+
+[[中文文档](README.md)]
+
+PaddleMIX utilizes Paddle Inference and provides a Python-based deployment solution. There are two deployment methods:
+
+1. **APPflow Deployment**:
+   - By setting the `static_mode = True` variable in APPflow, you can enable static graph inference. Additionally, you can accelerate inference using TensorRT. Note that not all models support static graph or TensorRT. Please refer to the [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario) section for specific model support.
+
+2. **Single Model Deployment**
+
+## 1. APPflow Deployment
+
+For APPflow usage, you can set the `static_mode = True` variable to enable static graph inference and optionally accelerate inference using TensorRT.
+
+### 1.1 Examples
+
+```python
+>>> from paddlemix.appflow import Appflow
+>>> from PIL import Image
+
+>>> task = Appflow(app="openset_det_sam",
+                   models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
+                   static_mode=True,
+                   precision="fp32")
+>>> image_pil = Image.open("beauty.png").convert("RGB")
+>>> result = task(image=image_pil,prompt="women")
+```
+
+### 1.2 Parameter Explanation
+| Parameter | Required? | Meaning |
+|-------|-------|---------------------------------------------------------------------------------------------|
+| --app | Yes | Application name |
+| --models | Yes | Model(s) used. Can be one model, or multiple models |
+| --static_mode | Optional | Whether to use static graph inference, defaults to False |
+| --precision | Optional | When `static_mode == True`, it defaults to FP32. You can optionally select `trt_fp32` or `trt_fp16`. |
+
+Instructions:
+- Some models do not support static graph or TensorRT. For specific information, please refer to [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario).
+
+- The generated static graph will be located in the folder corresponding to the model name, for example: `GroundingDino/groundingdino-swint-ogc/`.
+
+## 2. Single Model Prediction Deployment
+
+Python-based prediction deployment mainly involves two steps:
+- Exporting the predictive model
+- Performing prediction using Python
+
+Currently supported models:
+- [blip2](./blip2/README.md)
+- [groundingdino](./groundingdino/README.md)
+- [sam](./sam/README.md)
+- [qwen_vl](./qwen_vl/README.md)
+
+Using groundingdino as an example.
+
+### 2.1 Exporting the Predictive Model
+
+```bash
+cd deploy/groundingdino
+# Export the groundingdino model
+python export.py \
+--dino_type GroundingDino/groundingdino-swint-ogc
+```
+The export directory includes `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`, and other files.
+
+### 2.2 Python-based Inference
+
+```bash
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus"
+
+```
+
+## 3. Benchmark
+
+> Note:
+> Test environment:
+Paddle 3.0
+PaddleMIX release/2.0
+PaddleNLP 2.7.2
+A100 80G.
+
+### 3.1 Benchmark command
+
+Add --benchmark after the run command in the corresponding model directory under `deploy` to obtain the model's running time.
+Example: GroundingDino benchmark:
+
+```bash
+cd deploy/groundingdino
+python predict.py \
+--text_encoder_type GroundingDino/groundingdino-swint-ogc \
+--model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+--input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+--output_dir ./groundingdino_predict_output \
+--prompt "bus" \
+--benchmark True
+```
+
+|Model|image size|dtype |Paddle Deploy |
+|-|-|-|-|
+|qwen-vl-7b|448*448|fp16|669.8 ms|
+|llava-1.5-7b|336*336|fp16|981.2 ms|
+|llava-1.6-7b|336*336|fp16|778.7 ms|
+|groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms|
+|Sam/SamVitH-1024|1024*1024|fp32|121 ms|
VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md ADDED
@@ -0,0 +1,44 @@
+# Release Notes
+
+## Latest release
+
+### 2.0 (07/26/2024)
+
+#### Multimodal understanding
+
+1. New models: LLaVA v1.5-7b, v1.5-13b, v1.6-7b, CogAgent, CogVLM, Qwen-VL, InternLM-XComposer2
+2. Dataset enhancements: added the chatml_dataset reader for image-text dialogue data, with support for custom chat_template files and mixed datasets
+3. Toolchain upgrades: added the Auto module to unify the SFT training workflow, compatible with full-parameter and LoRA training; added the mixtoken training strategy, raising SFT throughput by 5.6x; added inference deployment for Qwen-VL and LLaVA, with 2.38x better inference performance than torch
+
+#### Multimodal generation
+
+1. Video generation: support for Sora-related techniques, with DiT, SiT, and UViT training and inference, plus new NaViT and MAGVIT-v2 models; new video generation models SVD and Open Sora with fine-tuning and inference; new pose-controllable video generation model AnimateAnyone, plug-and-play video generation model AnimateDiff, and GIF video generation model Hotshot-XL
+2. Text-to-image model zoo: added the fast-inference text-to-image model LCM, adapted for SD/SDXL training and inference
+3. Toolchain upgrades: released ppdiffusers 0.24.1 with peft and accelerate backends; fully upgraded weight loading/saving, supporting distributed training, model sharding, safetensors, and more
+4. Ecosystem compatibility: provided a ComfyUI plugin built on ppdiffusers supporting common model loading and conversion, text-to-image, image-to-image, and local image editing tasks; added Stable Diffusion 1.5 series nodes, Stable Diffusion XL series nodes, and 4 image-generation workflow examples
+
+#### DataCopilot (multimodal data processing toolbox)
+
+1. MMDataset, a multimodal dataset type that loads and exports Json, H5, Jsonl, and other storage formats, with built-in concurrent (map, filter) data processing interfaces
+2. Multimodal data format tools supporting custom data structures, data conversion, and offline format checks
+3. Multimodal data analysis tools supporting basic statistics, data visualization, and registration of custom functions
+
+### 1.0 (11/15/2023)
+
+#### Core capabilities
+
+1. Large-scale pretraining: BLIP-2 supports data parallelism, sharding, model parallelism, and pipeline parallelism, enabling training at the hundred-billion-parameter scale; EVA-CLIP supports data parallelism, sharding, and model parallelism; Stable Diffusion supports data parallelism, sharding, and BF16 O2 training; CLIP and Coca support data-parallel training
+2. Supervised fine-tuning: Stable Diffusion and SDXL support LoRA fine-tuning
+3. Inference deployment: BLIP-2, miniGPT-4, Grounding DINO, SAM, and Stable Diffusion support dynamic-to-static export and deployment
+
+#### New models
+1. New CLIP-family cross-modal models: CLIP, EVA-CLIP, Coca
+2. New image-to-text cross-modal models: BLIP-2, miniGPT-4, VisualGLM
+3. New cross-modal vision models: Grounding DINO, SAM
+4. New multi-modality fusion model: ImageBind
+5. New text-to-image models: SDXL, supporting Text2Image, Img2Img, Inpainting, and InstructPix2Pix tasks plus DreamBooth LoRA training; UniDiffuser, supporting text-to-image and image-to-text through a unified multimodal diffusion process; the text-conditioned video generation model LVDM with training and inference; the text-to-image models Kandinsky 2.2 and Consistency Models; ControlNet upgrades supporting ControlNetImg2Img, ControlNetInpaint, StableDiffusionXLControlNet, and more
+
+#### Featured applications
+1. New cross-modal large-model application pipeline AppFlow
+2. New chat-based image editing application
+3. New auto-labeling application
VLMEvalKit_old/PaddleMIX/docs/FAQ.md ADDED
File without changes
VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md ADDED
@@ -0,0 +1,10 @@
+# Train Tutorial
+
+
+## Fine-tuning examples
+- [Blip2](../paddlemix/examples/blip2/README.md)
+- [clip](../paddlemix/examples/clip/README.md)
+- [coca](../paddlemix/examples/coca/README.md)
+- [eva02](../paddlemix/examples/eva02/README.md)
+- [evaclip](../paddlemix/examples/evaclip/README.md)
+- [Stable Diffusion](../ppdiffusers/examples/text_to_image/README.md)
VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE ADDED
@@ -0,0 +1,203 @@
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile ADDED
@@ -0,0 +1,30 @@
1
+
2
+ .DEFAULT_GOAL := all
3
+
4
+ .PHONY: all
5
+ all: deploy-version build deploy
6
+
7
+ .PHONY: build
8
+ build:
9
+ python3 setup.py sdist bdist_wheel
10
+
11
+ .PHONY: deploy
12
+ deploy:
13
+ make deploy-version
14
+ twine upload --skip-existing dist/*
15
+
16
+ .PHONY: deploy-version
17
+ deploy-version:
18
+ echo "VERSION = '$$(cat VERSION)'" > ppdiffusers/version.py
19
+
20
+ .PHONY: install
21
+ install:
22
+ pip install -r requirements.txt
23
+
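+ # Bump the patch number in VERSION, commit the change, and print the new version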
24
+ .PHONY: version
25
+ version:
26
+ @newVersion=$$(awk -F. '{print $$1"."$$2"."$$3+1}' < VERSION) \
27
+ && echo $${newVersion} > VERSION \
28
+ && git add VERSION \
29
+ && git commit -m "🔥 update version to $${newVersion}" > /dev/null \
30
+ && echo "Bumped version to $${newVersion}"
VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md ADDED
@@ -0,0 +1,1278 @@
1
+ <div align="center">
2
+ <img src="https://user-images.githubusercontent.com/11793384/215372703-4385f66a-abe4-44c7-9626-96b7b65270c8.png" width="40%" height="40%" />
3
+ </div>
4
+
5
+ <p align="center">
6
+ <a href="https://pypi.org/project/ppdiffusers/"><img src="https://img.shields.io/pypi/pyversions/ppdiffusers"></a>
7
+ <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-yellow.svg"></a>
8
+ <a href="https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
9
+ </p>
10
+
11
+ <h4 align="center">
12
+ <a href=#特性> 特性 </a> |
13
+ <a href=#安装> 安装 </a> |
14
+ <a href=#快速开始> 快速开始 </a> |
15
+ <a href=#模型部署> 模型部署</a>
16
+ </h4>
17
+
18
+ # PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle
19
+
20
+ **PPDiffusers**是一款支持多种模态(如文本图像跨模态、图像、语音)扩散模型(Diffusion Model)训练和推理的国产化工具箱,依托于[**PaddlePaddle**](https://www.paddlepaddle.org.cn/)框架和[**PaddleNLP**](https://github.com/PaddlePaddle/PaddleNLP)自然语言处理开发库。
21
+
22
+ ## News 📢
23
+ * 🔥 **2024.10.18 发布 0.29.0 版本,新增图像生成模型[Stable Diffusion 3 (SD3)](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/text_to_image/README_sd3.md),支持DreamBooth训练及高性能推理;SD3、SDXL适配昇腾910B,提供国产计算芯片上的训推能力;DIT支持[高性能推理](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/class_conditional_image_generation/DiT/README.md#23-paddle-inference-%E9%AB%98%E6%80%A7%E8%83%BD%E6%8E%A8%E7%90%86);支持PaddleNLP 3.0 beta版本。**
24
+
25
+ * 🔥 **2024.07.15 发布 0.24.1 版本,新增[Open-Sora](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/Open-Sora),支持模型训练和推理;全面支持Paddle 3.0。**
26
+
27
+ * 🔥 **2024.04.17 发布 0.24.0 版本,支持[Sora相关技术](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/sora),支持[DiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT)、[SiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT#exploring-flow-and-diffusion-based-generative-models-with-scalable-interpolant-transformers-sit)、[UViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_mscoco_uvit)训练推理,新增[NaViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/navit)、[MAGVIT-v2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/video_tokenizer/magvit2)模型;
28
+ 视频生成能力全面升级;
29
+ 新增视频生成模型[SVD](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_video_diffusion),支持模型微调和推理;
30
+ 新增姿态可控视频生成模型[AnimateAnyone](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/AnimateAnyone)、即插即用视频生成模型[AnimateDiff](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/inference/text_to_video_generation_animediff.py)、GIF视频生成模型[Hotshot-XL](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/community/Hotshot-XL);
31
+ 新增高速推理文图生成模型[LCM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/consistency_distillation),支持SD/SDXL训练和推理;
32
+ [模型推理部署](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)全面升级;新增peft,accelerate后端;
33
+ 权重加载/保存全面升级,支持分布式、模型切片、safetensors等场景,相关能力已集成DiT、 [IP-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/ip_adapter)、[PhotoMaker](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/PhotoMaker)、[InstantID](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/InstantID)等。**
34
+ * 🔥 **2023.12.12 发布 0.19.4 版本,修复已知的部分 BUG,修复 0D Tensor 的 Warning,新增 SDXL 的 FastdeployPipeline。**
35
+ * 🔥 **2023.09.27 发布 0.19.3 版本,新增[SDXL](#文本图像多模),支持Text2Image、Img2Img、Inpainting、InstructPix2Pix等任务,支持DreamBooth Lora训练;
36
+ 新增[UniDiffuser](#文本图像多模),通过统一的多模态扩散过程支持文生图、图生文等任务;
37
+ 新增文本条件视频生成模型[LVDM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_video_lvdm),支持训练与推理;
38
+ 新增文图生成模型[Kandinsky 2.2](#文本图像多模),[Consistency models](#文本图像多模);
39
+ Stable Diffusion支持[BF16 O2训练](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_diffusion),效果对齐FP32;
40
+ [LoRA加载升级](#加载HF-LoRA权重),支持加载SDXL的LoRA权重;
41
+ [Controlnet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/ppdiffusers/pipelines/controlnet)升级,支持ControlNetImg2Img、ControlNetInpaint、StableDiffusionXLControlNet等。**
42
+
43
+
44
+
45
+
46
+ ## 特性
47
+ #### 📦 SOTA扩散模型Pipelines集合
48
+ 我们提供**SOTA(State-of-the-Art)** 的扩散模型Pipelines集合。
49
+ 目前**PPDiffusers**已经集成了**100+Pipelines**,支持包括文图生成(Text-to-Image Generation)、文本引导的图像编辑(Text-Guided Image Inpainting)、文本引导的图像变换(Image-to-Image Text-Guided Generation)、文本条件的视频生成(Text-to-Video Generation)、超分(Super-Resolution)、文本条件的音频生成(Text-to-Audio Generation)在内的**10余项**任务,覆盖**文本、图像、视频、音频**等多种模态。
50
+ 如果想要了解当前支持的所有**Pipelines**以及对应的来源信息,可以阅读[🔥 PPDiffusers Pipelines](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/README.md)文档。
51
+
52
+
53
+ #### 🔊 提供丰富的Noise Scheduler
54
+ 我们提供了丰富的**噪声调度器(Noise Scheduler)**,可以对**速度**与**质量**进行权衡,用户可在推理时根据需求快速切换使用。
55
+ 当前**PPDiffusers**已经集成了**14+Scheduler**,不仅支持 [DDPM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py)、[DDIM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py) 和 [PNDM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py),还支持最新的 [🔥 DPMSolver](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py)!
56
+
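+ 下面给出一个切换调度器的简单示意(以 DPMSolver 为例;其中权重名 `runwayml/stable-diffusion-v1-5` 与推理步数仅作演示,可按需替换):
+ ```python
+ from ppdiffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
+
+ # 加载 pipeline(权重名仅作演示)
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+ # 用 DPMSolver 替换默认调度器,通常可以用更少的推理步数获得接近的生成质量
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+ image = pipe("a photo of an astronaut riding a horse on mars", num_inference_steps=25).images[0]
+ image.save("astronaut_dpmsolver.png")
+ ```
+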
57
+ #### 🎛️ 提供多种扩散模型组件
58
+ 我们提供了**多种扩散模型**组件,如[UNet1DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_1d.py)、[UNet2DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d.py)、[UNet2DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d_condition.py)、[UNet3DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_3d_condition.py)、[VQModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)、[AutoencoderKL](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)等。
59
+
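+ 下面是单独加载某个组件的简单示意(假设所用权重仓库包含相应的 `unet`、`vae` 子目录,权重名仅作演示):
+ ```python
+ from ppdiffusers import AutoencoderKL, UNet2DConditionModel
+
+ # 从同一个权重仓库中分别加载 UNet 与 VAE 组件(子目录名为常见约定,具体以所用权重为准)
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+ vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
+ print(unet.config.sample_size, vae.config.latent_channels)
+ ```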
60
+
61
+ #### 📖 提供丰富的训练和推理教程
62
+ 我们提供了丰富的训练教程,不仅支持扩散模型的二次开发微调,如基于[Textual Inversion](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/textual_inversion)和[DreamBooth](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/dreambooth)使用3-5张图定制化训练生成图像的风格或物体,还支持[🔥 Latent Diffusion Model](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_laion400m)、[🔥 ControlNet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/controlnet)、[🔥 T2I-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/t2i-adapter) 等扩散模型的训练!
63
+ 此外,我们还提供了丰富的[🔥 Pipelines推理样例](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/inference)。
64
+
65
+ #### 🚀 支持FastDeploy高性能部署
66
+ 我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)的[🔥 高性能Stable Diffusion Pipeline](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py),更多有关FastDeploy进行多推理引擎后端高性能部署的信息请参考[🔥 高性能FastDeploy推理教程](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)。
67
+
68
+ ## 安装
69
+
70
+ ### 环境依赖
71
+ ```
72
+ pip install -r requirements.txt
73
+ ```
74
+ 关于PaddlePaddle安装的详细教程请查看[Installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)。
75
+
76
+ ### pip安装
77
+
78
+ ```shell
79
+ pip install --upgrade ppdiffusers
80
+ ```
81
+
82
+ ### 手动安装
83
+ ```shell
84
+ git clone https://github.com/PaddlePaddle/PaddleMIX
85
+ cd PaddleMIX/ppdiffusers
86
+ python setup.py install
87
+ ```
88
+ ### 设置代理
89
+ ```shell
90
+ export HF_HUB_ENABLE_HF_TRANSFER=1
91
+ export HF_ENDPOINT=https://hf-mirror.com
92
+ ```
93
+
94
+ ## 快速开始
95
+ 我们将以扩散模型的典型代表**Stable Diffusion**为例,带你快速了解PPDiffusers。
96
+
97
+ **Stable Diffusion**基于**潜在扩散模型(Latent Diffusion Models)**,专门用于**文图生成(Text-to-Image Generation)任务**。该模型是由来自 [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/)以及[RunwayML](https://runwayml.com/)的工程师共同开发完成,目前发布了v1和v2两个版本。v1版本采用了LAION-5B数据集子集(分辨率为 512x512)进行训练,并具有以下架构设置:自动编码器下采样因子为8,UNet大小为860M,文本编码器为CLIP ViT-L/14。v2版本相较于v1版本在生成图像的质量和分辨率等进行了改善。
98
+
99
+ ### Stable Diffusion重点模型权重
100
+
101
+ <details><summary>&emsp; Stable Diffusion 模型支持的权重(英文) </summary>
102
+
103
+ **我们只需要将下面的"xxxx",替换成所需的权重名,即可快速使用!**
104
+ ```python
105
+ from ppdiffusers import *
106
+
107
+ pipe_text2img = StableDiffusionPipeline.from_pretrained("xxxx")
108
+ pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained("xxxx")
109
+ pipe_inpaint_legacy = StableDiffusionInpaintPipelineLegacy.from_pretrained("xxxx")
110
+ pipe_mega = StableDiffusionMegaPipeline.from_pretrained("xxxx")
111
+
112
+ # pipe_mega.text2img() 等于 pipe_text2img()
113
+ # pipe_mega.img2img() 等于 pipe_img2img()
114
+ # pipe_mega.inpaint_legacy() 等于 pipe_inpaint_legacy()
115
+ ```
116
+
117
+ | PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 |
118
+ | :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: |
119
+ | CompVis/stable-diffusion-v1-4 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-4 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **225k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/CompVis/stable-diffusion-v1-4) |
120
+ | CompVis/ldm-text2im-large-256 | LDMTextToImagePipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-KL-8-G* 权重。| [地址](https://huggingface.co/CompVis/ldm-text2im-large-256) |
121
+ | CompVis/ldm-super-resolution-4x-openimages | LDMSuperResolutionPipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-VQ-4 权重,[原始权重链接](https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip)。| [地址](https://huggingface.co/CompVis/ldm-super-resolution-4x-openimages) |
122
+ | runwayml/stable-diffusion-v1-5 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-5 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **595k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型同样也使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/runwayml/stable-diffusion-v1-5) |
123
+ | runwayml/stable-diffusion-inpainting | StableDiffusionInpaintPipeline | Stable-Diffusion-Inpainting 使用 Stable-Diffusion-v1-2 的权重进行初始化。首先进行了 **595k** 步的常规训练(实际也就是 Stable-Diffusion-v1-5 的权重),然后进行了 **440k** 步的 inpainting 修复训练。对于 inpainting 修复训练,给 UNet 额外增加了 **5** 输入通道(其中 **4** 个用于被 Mask 遮盖住的图片,**1** 个用于 Mask 本身)。在训练期间,会随机生成 Mask,并有 **25%** 概率会将原始图片全部 Mask 掉。| [地址](https://huggingface.co/runwayml/stable-diffusion-inpainting) |
124
+ | stabilityai/stable-diffusion-2-base | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型首先在 [LAION-5B 256x256 子集上](https://laion.ai/blog/laion-5b/) (过滤条件:[punsafe = 0.1 的 LAION-NSFW 分类器](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) 和 审美分数大于等于 4.5 )从头开始训练 **550k** 步,然后又在分辨率 **>= 512x512** 的同一数据集上进一步训练 **850k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-base) |
125
+ | stabilityai/stable-diffusion-2 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | stable-diffusion-2 使用 stable-diffusion-2-base 权重进行初始化,首先在同一数据集上(**512x512** 分辨率)使用 [v-objective](https://arxiv.org/abs/2202.00512) 训练了 **150k** 步。然后又在 **768x768** 分辨率上使用 [v-objective](https://arxiv.org/abs/2202.00512) 继续训练了 **140k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2) |
126
+ | stabilityai/stable-diffusion-2-inpainting | StableDiffusionInpaintPipeline |stable-diffusion-2-inpainting 使用 stable-diffusion-2-base 权重初始化,并且额外训练了 **200k** 步。训练过程使用了 [LAMA](https://github.com/saic-mdal/lama) 中提出的 Mask 生成策略,并且使用 Mask 图片的 Latent 表示(经过 VAE 编码)作为附加条件。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) |
127
+ | stabilityai/stable-diffusion-x4-upscaler | StableDiffusionUpscalePipeline | 该模型在**LAION 10M** 子集上(>2048x2048)训练了 1.25M 步。该模型还在分辨率为 **512x512** 的图像上使用 [Text-guided Latent Upscaling Diffusion Model](https://arxiv.org/abs/2112.10752) 进行了训练。除了**文本输入**之外,它还接收 **noise_level** 作为输入参数,因此我们可以使用 [预定义的 Scheduler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/low_res_scheduler/scheduler_config.json) 向低分辨率的输入图片添加噪声。| [地址](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) |
128
+ | hakurei/waifu-diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-2 使用 stable-diffusion-v1-4 权重初始化,并且在**高质量动漫**图像数据集上进行微调后得到的模型。用于微调的数据是 **680k** 文本图像样本,这些样本是通过 **booru 网站** 下载的。| [地址](https://huggingface.co/hakurei/waifu-diffusion) |
129
+ | hakurei/waifu-diffusion-v1-3 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-3 是 waifu-diffusion-v1-2 基础上进一步训练得到的。他们对数据集进行了额外操作:(1)删除下划线;(2)删除括号;(3)用逗号分隔每个booru 标签;(4)随机化标签顺序。| [地址](https://huggingface.co/hakurei/waifu-diffusion) |
130
+ | naclbit/trinart_stable_diffusion_v2_60k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | trinart_stable_diffusion 使用 stable-diffusion-v1-4 权重初始化,在 40k **高分辨率漫画/动漫风格**的图片数据集上微调了 8 个 epoch。V2 版模型使用 **dropouts**、**10k+ 图像**和**新的标记策略**训练了**更长时间**。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
131
+ | naclbit/trinart_stable_diffusion_v2_95k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **95k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
132
+ | naclbit/trinart_stable_diffusion_v2_115k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **115k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
133
+ | Deltaadams/Hentai-Diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | None| [地址](https://huggingface.co/Deltaadams/Hentai-Diffusion) |
134
+ | ringhyacinth/nail-set-diffuser | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 美甲领域的扩散模型,训练数据使用了 [Weekend](https://weibo.com/u/5982308498)| [地址](https://huggingface.co/ringhyacinth/nail-set-diffuser) |
135
+ | Linaqruf/anything-v3.0 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型可通过输入几个文本提示词就能生成**高质量、高度详细的动漫风格图片**,该模型支持使用 **danbooru 标签文本** 生成图像。| [地址](https://huggingface.co/Linaqruf/anything-v3.0) |
136
+
137
+ </details>
138
+ <details><summary>&emsp; Stable Diffusion 模型支持的权重(中文和多语言) </summary>
139
+
140
+
141
+ | PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 |
142
+ | :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: |
143
+ | BAAI/AltDiffusion | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline | 该模型使用 [AltCLIP](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,在 Stable Diffusion 基础上训练了**双语Diffusion模型**,其中训练数据来自 [WuDao数据集](https://data.baai.ac.cn/details/WuDaoCorporaText) 和 [LAION](https://huggingface.co/datasets/ChristophSchuhmann/improved_aesthetics_6plus) 。| [地址](https://huggingface.co/BAAI/AltDiffusion) |
144
+ | BAAI/AltDiffusion-m9 | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline |该模型使用9种语言的 [AltCLIP-m9](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,其他同上。| [地址](https://huggingface.co/BAAI/AltDiffusion-m9) |
145
+ | IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 作为初始化的text encoder,冻住 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型的其他部分,只训练 text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。该模型目前在0.2亿图文对上训练了一个 epoch。 在 32 x A100 上训练了大约100小时,该版本只是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1) |
146
+ | IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型进行继续训练,其中训练分为**两个stage**。**第一个stage** 中冻住模型的其他部分,只训练 text encoder ,以便保留原始模型的生成能力且实现中文概念的对齐。**第二个stage** 中将全部模型解冻,一起训练 text encoder 和 diffusion model ,以便 diffusion model 更好的适配中文引导。第一个 stage 他们训练了 80 小时,第二个 stage 训练了 100 小时,两个stage都是用了8 x A100,该版本是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1) |
147
+ </details>
148
+
149
+
150
+ ### 加载HF Diffusers权重
151
+ ```python
152
+ from ppdiffusers import StableDiffusionPipeline
153
+ # 设置from_hf_hub为True,表示从huggingface hub下载,from_diffusers为True表示加载的是diffusers版Pytorch权重
154
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", from_hf_hub=True, from_diffusers=True)
155
+ ```
156
+
157
+ ### 加载原库的Lightning权重
158
+ ```python
159
+ from ppdiffusers import StableDiffusionPipeline
160
+ # 可输入网址 或 本地ckpt、safetensors文件
161
+ pipe = StableDiffusionPipeline.from_single_file("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/ppdiffusers/chilloutmix_NiPrunedFp32Fix.safetensors")
162
+ ```
163
+
164
+ ### 加载HF LoRA权重
165
+ ```python
166
+ import paddle
+ from ppdiffusers import DiffusionPipeline
167
+
168
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16)
169
+
170
+ pipe.load_lora_weights("stabilityai/stable-diffusion-xl-base-1.0",
171
+ weight_name="sd_xl_offset_example-lora_1.0.safetensors",
172
+ from_diffusers=True)
173
+ ```
174
+
175
+ ### 加载Civitai社区的LoRA权重
176
+ ```python
177
+ from ppdiffusers import StableDiffusionPipeline
178
+ pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix")
179
+ # 加载lora权重
180
+ pipe.load_lora_weights("./",
181
+ weight_name="Moxin_10.safetensors",
182
+ from_diffusers=True)
183
+ pipe.fuse_lora()
184
+ ```
185
+
186
+ ### XFormers加速
187
+ 为了使用**XFormers加速**,我们需要安装`develop`版本的`paddle`,Linux系统的安装命令如下:
188
+ ```sh
189
+ python -m pip install paddlepaddle-gpu==0.0.0.post117 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
190
+ ```
191
+
192
+ ```python
193
+ import paddle
194
+ from ppdiffusers import StableDiffusionPipeline
195
+ pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix", paddle_dtype=paddle.float16)
196
+ # 开启xformers加速 默认选择"cutlass"加速
197
+ pipe.enable_xformers_memory_efficient_attention()
198
+ # flash 需要使用 A100、A10、3060、3070、3080、3090 等以上显卡。
199
+ # pipe.enable_xformers_memory_efficient_attention("flash")
200
+ ```
201
+
202
+ ### ToME + ControlNet
203
+ ```python
204
+ # 安装develop的ppdiffusers
205
+ # pip install "ppdiffusers>=0.24.0"
206
+ import paddle
207
+ from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline
208
+ from ppdiffusers.utils import load_image
209
+
210
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
211
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
212
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, paddle_dtype=paddle.float16
213
+ )
214
+
215
+ # Apply ToMe with a 50% merging ratio
216
+ pipe.apply_tome(ratio=0.5) # Can also use pipe.unet in place of pipe here
217
+
218
+ # 我们可以开启 xformers
219
+ # pipe.enable_xformers_memory_efficient_attention()
220
+ generator = paddle.Generator().manual_seed(0)
221
+ prompt = "bird"
222
+ image = load_image(
223
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
224
+ )
225
+
226
+ image = pipe(prompt, image, generator=generator).images[0]
227
+
228
+ image.save("bird.png")
229
+ ```
230
+
231
+ ### 文图生成 (Text-to-Image Generation)
232
+
233
+ ```python
234
+ import paddle
235
+ from ppdiffusers import StableDiffusionPipeline
236
+
237
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
238
+
239
+ # 设置随机种子,我们可以复现下面的结果!
240
+ paddle.seed(5232132133)
241
+ prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing "
242
+ image = pipe(prompt, guidance_scale=7.5, height=768, width=768).images[0]
243
+
244
+ image.save("shiba_dog_with_a_red_cap.png")
245
+ ```
246
+ <div align="center">
247
+ <img width="500" alt="image" src="https://user-images.githubusercontent.com/50394665/204796701-d7911f76-8670-47d5-8d1b-8368b046c5e4.png">
248
+ </div>
249
+
250
+ ### 文本引导的图像变换(Image-to-Image Text-Guided Generation)
251
+
252
+ <details><summary>&emsp;Image-to-Image Text-Guided Generation Demo </summary>
253
+
254
+ ```python
255
+ import paddle
256
+ from ppdiffusers import StableDiffusionImg2ImgPipeline
257
+ from ppdiffusers.utils import load_image
258
+
259
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("Linaqruf/anything-v3.0", safety_checker=None)
260
+
261
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
262
+ image = load_image(url).resize((512, 768))
263
+
264
+ # 设置随机种子,我们可以复现下面的结果!
265
+ paddle.seed(42)
266
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
267
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
268
+
269
+ image = pipe(prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5).images[0]
270
+ image.save("image_Kurisu_img2img.png")
271
+ ```
272
+ <div align="center">
273
+ <img width="500" alt="image" src="https://user-images.githubusercontent.com/50394665/204799529-cd89dcdb-eb1d-4247-91ac-b0f7bad777f8.png">
274
+ </div>
275
+ </details>
276
+
277
+ ### 文本引导的图像编辑(Text-Guided Image Inpainting)
278
+
279
+ 注意!当前有两种版本的图像编辑代码,一个是Legacy版本,一个是正式版本,下面将分别介绍两种代码如何使用!
280
+
281
+ <details><summary>&emsp;Legacy版本代码</summary>
282
+
283
+ ```python
284
+ import paddle
285
+ from ppdiffusers import StableDiffusionInpaintPipelineLegacy
286
+ from ppdiffusers.utils import load_image
287
+
288
+ # 可选模型权重
289
+ # CompVis/stable-diffusion-v1-4
290
+ # runwayml/stable-diffusion-v1-5
291
+ # stabilityai/stable-diffusion-2-base (原始策略 512x512)
292
+ # stabilityai/stable-diffusion-2 (v-objective 768x768)
293
+ # Linaqruf/anything-v3.0
294
+ # ......
295
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
296
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
297
+
298
+ image = load_image(img_url).resize((512, 512))
299
+ mask_image = load_image(mask_url).resize((512, 512))
300
+
301
+ pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("stabilityai/stable-diffusion-2-base", safety_checker=None)
302
+
303
+ # 设置随机种子,我们可以复现下面的结果!
304
+ paddle.seed(10245)
305
+ prompt = "a red cat sitting on a bench"
306
+ image = pipe(prompt=prompt, image=image, mask_image=mask_image, strength=0.75).images[0]
307
+
308
+ image.save("a_red_cat_legacy.png")
309
+ ```
310
+ <div align="center">
311
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/204802186-5a6d302b-83aa-4247-a5bb-ebabfcc3abc4.png">
312
+ </div>
313
+
314
+ </details>
315
+
316
+ <details><summary>&emsp;正式版本代码</summary>
317
+
318
+ Tips: 下面的使用方法是新版本的代码,也是官方推荐的代码,注意必须配合 **runwayml/stable-diffusion-inpainting** 和 **stabilityai/stable-diffusion-2-inpainting** 才可正常使用。
319
+ ```python
320
+ import paddle
321
+ from ppdiffusers import StableDiffusionInpaintPipeline
322
+ from ppdiffusers.utils import load_image
323
+
324
+ # 可选模型权重
325
+ # runwayml/stable-diffusion-inpainting
326
+ # stabilityai/stable-diffusion-2-inpainting
327
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
328
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
329
+
330
+ image = load_image(img_url).resize((512, 512))
331
+ mask_image = load_image(mask_url).resize((512, 512))
332
+
333
+ pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
334
+
335
+ # 设置随机种子,我们可以复现下面的结果!
336
+ paddle.seed(1024)
337
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
338
+ image = pipe(prompt=prompt, image=image, mask_image=mask_image).images[0]
339
+
340
+ image.save("a_yellow_cat.png")
341
+ ```
342
+ <div align="center">
343
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/204801946-6cd043bc-f3db-42cf-82cd-6a6171484523.png">
344
+ </div>
345
+ </details>
346
+
347
+ ### 文本引导的图像放大 & 超分(Text-Guided Image Upscaling & Super-Resolution)
348
+
349
+ <details><summary>&emsp;Text-Guided Image Upscaling Demo</summary>
350
+
351
+ ```python
352
+ import paddle
353
+ from ppdiffusers import StableDiffusionUpscalePipeline
354
+ from ppdiffusers.utils import load_image
355
+
356
+ pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
357
+
358
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
359
+ # 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍!
360
+ low_res_img = load_image(url).resize((128, 128))
361
+
362
+ prompt = "a white cat"
363
+ image = pipe(prompt=prompt, image=low_res_img).images[0]
364
+
365
+ image.save("upscaled_white_cat.png")
366
+ ```
367
+ <div align="center">
368
+ <img width="200" alt="image" src="https://user-images.githubusercontent.com/50394665/204806180-b7f1b9cf-8a62-4577-b5c4-91adda08a13b.png">
369
+ <img width="400" alt="image" src="https://user-images.githubusercontent.com/50394665/204806202-8c110be3-5f48-4946-95ea-21ad5a9a2340.png">
370
+ </div>
371
+ </details>
372
+
373
+ <details><summary>&emsp;Super-Resolution Demo</summary>
374
+
375
+ ```python
376
+ import paddle
377
+ from ppdiffusers import LDMSuperResolutionPipeline
378
+ from ppdiffusers.utils import load_image
379
+
380
+ pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
381
+
382
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
383
+
384
+ # 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍!
385
+ low_res_img = load_image(url).resize((128, 128))
386
+
387
+ image = pipe(image=low_res_img, num_inference_steps=100).images[0]
388
+
389
+ image.save("ldm-super-resolution-image.png")
390
+ ```
391
+ <div align="center">
392
+ <img width="200" alt="image" src="https://user-images.githubusercontent.com/50394665/204804426-5e28b571-aa41-4f56-ba26-68cca75fdaae.png">
393
+ <img width="400" alt="image" src="https://user-images.githubusercontent.com/50394665/204804148-fe7c293b-6cd7-4942-ae9c-446369fe8410.png">
394
+ </div>
395
+
396
+ </details>
397
+
398
+ ## 模型推理部署
399
+ 除了**Paddle动态图**运行之外,很多模型还支持将模型导出并使用推理引擎运行。我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)上的**StableDiffusion**模型部署示例,涵盖文生图、图生图、图像编辑等任务,用户可以按照我们提供[StableDiffusion模型导出教程](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/deploy/export.md)将模型导出,然后使用`FastDeployStableDiffusionMegaPipeline`进行高性能推理部署!
400
+
401
+ <details><summary>&emsp; 已预先导出的FastDeploy版Stable Diffusion权重 </summary>
402
+
403
+ **注意:当前导出的vae encoder带有随机因素!**
404
+
405
+ - CompVis/stable-diffusion-v1-4@fastdeploy
406
+ - runwayml/stable-diffusion-v1-5@fastdeploy
407
+ - runwayml/stable-diffusion-inpainting@fastdeploy
408
+ - stabilityai/stable-diffusion-2-base@fastdeploy
409
+ - stabilityai/stable-diffusion-2@fastdeploy
410
+ - stabilityai/stable-diffusion-2-inpainting@fastdeploy
411
+ - Linaqruf/anything-v3.0@fastdeploy
412
+ - hakurei/waifu-diffusion-v1-3@fastdeploy
413
+
414
+ </details>
415
+
416
+ <details><summary>&emsp; FastDeploy Demo </summary>
417
+
418
+ ```python
419
+ import paddle
420
+ import fastdeploy as fd
421
+ from ppdiffusers import FastDeployStableDiffusionMegaPipeline
422
+ from ppdiffusers.utils import load_image
423
+
424
+ def create_runtime_option(device_id=0, backend="paddle", use_cuda_stream=True):
425
+ option = fd.RuntimeOption()
426
+ if backend == "paddle":
427
+ option.use_paddle_backend()
428
+ else:
429
+ option.use_ort_backend()
430
+ if device_id == -1:
431
+ option.use_cpu()
432
+ else:
433
+ option.use_gpu(device_id)
434
+ if use_cuda_stream:
435
+ paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream
436
+ option.set_external_raw_stream(paddle_stream)
437
+ return option
438
+
439
+ runtime_options = {
440
+ "text_encoder": create_runtime_option(0, "paddle"), # use gpu:0
441
+ "vae_encoder": create_runtime_option(0, "paddle"), # use gpu:0
442
+ "vae_decoder": create_runtime_option(0, "paddle"), # use gpu:0
443
+ "unet": create_runtime_option(0, "paddle"), # use gpu:0
444
+ }
445
+
446
+ fd_pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained(
447
+ "Linaqruf/anything-v3.0@fastdeploy", runtime_options=runtime_options
448
+ )
449
+
450
+ # text2img
451
+ prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing "
452
+ image_text2img = fd_pipe.text2img(prompt=prompt, num_inference_steps=50).images[0]
453
+ image_text2img.save("image_text2img.png")
454
+
455
+ # img2img
456
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
457
+ image = load_image(url).resize((512, 512))
458
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
459
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
460
+
461
+ image_img2img = fd_pipe.img2img(
462
+ prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5
463
+ ).images[0]
464
+ image_img2img.save("image_img2img.png")
465
+
466
+ # inpaint_legacy
467
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
468
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
469
+ image = load_image(img_url).resize((512, 512))
470
+ mask_image = load_image(mask_url).resize((512, 512))
471
+ prompt = "a red cat sitting on a bench"
472
+
473
+ image_inpaint_legacy = fd_pipe.inpaint_legacy(
474
+ prompt=prompt, image=image, mask_image=mask_image, strength=0.75, num_inference_steps=50
475
+ ).images[0]
476
+ image_inpaint_legacy.save("image_inpaint_legacy.png")
477
+ ```
478
+ </details>
479
+ <div align="center">
480
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/205297240-46b80992-34af-40cd-91a6-ae76589d0e21.png">
481
+ </div>
482
+
483
+
484
+ ## 更多任务分类展示
485
+ ### 文本图像多模
486
+
487
+ <details open>
488
+ <summary>&emsp;文图生成(Text-to-Image Generation)</summary>
489
+
490
+ #### text_to_image_generation-stable_diffusion
491
+
492
+ ```python
493
+ from ppdiffusers import StableDiffusionPipeline
494
+
495
+ # 加载模型和scheduler
496
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
497
+
498
+ # 执行pipeline进行推理
499
+ prompt = "a photo of an astronaut riding a horse on mars"
500
+ image = pipe(prompt).images[0]
501
+
502
+ # 保存图片
503
+ image.save("astronaut_rides_horse_sd.png")
504
+ ```
505
+ <div align="center">
506
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209322401-6ecfeaaa-6878-4302-b592-07a31de4e590.png">
507
+ </div>
508
+
509
+ #### text_to_image_generation-stable_diffusion_xl
510
+
511
+ ```python
512
+ import paddle
513
+ from ppdiffusers import StableDiffusionXLPipeline
514
+
515
+ pipe = StableDiffusionXLPipeline.from_pretrained(
516
+ "stabilityai/stable-diffusion-xl-base-1.0",
517
+ paddle_dtype=paddle.float16,
518
+ variant="fp16"
519
+ )
520
+ prompt = "a photo of an astronaut riding a horse on mars"
521
+ generator = paddle.Generator().manual_seed(42)
522
+ image = pipe(prompt=prompt, generator=generator, num_inference_steps=50).images[0]
523
+ image.save('sdxl_text2image.png')
524
+ ```
525
+ <div align="center">
526
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/d72729f9-8685-48f9-a238-e4ddf6d264f3">
527
+ </div>
528
+
529
+ #### text_to_image_generation-sdxl_base_with_refiner
530
+
531
+ ```python
532
+ from ppdiffusers import DiffusionPipeline
533
+ import paddle
534
+
535
+ # load both base & refiner
536
+ base = DiffusionPipeline.from_pretrained(
537
+ "stabilityai/stable-diffusion-xl-base-1.0",
538
+ paddle_dtype=paddle.float16,
539
+ )
540
+ refiner = DiffusionPipeline.from_pretrained(
541
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
542
+ text_encoder_2=base.text_encoder_2,
543
+ vae=base.vae,
544
+ paddle_dtype=paddle.float16,
545
+ variant="fp16",
546
+ )
547
+
548
+ # Define how many steps and what % of steps to be run on each experts (80/20) here
549
+ n_steps = 40
550
+ high_noise_frac = 0.8
551
+
552
+ prompt = "A majestic lion jumping from a big stone at night"
553
+ prompt = "a photo of an astronaut riding a horse on mars"
554
+ generator = paddle.Generator().manual_seed(42)
555
+
556
+ # run both experts
557
+ image = base(
558
+ prompt=prompt,
559
+ output_type="latent",
560
+ generator=generator,
561
+ ).images
562
+
563
+ image = refiner(
564
+ prompt=prompt,
565
+ image=image,
566
+ generator=generator,
567
+ ).images[0]
568
+ image.save('text_to_image_generation-sdxl-base-with-refiner-result.png')
569
+ ```
570
+ <div align="center">
571
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/8ef36826-ed94-4856-a356-af1677f60d1b">
572
+ </div>
573
+
574
+ #### text_to_image_generation-kandinsky2_2
575
+ ```python
576
+ from ppdiffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline
577
+
578
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
579
+ prompt = "red cat, 4k photo"
580
+ out = pipe_prior(prompt)
581
+ image_emb = out.image_embeds
582
+ zero_image_emb = out.negative_image_embeds
583
+ pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")
584
+ image = pipe(
585
+ image_embeds=image_emb,
586
+ negative_image_embeds=zero_image_emb,
587
+ height=768,
588
+ width=768,
589
+ num_inference_steps=50,
590
+ ).images
591
+ image[0].save("text_to_image_generation-kandinsky2_2-result-cat.png")
592
+ ```
593
+ <div align="center">
594
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/188f76dd-4bd7-4a33-8f30-b893c7a9e249">
595
+ </div>
596
+
597
+ #### text_to_image_generation-unidiffuser
598
+ ```python
599
+ import paddle
600
+ from paddlenlp.trainer import set_seed
601
+
602
+ from ppdiffusers import UniDiffuserPipeline
603
+
604
+ model_id_or_path = "thu-ml/unidiffuser-v1"
605
+ pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, paddle_dtype=paddle.float16)
606
+ set_seed(42)
607
+
608
+ # Text variation can be performed with a text-to-image generation followed by a image-to-text generation:
609
+ # 1. Text-to-image generation
610
+ prompt = "an elephant under the sea"
611
+ sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0)
612
+ t2i_image = sample.images[0]
613
+ t2i_image.save("t2i_image.png")
614
+ ```
615
+ <div align="center">
616
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/a6eb11d2-ad27-4263-8cb4-b0d8dd42b36c">
617
+ </div>
618
+
619
+ #### text_to_image_generation-deepfloyd_if
620
+
621
+ ```python
622
+ import paddle
623
+
624
+ from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline
625
+ from ppdiffusers.utils import pd_to_pil
626
+
627
+ # Stage 1: generate images
628
+ pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
629
+ pipe.enable_xformers_memory_efficient_attention()
630
+ prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
631
+ prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
632
+ image = pipe(
633
+ prompt_embeds=prompt_embeds,
634
+ negative_prompt_embeds=negative_embeds,
635
+ output_type="pd",
636
+ ).images
637
+
638
+ # save intermediate image
639
+ pil_image = pd_to_pil(image)
640
+ pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_I.png")
641
+ # save gpu memory
642
+ pipe.to(paddle_device="cpu")
643
+
644
+ # Stage 2: super resolution stage1
645
+ super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained(
646
+ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16
647
+ )
648
+ super_res_1_pipe.enable_xformers_memory_efficient_attention()
649
+
650
+ image = super_res_1_pipe(
651
+ image=image,
652
+ prompt_embeds=prompt_embeds,
653
+ negative_prompt_embeds=negative_embeds,
654
+ output_type="pd",
655
+ ).images
656
+ # save intermediate image
657
+ pil_image = pd_to_pil(image)
658
+ pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png")
659
+ # save gpu memory
660
+ super_res_1_pipe.to(paddle_device="cpu")
661
+ ```
662
+ <div align="center">
663
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785766-700dfad9-159d-4bfb-bfc7-c18df938a052.png">
664
+ </div>
665
+ <div align="center">
666
+ <center>if_stage_I</center>
667
+ </div>
668
+ <div align="center">
669
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785773-3359ca5f-dadf-4cc8-b318-ff1f9d4a2d35.png">
670
+ </div>
671
+ <div align="center">
672
+ <center>if_stage_II</center>
673
+ <!-- <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785774-8870829a-354b-4a87-9d67-93af315f51e6.png">
674
+ <center>if_stage_III</center> -->
675
+ </div>
676
+ </details>
677
+
678
+
679
+ <details><summary>&emsp;文本引导的图像放大(Text-Guided Image Upscaling)</summary>
680
+
681
+ #### text_guided_image_upscaling-stable_diffusion_2
682
+
683
+ ```python
684
+ from ppdiffusers import StableDiffusionUpscalePipeline
685
+ from ppdiffusers.utils import load_image
686
+
687
+ pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
688
+
689
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
690
+ low_res_img = load_image(url).resize((128, 128))
691
+
692
+ prompt = "a white cat"
693
+ upscaled_image = pipe(prompt=prompt, image=low_res_img).images[0]
694
+ upscaled_image.save("upsampled_cat_sd2.png")
695
+ ```
696
+ <div align="center">
697
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209324085-0d058b70-89b0-43c2-affe-534eedf116cf.png">
698
+ <center>原图像</center>
699
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209323862-ce2d8658-a52b-4f35-90cb-aa7d310022e7.png">
700
+ <center>生成图像</center>
701
+ </div>
702
+ </details>
703
+
704
+ <details><summary>&emsp;文本引导的图像编辑(Text-Guided Image Inpainting)</summary>
705
+
706
+ #### image_guided_image_inpainting-paint_by_example
707
+
708
+ ```python
709
+ import paddle
710
+
711
+ from ppdiffusers import PaintByExamplePipeline
712
+ from ppdiffusers.utils import load_image
713
+
714
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/image_example_1.png"
715
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/mask_example_1.png"
716
+ example_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/reference_example_1.jpeg"
717
+
718
+ init_image = load_image(img_url).resize((512, 512))
719
+ mask_image = load_image(mask_url).resize((512, 512))
720
+ example_image = load_image(example_url).resize((512, 512))
721
+
722
+ pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
723
+
724
+ # 使用fp16加快生成速度
725
+ with paddle.amp.auto_cast(True):
726
+ image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
727
+ image.save("image_guided_image_inpainting-paint_by_example-result.png")
728
+ ```
729
+ <div align="center">
730
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118364-5d91f433-f9ac-4514-b5f0-cb4599905847.png" width=300>
731
+ <center>原图像</center>
733
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118361-0f78d6db-6896-4f8d-b1bd-8350192f7a4e.png" width=300>
734
+ <center>掩码图像</center>
736
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118368-305a048d-ddc3-4a5f-8915-58591ef680f0.jpeg" width=300>
737
+ <center>参考图像</center>
738
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247117963-e5b9b754-39a3-480b-a557-46a2f9310e79.png" width=300>
739
+ <center>生成图像</center>
740
+ </div>
741
+ </details>
742
+
743
+
744
+ <details><summary>&emsp;文本引导的图像变换(Image-to-Image Text-Guided Generation)</summary>
745
+
746
+ #### text_guided_image_inpainting-kandinsky2_2
747
+ ```python
748
+ import numpy as np
749
+ import paddle
750
+
751
+ from ppdiffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline
752
+ from ppdiffusers.utils import load_image
753
+
754
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
755
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
756
+ )
757
+ prompt = "a hat"
758
+ image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
759
+ pipe = KandinskyV22InpaintPipeline.from_pretrained(
760
+ "kandinsky-community/kandinsky-2-2-decoder-inpaint", paddle_dtype=paddle.float16
761
+ )
762
+ init_image = load_image(
763
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
764
+ )
765
+ mask = np.zeros((768, 768), dtype=np.float32)
766
+ mask[:250, 250:-250] = 1
767
+ out = pipe(
768
+ image=init_image,
769
+ mask_image=mask,
770
+ image_embeds=image_emb,
771
+ negative_image_embeds=zero_image_emb,
772
+ height=768,
773
+ width=768,
774
+ num_inference_steps=50,
775
+ )
776
+ image = out.images[0]
777
+ image.save("text_guided_image_inpainting-kandinsky2_2-result-cat_with_hat.png")
778
+ ```
779
+ <div align="center">
780
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/64a943d5-167b-4433-91c3-3cf9279714db">
781
+ <center>原图像</center>
782
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/f469c127-52f4-4173-a693-c06b92a052aa">
783
+ <center>生成图像</center>
784
+ </div>
785
+
786
+ #### image_to_image_text_guided_generation-stable_diffusion
787
+ ```python
788
+ import paddle
789
+
790
+ from ppdiffusers import StableDiffusionImg2ImgPipeline
791
+ from ppdiffusers.utils import load_image
792
+
793
+ # 加载pipeline
794
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
795
+
796
+ # 下载初始图片
797
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
798
+
799
+ init_image = load_image(url).resize((768, 512))
800
+
801
+ prompt = "A fantasy landscape, trending on artstation"
802
+ # 使用fp16加快生成速度
803
+ with paddle.amp.auto_cast(True):
804
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
805
+
806
+ image.save("fantasy_landscape.png")
807
+ ```
808
+ <div align="center">
809
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209327142-d8e1d0c7-3bf8-4a08-a0e8-b11451fc84d8.png">
810
+ <center>原图像</center>
811
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325799-d9ff279b-0d57-435f-bda7-763e3323be23.png">
812
+ <center>生成图像</center>
813
+ </div>
814
+
815
+ #### image_to_image_text_guided_generation-stable_diffusion_xl
816
+ ```python
817
+ import paddle
818
+ from ppdiffusers import StableDiffusionXLImg2ImgPipeline
819
+ from ppdiffusers.utils import load_image
820
+
821
+ pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
822
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
823
+ paddle_dtype=paddle.float16,
824
+ # from_hf_hub=True,
825
+ # from_diffusers=True,
826
+ variant="fp16"
827
+ )
828
+ url = "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop-0-19-3/000000009.png"
829
+ init_image = load_image(url).convert("RGB")
830
+ prompt = "a photo of an astronaut riding a horse on mars"
831
+ image = pipe(prompt, image=init_image).images[0]
832
+ image.save('sdxl_image2image.png')
833
+ ```
834
+ <div align="center">
835
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/41bd9381-2799-4bed-a5e2-ba312a2f8da9">
836
+ <center>Original image</center>
837
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/db672d03-2e3a-46ac-97fd-d80cca18dbbe">
838
+ <center>Generated image</center>
839
+ </div>
840
+
841
+ #### image_to_image_text_guided_generation-kandinsky2_2
842
+ ```python
843
+ import paddle
844
+
845
+ from ppdiffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline
846
+ from ppdiffusers.utils import load_image
847
+
848
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
849
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
850
+ )
851
+ prompt = "A red cartoon frog, 4k"
852
+ image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
853
+ pipe = KandinskyV22Img2ImgPipeline.from_pretrained(
854
+ "kandinsky-community/kandinsky-2-2-decoder", paddle_dtype=paddle.float16
855
+ )
856
+
857
+ init_image = load_image(
858
+ "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
859
+ )
860
+ image = pipe(
861
+ image=init_image,
862
+ image_embeds=image_emb,
863
+ negative_image_embeds=zero_image_emb,
864
+ height=768,
865
+ width=768,
866
+ num_inference_steps=100,
867
+ strength=0.2,
868
+ ).images
869
+ image[0].save("image_to_image_text_guided_generation-kandinsky2_2-result-red_frog.png")
870
+ ```
871
+ <div align="center">
872
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/aae57109-94ad-408e-ae75-8cce650cebe5">
873
+ <center>Original image</center>
874
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/23cf2c4e-416f-4f21-82a6-e57de11b5e83">
875
+ <center>Generated image</center>
876
+ </div>
877
+
878
+ </details>
879
+ </details>
880
+
881
+ <details><summary>&emsp;Dual Text and Image Guided Generation</summary>
882
+
883
+ #### dual_text_and_image_guided_generation-versatile_diffusion
884
+ ```python
885
+ from ppdiffusers import VersatileDiffusionDualGuidedPipeline
886
+ from ppdiffusers.utils import load_image
887
+
888
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
889
+ image = load_image(url)
890
+ text = "a red car in the sun"
891
+
892
+ pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
893
+ pipe.remove_unused_weights()
894
+
895
+ text_to_image_strength = 0.75
896
+ image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0]
897
+ image.save("versatile-diffusion-red_car.png")
898
+ ```
899
+ <div align="center">
900
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325965-2475e9c4-a524-4970-8498-dfe10ff9cf24.jpg" >
901
+ <center>Original image</center>
902
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325293-049098d0-d591-4abc-b151-9291ac2636da.png">
903
+ <center>Generated image</center>
904
+ </div>
905
+ </details>
906
+
907
+ ### Text and Video Multimodal
908
+
909
+ <details open>
910
+ <summary>&emsp;Text-to-Video Generation</summary>
911
+
912
+ #### text_to_video_generation-lvdm
913
+
914
+ ```python
915
+ import paddle
916
+
917
+ from ppdiffusers import LVDMTextToVideoPipeline
918
+
919
+ # Load the model and scheduler
920
+ pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m")
921
+
922
+ # Run the pipeline for inference
923
+ seed = 2013
924
+ generator = paddle.Generator().manual_seed(seed)
925
+ samples = pipe(
926
+ prompt="cutting in kitchen",
927
+ num_frames=16,
928
+ height=256,
929
+ width=256,
930
+ num_inference_steps=50,
931
+ generator=generator,
932
+ guidance_scale=15,
933
+ eta=1,
934
+ save_dir=".",
935
+ save_name="text_to_video_generation-lvdm-result-ddim_lvdm_text_to_video_ucf",
936
+ encoder_type="2d",
937
+ scale_factor=0.18215,
938
+ shift_factor=0,
939
+ )
940
+ ```
941
+ <div align="center">
942
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/270906907-2b9d53c1-0272-4c7a-81b2-cd962d23bbee.gif">
943
+ </div>
944
+
945
+ #### text_to_video_generation-synth
946
+
947
+ ```python
948
+ import imageio
949
+
950
+ from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
951
+
952
+ pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
953
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
954
+
955
+ prompt = "An astronaut riding a horse."
956
+ video_frames = pipe(prompt, num_inference_steps=25).frames
957
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
958
+ ```
959
+ <div align="center">
960
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/281259277-0ebe29a3-4eba-48ee-a98b-292e60de3c98.gif">
961
+ </div>
962
+
963
+
964
+ #### text_to_video_generation-synth with zeroscope_v2_XL
965
+
966
+ ```python
967
+ import imageio
968
+
969
+ from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
970
+
971
+ # from ppdiffusers.utils import export_to_video
972
+
973
+ pipe = TextToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL")
974
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
975
+
976
+ prompt = "An astronaut riding a horse."
977
+ video_frames = pipe(prompt, num_inference_steps=50, height=320, width=576, num_frames=24).frames
978
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
979
+ ```
980
+ <div align="center">
981
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/43ebbca0-9f07-458b-809a-acf296a2539b">
982
+ </div>
983
+
984
+ #### text_to_video_generation-zero
985
+
986
+ ```python
987
+ import imageio
988
+
989
+ # pip install imageio[ffmpeg]
990
+ import paddle
991
+
992
+ from ppdiffusers import TextToVideoZeroPipeline
993
+
994
+ model_id = "runwayml/stable-diffusion-v1-5"
995
+ pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16)
996
+
997
+ prompt = "A panda is playing guitar on times square"
998
+ result = pipe(prompt=prompt).images
999
+ result = [(r * 255).astype("uint8") for r in result]
1000
+ imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4)
1001
+ ```
1002
+ <div align="center">
1003
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/246779321-c2b0c2b4-e383-40c7-a4d8-f417e8062b35.gif">
1004
+ </div>
1005
+
1006
+ </details>
1007
+
1008
+ ### Text and Audio Multimodal
1009
+ <details>
1010
+ <summary>&emsp;Text-to-Audio Generation</summary>
1011
+
1012
+ #### text_to_audio_generation-audio_ldm
1013
+
1014
+ ```python
1015
+ import paddle
1016
+ import scipy
1017
+
1018
+ from ppdiffusers import AudioLDM2Pipeline
1019
+
1020
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", paddle_dtype=paddle.float16)
1021
+
1022
+ prompt = "Musical constellations twinkling in the night sky, forming a cosmic melody."
1023
+ negative_prompt = "Low quality."
1024
+ audio = pipe(prompt, negative_prompt=negative_prompt, num_inference_steps=200, audio_length_in_s=10).audios[0]
1025
+
1026
+ output_path = f"{prompt}.wav"
1027
+ # save the audio sample as a .wav file
1028
+ scipy.io.wavfile.write(output_path, rate=16000, data=audio)
1029
+ ```
1030
+ <div align = "center">
1031
+ <thead>
1032
+ </thead>
1033
+ <tbody>
1034
+ <tr>
1035
+ <td align = "center">
1036
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/paddlemix/ppdiffusers/AudioLDM2-Music.wav" rel="nofollow">
1037
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200 style="max-width: 100%;"></a><br>
1038
+ </td>
1039
+ </tr>
1040
+ </tbody>
1041
+ </div>
1042
+ </details>
1043
+
1044
+ The following code converts a [huggingface](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2) model so it can be used in Paddle in a single step:
1045
+ ```python
1046
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", from_hf_hub=True, from_diffusers=True).save_pretrained("cvssp/audioldm2-music")
1047
+ ```
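+
+ A minimal follow-up sketch (assuming the conversion above has been run, so the converted Paddle weights were saved to the local `cvssp/audioldm2-music` directory): the saved pipeline can then be reloaded directly, without the `from_hf_hub`/`from_diffusers` flags.
+ ```python
+ import paddle
+
+ from ppdiffusers import AudioLDM2Pipeline
+
+ # Load the locally converted Paddle weights saved by save_pretrained above.
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", paddle_dtype=paddle.float16)
+ ```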
1048
+ ### Image
1049
+
1050
+ <details><summary>&emsp;Unconditional Image Generation</summary>
1051
+
1052
+ #### unconditional_image_generation-latent_diffusion_uncond
1053
+
1054
+ ```python
1055
+ from ppdiffusers import LDMPipeline
1056
+
1057
+ # Load the model and scheduler
1058
+ pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
1059
+
1060
+ # Run the pipeline for inference
1061
+ image = pipe(num_inference_steps=200).images[0]
1062
+
1063
+ # Save the image
1064
+ image.save("ldm_generated_image.png")
1065
+ ```
1066
+ <div align="center">
1067
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209327936-7fe914e0-0ea0-4e21-a433-24eaed6ee94c.png">
1068
+ </div>
1069
+ </details>
1070
+
1071
+ <details><summary>&emsp;Super Resolution</summary>
1072
+
1073
+ #### super_resolution-latent_diffusion
1074
+ ```python
1075
+ import paddle
1076
+
1077
+ from ppdiffusers import LDMSuperResolutionPipeline
1078
+ from ppdiffusers.utils import load_image
1079
+
1080
+ # Load the pipeline
1081
+ pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
1082
+
1083
+ # Download the initial image
1084
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
1085
+
1086
+ init_image = load_image(url).resize((128, 128))
1087
+ init_image.save("original-image.png")
1088
+
1089
+ # Use fp16 to speed up generation
1090
+ with paddle.amp.auto_cast(True):
1091
+ image = pipe(init_image, num_inference_steps=100, eta=1).images[0]
1092
+
1093
+ image.save("super-resolution-image.png")
1094
+ ```
1095
+ <div align="center">
1096
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209328660-9700fdc3-72b3-43bd-9a00-23b370ba030b.png">
1097
+ <center>Original image</center>
1098
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209328479-4eaea5d8-aa4a-4f31-aa2a-b47e3c730f15.png">
1099
+ <center>Generated image</center>
1100
+ </div>
1101
+ </details>
1102
+
1103
+
1104
+ <details><summary>&emsp;Image Inpainting</summary>
1105
+
1106
+ #### image_inpainting-repaint
1107
+ ```python
1108
+ from ppdiffusers import RePaintPipeline, RePaintScheduler
1109
+ from ppdiffusers.utils import load_image
1110
+
1111
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png"
1112
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png"
1113
+
1114
+ # Load the original image and the mask as PIL images
1115
+ original_image = load_image(img_url).resize((256, 256))
1116
+ mask_image = load_image(mask_url).resize((256, 256))
1117
+
1118
+ scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler")
1119
+ pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
1120
+
1121
+ output = pipe(
1122
+ original_image=original_image,
1123
+ mask_image=mask_image,
1124
+ num_inference_steps=250,
1125
+ eta=0.0,
1126
+ jump_length=10,
1127
+ jump_n_sample=10,
1128
+ )
1129
+ inpainted_image = output.images[0]
1130
+
1131
+ inpainted_image.save("repaint-image.png")
1132
+ ```
1133
+ <div align="center">
1134
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329052-b6fc2aaf-1a59-49a3-92ef-60180fdffd81.png">
1135
+ <center>Original image</center>
1136
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329048-4fe12176-32a0-4800-98f2-49bd8d593799.png">
1137
+ <center>Mask image</center>
1138
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329241-b7e4d99e-468a-4b95-8829-d77ee14bfe98.png">
1139
+ <center>Generated image</center>
1140
+ </div>
1141
+ </details>
1142
+
1143
+
1144
+
1145
+ <details><summary>&emsp;Image Variation</summary>
1146
+
1147
+ #### image_variation-versatile_diffusion
1148
+ ```python
1149
+ from ppdiffusers import VersatileDiffusionImageVariationPipeline
1150
+ from ppdiffusers.utils import load_image
1151
+
1152
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
1153
+ image = load_image(url)
1154
+
1155
+ pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
1156
+
1157
+ image = pipe(image).images[0]
1158
+ image.save("versatile-diffusion-car_variation.png")
1159
+ ```
1160
+ <div align="center">
1161
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209331434-51f6cdbd-b8e4-4faa-8e49-1cc852e35603.jpg">
1162
+ <center>Original image</center>
1163
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209331591-f6cc4cd8-8430-4627-8d22-bf404fb2bfdd.png">
1164
+ <center>Generated image</center>
1165
+ </div>
1166
+ </details>
1167
+
1168
+
1169
+
1170
+
1171
+
1172
+ ### Audio
1173
+ <details>
1174
+ <summary>&emsp;Unconditional Audio Generation</summary>
1175
+
1176
+ #### unconditional_audio_generation-audio_diffusion
1177
+
1178
+ ```python
1179
+ from scipy.io.wavfile import write
1180
+ from ppdiffusers import AudioDiffusionPipeline
1181
+ import paddle
1182
+
1183
+ # Load the model and scheduler
1184
+ pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
1185
+ pipe.set_progress_bar_config(disable=None)
1186
+ generator = paddle.Generator().manual_seed(42)
1187
+
1188
+ output = pipe(generator=generator)
1189
+ audio = output.audios[0]
1190
+ image = output.images[0]
1191
+
1192
+ # Save the audio locally
1193
+ for i, audio in enumerate(audio):
1194
+ write(f"audio_diffusion_test{i}.wav", pipe.mel.config.sample_rate, audio.transpose())
1195
+
1196
+ # Save the image
1197
+ image.save("audio_diffusion_test.png")
1198
+ ```
1199
+ <div align = "center">
1200
+ <thead>
1201
+ </thead>
1202
+ <tbody>
1203
+ <tr>
1204
+ <td align = "center">
1205
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/teticio/data/audio_diffusion_test0.wav" rel="nofollow">
1206
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200 style="max-width: 100%;"></a><br>
1207
+ </td>
1208
+ </tr>
1209
+ </tbody>
1210
+ </div>
1211
+
1212
+ <div align="center">
1213
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209342125-93e8715e-895b-4115-9e1e-e65c6c2cd95a.png">
1214
+ </div>
1215
+
1216
+
1217
+ #### unconditional_audio_generation-spectrogram_diffusion
1218
+
1219
+ ```python
1220
+ import paddle
1221
+ import scipy
1222
+
1223
+ from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline
1224
+ from ppdiffusers.utils.download_utils import ppdiffusers_url_download
1225
+
1226
+ # Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid
1227
+ mid_file_path = ppdiffusers_url_download(
1228
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="."
1229
+ )
1230
+ pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
1231
+ processor = MidiProcessor()
1232
+ output = pipe(processor(mid_file_path))
1233
+ audio = output.audios[0]
1234
+
1235
+ output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav"
1236
+ # save the audio sample as a .wav file
1237
+ scipy.io.wavfile.write(output_path, rate=16000, data=audio)
1238
+ ```
1239
+ <div align = "center">
1240
+ <thead>
1241
+ </thead>
1242
+ <tbody>
1243
+ <tr>
1244
+ <td align = "center">
1245
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/westfish/develop_ppdiffusers_data/beethoven_hammerklavier_2.wav" rel="nofollow">
1246
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200 style="max-width: 100%;"></a><br>
1247
+ </td>
1248
+ </tr>
1249
+ </tbody>
1250
+ </div>
1251
+ </details>
1252
+
1253
+
1254
+
1255
+ ## License
1256
+ PPDiffusers is released under the [Apache-2.0 License](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE).
1257
+
1258
+ Stable Diffusion is released under the [CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license).
1259
+ > The CreativeML OpenRAIL M is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which this license is based.
1260
+
1261
+ Stable Diffusion 3 is released under the [Stability Community License](https://stability.ai/license).
1262
+ > Community License: Free for research, non-commercial, and commercial use for organisations or individuals with less than $1M annual revenue. You only need a paid Enterprise license if your yearly revenues exceed USD$1M and you use Stability AI models in commercial products or services. Read more: https://stability.ai/license
1263
+
1264
+ ## Acknowledge
1265
+ We drew on the excellent design of 🤗 Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) for working with pretrained diffusion models; our thanks go to the Hugging Face authors and their open-source community.
1266
+
1267
+ ## Citation
1268
+
1269
+ ```bibtex
1270
+ @misc{ppdiffusers,
1271
+ author = {PaddlePaddle Authors},
1272
+ title = {PPDiffusers: State-of-the-art diffusion model toolkit based on PaddlePaddle},
1273
+ year = {2022},
1274
+ publisher = {GitHub},
1275
+ journal = {GitHub repository},
1276
+ howpublished = {\url{https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers}}
1277
+ }
1278
+ ```
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import os
16
+
17
+ # set USE_PPXFORMERS=False to avoid using ppxformers
18
+ os.environ["USE_PPXFORMERS"] = "False"
19
+ from pathlib import Path
20
+ from types import MethodType
21
+
22
+ import paddle
23
+
24
+ from ppdiffusers import (
25
+ ControlNetModel,
26
+ PaddleInferRuntimeModel,
27
+ PaddleInferStableDiffusionControlNetPipeline,
28
+ StableDiffusionControlNetPipeline,
29
+ UNet2DConditionModel,
30
+ )
31
+
32
+
33
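+ # ControlNetWithUnetModel wraps the ControlNet and the UNet into a single
+ # paddle.nn.Layer: the ControlNet first produces the down/mid residuals for the
+ # conditioning image, and those residuals are fed into the UNet. Wrapping both
+ # lets the pair be exported below as one static-graph "unet" model.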
+ class ControlNetWithUnetModel(paddle.nn.Layer):
34
+ def __init__(
35
+ self,
36
+ unet,
37
+ controlnet,
38
+ ):
39
+ super().__init__()
40
+ self.unet = unet
41
+ self.controlnet = controlnet
42
+
43
+ def forward(
44
+ self,
45
+ sample,
46
+ timestep,
47
+ encoder_hidden_states,
48
+ controlnet_cond,
49
+ controlnet_conditioning_scale,
50
+ return_dict=True,
51
+ ):
52
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
53
+ sample,
54
+ timestep,
55
+ encoder_hidden_states=encoder_hidden_states,
56
+ controlnet_cond=controlnet_cond,
57
+ conditioning_scale=controlnet_conditioning_scale,
58
+ return_dict=False,
59
+ )
60
+
61
+ noise_pred = self.unet(
62
+ sample,
63
+ timestep,
64
+ encoder_hidden_states=encoder_hidden_states,
65
+ down_block_additional_residuals=down_block_res_samples,
66
+ mid_block_additional_residual=mid_block_res_sample,
67
+ return_dict=return_dict,
68
+ )
69
+ return noise_pred
70
+
71
+
72
+ def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
73
+ model_path: str,
74
+ controlnet_model_path: str,
75
+ output_path: str,
76
+ sample: bool = False,
77
+ height: int = None,
78
+ width: int = None,
79
+ ):
80
+ unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=False, subfolder="unet")
81
+ controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=False)
82
+
83
+ pipeline = StableDiffusionControlNetPipeline.from_pretrained(
84
+ model_path,
85
+ unet=unet_tmp,
86
+ controlnet=controlnet_tmp,
87
+ safety_checker=None,
88
+ feature_extractor=None,
89
+ requires_safety_checker=False,
90
+ )
91
+ output_path = Path(output_path)
92
+ # calculate latent's H and W
93
+ latent_height = height // 8 if height is not None else None
94
+ latent_width = width // 8 if width is not None else None
95
+ # get arguments
96
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
97
+ unet_channels = pipeline.unet.config.in_channels # 4
98
+ vae_in_channels = pipeline.vae.config.in_channels # 3
99
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
100
+ print(
101
+ f"cross_attention_dim: {cross_attention_dim}\n",
102
+ f"unet_in_channels: {unet_channels}\n",
103
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
104
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
105
+ )
106
+ # 1. Convert text_encoder
107
+ text_encoder = paddle.jit.to_static(
108
+ pipeline.text_encoder,
109
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
110
+ )
111
+ save_path = os.path.join(args.output_path, "text_encoder", "inference")
112
+ paddle.jit.save(text_encoder, save_path)
113
+ print(f"Save text_encoder model in {save_path} successfully.")
114
+ del pipeline.text_encoder
115
+
116
+ # wrap unet + controlnet
117
+ new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet)
118
+
119
+ # 2. Convert unet
120
+ unet = paddle.jit.to_static(
121
+ new_unet,
122
+ input_spec=[
123
+ paddle.static.InputSpec(
124
+ shape=[None, unet_channels, latent_height, latent_width],
125
+ dtype="float32",
126
+ name="sample",
127
+ ), # sample
128
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
129
+ paddle.static.InputSpec(
130
+ shape=[None, None, cross_attention_dim],
131
+ dtype="float32",
132
+ name="encoder_hidden_states",
133
+ ), # encoder_hidden_states
134
+ paddle.static.InputSpec(
135
+ shape=[None, vae_in_channels, height, width],
136
+ dtype="float32",
137
+ name="controlnet_cond",
138
+ ), # controlnet_cond
139
+ paddle.static.InputSpec(
140
+ shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1],
141
+ dtype="float32",
142
+ name="controlnet_conditioning_scale",
143
+ ), # controlnet_conditioning_scale
144
+ ],
145
+ )
146
+
147
+ save_path = os.path.join(args.output_path, "unet", "inference")
148
+ paddle.jit.save(unet, save_path)
149
+ print(f"Save unet model in {save_path} successfully.")
150
+ del pipeline.unet
151
+ del new_unet
152
+
153
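+ # The VAE encoder can be exported with either a deterministic or a stochastic
+ # forward: mode() returns the mode of the latent distribution, while sample()
+ # draws a random latent. Which one is used is controlled by the --sample flag.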
+ def forward_vae_encoder_mode(self, z):
154
+ return self.encode(z, True).latent_dist.mode()
155
+
156
+ def forward_vae_encoder_sample(self, z):
157
+ return self.encode(z, True).latent_dist.sample()
158
+
159
+ # 3. Convert vae encoder
160
+ vae_encoder = pipeline.vae
161
+ if sample:
162
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
163
+ else:
164
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
165
+
166
+ vae_encoder = paddle.jit.to_static(
167
+ vae_encoder,
168
+ input_spec=[
169
+ paddle.static.InputSpec(
170
+ shape=[None, vae_in_channels, height, width],
171
+ dtype="float32",
172
+ name="sample", # N, C, H, W
173
+ ), # latent
174
+ ],
175
+ )
176
+ # Save vae_encoder in static graph model.
177
+ save_path = os.path.join(args.output_path, "vae_encoder", "inference")
178
+ paddle.jit.save(vae_encoder, save_path)
179
+ print(f"Save vae_encoder model in {save_path} successfully.")
180
+
181
+ # 4. Convert vae decoder
182
+ vae_decoder = pipeline.vae
183
+
184
+ def forward_vae_decoder(self, z):
185
+ return self.decode(z, True).sample
186
+
187
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
188
+ vae_decoder = paddle.jit.to_static(
189
+ vae_decoder,
190
+ input_spec=[
191
+ paddle.static.InputSpec(
192
+ shape=[None, vae_latent_channels, latent_height, latent_width],
193
+ dtype="float32",
194
+ name="latent_sample",
195
+ ), # latent_sample
196
+ ],
197
+ )
198
+ # Save vae_decoder in static graph model.
199
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference")
200
+ paddle.jit.save(vae_decoder, save_path)
201
+ print(f"Save vae_decoder model in {save_path} successfully.")
202
+ del pipeline.vae
203
+
204
+ paddleinfer_pipeline = PaddleInferStableDiffusionControlNetPipeline(
205
+ vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"),
206
+ vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"),
207
+ text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"),
208
+ unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"),
209
+ tokenizer=pipeline.tokenizer,
210
+ scheduler=pipeline.scheduler,
211
+ safety_checker=None,
212
+ feature_extractor=None,
213
+ image_encoder=None,
214
+ requires_safety_checker=False,
215
+ )
216
+ paddleinfer_pipeline.save_pretrained(str(output_path))
217
+ print("PaddleInfer pipeline saved to", output_path)
218
+
219
+
220
+ if __name__ == "__main__":
221
+ parser = argparse.ArgumentParser()
222
+
223
+ parser.add_argument(
224
+ "--pretrained_model_name_or_path",
225
+ type=str,
226
+ default="runwayml/stable-diffusion-v1-5",
227
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
228
+ )
229
+ parser.add_argument(
230
+ "--controlnet_pretrained_model_name_or_path",
231
+ type=str,
232
+ default="lllyasviel/sd-controlnet-canny",
233
+ help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).",
234
+ )
235
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
236
+ parser.add_argument(
237
+ "--sample",
238
+ action="store_true",
239
+ default=False,
240
+ help="Export the vae encoder in mode or sample",
241
+ )
242
+ parser.add_argument(
243
+ "--height",
244
+ type=int,
245
+ default=None,
246
+ help="The height of output images. Default: None",
247
+ )
248
+ parser.add_argument(
249
+ "--width",
250
+ type=int,
251
+ default=None,
252
+ help="The width of output images. Default: None",
253
+ )
254
+ args = parser.parse_args()
255
+
256
+ convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
257
+ args.pretrained_model_name_or_path,
258
+ args.controlnet_pretrained_model_name_or_path,
259
+ args.output_path,
260
+ args.sample,
261
+ args.height,
262
+ args.width,
263
+ )
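+
+ # Example invocation (a sketch; the model names are the argparse defaults above and
+ # the output directory matches the one used by scripts/inference.sh):
+ # python export_model.py \
+ #     --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
+ #     --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \
+ #     --output_path static_model/stable-diffusion-v1-5-canny \
+ #     --height 512 --width 512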
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # attention raw fp16
16
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
17
+
18
+ # attention cutlass fp16
19
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
20
+
21
+ # attention flash fp16
22
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
23
+
24
+
25
+ # attention raw fp32
26
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
27
+
28
+ # attention cutlass fp32
29
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
30
+
31
+ # attention flash fp32
32
+ python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # attention raw fp16
16
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
17
+
18
+ # attention sdp fp16
19
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
20
+
21
+
22
+ # attention raw fp32
23
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
24
+
25
+ # attention sdp fp32
26
+ python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # ==============================================================================
16
+ # use paddle as backend to inference static model is not fast,
17
+ # this script is used to make sure the inference is correct.
18
+ # ==============================================================================
19
+ # text2img
20
+ python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name text2img
21
+
22
+ # img2img
23
+ python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name img2img
24
+
25
+ # inpaint
26
+ python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name inpaint_legacy
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ # set USE_PPXFORMERS=False to avoid using ppxformers
19
+ os.environ["USE_PPXFORMERS"] = "False"
20
+ from pathlib import Path
21
+ from types import MethodType
22
+
23
+ import paddle
24
+ from unet_2d_condition_housing import UNet2DConditionModelSDHousing
25
+
26
+ from ppdiffusers import (
27
+ PaddleInferRuntimeModel,
28
+ PaddleInferStableDiffusionInpaintPipeline,
29
+ PaddleInferStableDiffusionMegaPipeline,
30
+ StableDiffusionPipeline,
31
+ )
32
+
33
+
34
+ def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
35
+ model_path: str,
36
+ output_path: str,
37
+ sample: bool = False,
38
+ height: int = None,
39
+ width: int = None,
40
+ ):
41
+ # specify unet model with unet pre_temb_act opt enabled.
42
+ unet_model = UNet2DConditionModelSDHousing.from_pretrained(
43
+ model_path, resnet_pre_temb_non_linearity=False, subfolder="unet"
44
+ )
45
+ pipeline = StableDiffusionPipeline.from_pretrained(
46
+ model_path,
47
+ unet=unet_model,
48
+ safety_checker=None,
49
+ )
50
+ output_path = Path(output_path)
51
+ # calculate latent's H and W
52
+ latent_height = height // 8 if height is not None else None
53
+ latent_width = width // 8 if width is not None else None
54
+ # get arguments
55
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
56
+ unet_channels = pipeline.unet.config.in_channels # 4 or 9
57
+ vae_in_channels = pipeline.vae.config.in_channels # 3
58
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
59
+ print(
60
+ f"cross_attention_dim: {cross_attention_dim}\n",
61
+ f"unet_in_channels: {unet_channels}\n",
62
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
63
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
64
+ )
65
+ # 1. Convert text_encoder
66
+ text_encoder = paddle.jit.to_static(
67
+ pipeline.text_encoder,
68
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
69
+ )
70
+ save_path = os.path.join(args.output_path, "text_encoder", "inference")
71
+ paddle.jit.save(text_encoder, save_path)
72
+ print(f"Save text_encoder model in {save_path} successfully.")
73
+ del pipeline.text_encoder
74
+
75
+ # 2. Convert unet
76
+ unet = paddle.jit.to_static(
77
+ pipeline.unet,
78
+ input_spec=[
79
+ paddle.static.InputSpec(
80
+ shape=[None, unet_channels, latent_height, latent_width],
81
+ dtype="float32",
82
+ name="sample",
83
+ ), # sample
84
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
85
+ paddle.static.InputSpec(
86
+ shape=[None, None, cross_attention_dim],
87
+ dtype="float32",
88
+ name="encoder_hidden_states",
89
+ ), # encoder_hidden_states
90
+ ],
91
+ )
92
+ save_path = os.path.join(args.output_path, "unet", "inference")
93
+ paddle.jit.save(unet, save_path)
94
+ print(f"Save unet model in {save_path} successfully.")
95
+ del pipeline.unet
96
+
97
+ def forward_vae_encoder_mode(self, z):
98
+ return self.encode(z, True).latent_dist.mode()
99
+
100
+ def forward_vae_encoder_sample(self, z):
101
+ return self.encode(z, True).latent_dist.sample()
102
+
103
+ # 3. Convert vae encoder
104
+ vae_encoder = pipeline.vae
105
+ if sample:
106
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
107
+ else:
108
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
109
+
110
+ vae_encoder = paddle.jit.to_static(
111
+ vae_encoder,
112
+ input_spec=[
113
+ paddle.static.InputSpec(
114
+ shape=[None, vae_in_channels, height, width],
115
+ dtype="float32",
116
+ name="sample", # N, C, H, W
117
+ ), # latent
118
+ ],
119
+ )
120
+ # Save vae_encoder in static graph model.
121
+ save_path = os.path.join(args.output_path, "vae_encoder", "inference")
122
+ paddle.jit.save(vae_encoder, save_path)
123
+ print(f"Save vae_encoder model in {save_path} successfully.")
124
+
125
+ # 4. Convert vae decoder
126
+ vae_decoder = pipeline.vae
127
+
128
+ def forward_vae_decoder(self, z):
129
+ return self.decode(z, True).sample
130
+
131
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
132
+ vae_decoder = paddle.jit.to_static(
133
+ vae_decoder,
134
+ input_spec=[
135
+ paddle.static.InputSpec(
136
+ shape=[None, vae_latent_channels, latent_height, latent_width],
137
+ dtype="float32",
138
+ name="latent_sample",
139
+ ), # latent_sample
140
+ ],
141
+ )
142
+ # Save vae_decoder in static graph model.
143
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference")
144
+ paddle.jit.save(vae_decoder, save_path)
145
+ print(f"Save vae_decoder model in {save_path} successfully.")
146
+ del pipeline.vae
147
+
148
+ if "inpainting" in model_path:
149
+ fd_pipe_cls = PaddleInferStableDiffusionInpaintPipeline
150
+ else:
151
+ fd_pipe_cls = PaddleInferStableDiffusionMegaPipeline
152
+
153
+ paddleinfer_pipeline = fd_pipe_cls(
154
+ vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"),
155
+ vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"),
156
+ text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"),
157
+ unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"),
158
+ tokenizer=pipeline.tokenizer,
159
+ scheduler=pipeline.scheduler,
160
+ feature_extractor=pipeline.feature_extractor,
161
+ image_encoder=None,
162
+ safety_checker=None,
163
+ requires_safety_checker=False,
164
+ )
165
+ paddleinfer_pipeline.save_pretrained(str(output_path))
166
+ print("PaddleInfer pipeline saved to", output_path)
167
+
168
+
169
+ if __name__ == "__main__":
170
+ parser = argparse.ArgumentParser()
171
+
172
+ parser.add_argument(
173
+ "--pretrained_model_name_or_path",
174
+ type=str,
175
+ required=True,
176
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
177
+ )
178
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
179
+ parser.add_argument(
180
+ "--sample",
181
+ action="store_true",
182
+ default=False,
183
+ help="Export the vae encoder in mode or sample",
184
+ )
185
+ parser.add_argument(
186
+ "--height",
187
+ type=int,
188
+ default=None,
189
+ help="The height of output images. Default: None",
190
+ )
191
+ parser.add_argument(
192
+ "--width",
193
+ type=int,
194
+ default=None,
195
+ help="The width of output images. Default: None",
196
+ )
197
+ args = parser.parse_args()
198
+
199
+ convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
200
+ args.pretrained_model_name_or_path,
201
+ args.output_path,
202
+ args.sample,
203
+ args.height,
204
+ args.width,
205
+ )
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py ADDED
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ # isort: split
20
+ import paddle
21
+ import paddle.inference as paddle_infer
22
+
23
+ # isort: split
24
+ import numpy as np
25
+ from paddlenlp.trainer.argparser import strtobool
26
+ from tqdm.auto import trange
27
+
28
+ from ppdiffusers import ( # noqa
29
+ DiffusionPipeline,
30
+ PaddleInferStableDiffusionMegaPipeline,
31
+ )
32
+ from ppdiffusers.utils import load_image
33
+
34
+
35
+ def parse_arguments():
36
+
37
+ parser = argparse.ArgumentParser()
38
+ parser.add_argument(
39
+ "--model_dir",
40
+ default="runwayml/stable-diffusion-v1-5@paddleinfer",
41
+ help="The model directory of diffusion_model.",
42
+ )
43
+ parser.add_argument(
44
+ "--inference_steps",
45
+ type=int,
46
+ default=50,
47
+ help="The number of unet inference steps.",
48
+ )
49
+ parser.add_argument(
50
+ "--benchmark_steps",
51
+ type=int,
52
+ default=10,
53
+ help="The number of performance benchmark steps.",
54
+ )
55
+ parser.add_argument(
56
+ "--backend",
57
+ type=str,
58
+ default="paddle_tensorrt",
59
+ choices=["paddle", "paddle_tensorrt"],
60
+ help="The inference runtime backend of unet model and text encoder model.",
61
+ )
62
+ parser.add_argument(
63
+ "--device",
64
+ type=str,
65
+ default="gpu",
66
+ choices=[
67
+ "cpu",
68
+ "gpu",
69
+ "huawei_ascend_npu",
70
+ "kunlunxin_xpu",
71
+ ],
72
+ help="The inference runtime device of models.",
73
+ )
74
+ parser.add_argument(
75
+ "--task_name",
76
+ type=str,
77
+ default="text2img",
78
+ choices=[
79
+ "text2img",
80
+ "img2img",
81
+ "inpaint_legacy",
82
+ "all",
83
+ ],
84
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
85
+ )
86
+ parser.add_argument(
87
+ "--parse_prompt_type",
88
+ type=str,
89
+ default="lpw",
90
+ choices=[
91
+ "raw",
92
+ "lpw",
93
+ ],
94
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
95
+ )
96
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
97
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
98
+ parser.add_argument(
99
+ "--scheduler",
100
+ type=str,
101
+ default="preconfig-euler-ancestral",
102
+ choices=[
103
+ "pndm",
104
+ "lms",
105
+ "euler",
106
+ "euler-ancestral",
107
+ "preconfig-euler-ancestral",
108
+ "dpm-multi",
109
+ "dpm-single",
110
+ "unipc-multi",
111
+ "ddim",
112
+ "ddpm",
113
+ "deis-multi",
114
+ "heun",
115
+ "kdpm2-ancestral",
116
+ "kdpm2",
117
+ ],
118
+ help="The scheduler type of stable diffusion.",
119
+ )
120
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
121
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
122
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
123
+ parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image")
124
+ parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image")
125
+ parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?")
126
+ parser.add_argument(
127
+ "--tune",
128
+ type=strtobool,
129
+ default=False,
130
+ help="Whether to tune the shape of tensorrt engine.",
131
+ )
132
+
133
+ return parser.parse_args()
134
+
135
+
136
+ def create_paddle_inference_runtime(
137
+ model_dir="",
138
+ model_name="",
139
+ use_trt=False,
140
+ precision_mode=paddle_infer.PrecisionType.Half,
141
+ device_id=0,
142
+ disable_paddle_trt_ops=[],
143
+ disable_paddle_pass=[],
144
+ workspace=24 * 1024 * 1024 * 1024,
145
+ tune=False,
146
+ ):
147
+ config = paddle_infer.Config()
148
+ config.enable_memory_optim()
149
+ shape_file = f"{model_dir}/{model_name}/shape_range_info.pbtxt"
150
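+ # When tune=True this runtime only records dynamic-shape ranges into
+ # shape_range_info.pbtxt (a warm-up/collection pass with IR optimization off);
+ # otherwise the previously collected ranges are consumed by the TensorRT
+ # engine configured below.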
+ if tune:
151
+ config.collect_shape_range_info(shape_file)
152
+ config.switch_ir_optim(False)
153
+ else:
154
+ config.enable_new_executor()
155
+ if str(os.environ.get("FLAGS_enable_pir_in_executor")).lower() in ("true", "1"):
156
+ config.enable_new_ir()
157
+ if str(os.environ.get("FLAGS_use_cinn")).lower() in ("true", "1"):
158
+ config.enable_cinn()
159
+
160
+ if device_id != -1:
161
+ config.use_gpu()
162
+ config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=device_id, precision_mode=precision_mode)
163
+ for pass_name in disable_paddle_pass:
164
+ config.delete_pass(pass_name)
165
+ if use_trt:
166
+ config.enable_tensorrt_engine(
167
+ workspace_size=workspace,
168
+ precision_mode=precision_mode,
169
+ max_batch_size=1,
170
+ min_subgraph_size=3,
171
+ use_static=True,
172
+ )
173
+ config.enable_tensorrt_memory_optim()
174
+ config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
175
+ cache_file = os.path.join(model_dir, model_name, "_opt_cache/")
176
+ config.set_optim_cache_dir(cache_file)
177
+ if precision_mode != paddle_infer.PrecisionType.Half:
178
+ only_fp16_passes = [
179
+ "trt_cross_multihead_matmul_fuse_pass",
180
+ "trt_flash_multihead_matmul_fuse_pass",
181
+ "preln_elementwise_groupnorm_act_pass",
182
+ "elementwise_groupnorm_act_pass",
183
+ ]
184
+ for curr_pass in only_fp16_passes:
185
+ config.delete_pass(curr_pass)
186
+ return config
187
+
188
+
189
+ def main(args):
190
+ if args.device_id == -1:
191
+ paddle.set_device("cpu")
192
+ else:
193
+ paddle.set_device(f"gpu:{args.device_id}")
194
+
195
+ seed = 1024
196
+ min_image_size = 512
197
+ max_image_size = 768
198
+ max_image_size = max(min_image_size, max_image_size)
199
+
200
+ # 4. Init runtime
201
+ only_fp16_passes = [
202
+ "trt_cross_multihead_matmul_fuse_pass",
203
+ "trt_flash_multihead_matmul_fuse_pass",
204
+ "preln_elementwise_groupnorm_act_pass",
205
+ "elementwise_groupnorm_act_pass",
206
+ ]
207
+ no_need_passes = [
208
+ "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass",
209
+ "add_support_int8_pass",
210
+ "elementwise_groupnorm_act_pass",
211
+ "groupnorm_act_pass",
212
+ "preln_elementwise_groupnorm_act_pass",
213
+ ]
214
+ paddle_delete_passes = dict(
215
+ text_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
216
+ text_encoder_2=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
217
+ vae_encoder=only_fp16_passes + [] if args.use_fp16 else [],
218
+ vae_decoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
219
+ unet=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
220
+ image_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
221
+ )
222
+ args.use_trt = args.backend == "paddle_tensorrt"
223
+ precision_mode = paddle_infer.PrecisionType.Half if args.use_fp16 else paddle_infer.PrecisionType.Float32
224
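+ # Build one inference config per sub-model. Note that only the UNet honours the
+ # --backend choice (use_trt=args.use_trt); the text encoder and VAE components
+ # are always run with the plain Paddle GPU runtime here.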
+ infer_configs = dict(
225
+ text_encoder=create_paddle_inference_runtime(
226
+ model_dir=args.model_dir,
227
+ use_trt=False,
228
+ model_name="text_encoder",
229
+ precision_mode=paddle_infer.PrecisionType.Half,
230
+ device_id=args.device_id,
231
+ disable_paddle_trt_ops=["range", "lookup_table_v2"],
232
+ disable_paddle_pass=paddle_delete_passes.get("text_encoder", []),
233
+ tune=False,
234
+ ),
235
+ vae_encoder=create_paddle_inference_runtime(
236
+ model_dir=args.model_dir,
237
+ model_name="vae_encoder",
238
+ use_trt=False,
239
+ precision_mode=paddle_infer.PrecisionType.Half,
240
+ device_id=args.device_id,
241
+ disable_paddle_pass=paddle_delete_passes.get("vae_encoder", []),
242
+ tune=False,
243
+ ),
244
+ vae_decoder=create_paddle_inference_runtime(
245
+ model_dir=args.model_dir,
246
+ model_name="vae_decoder",
247
+ use_trt=False,
248
+ precision_mode=paddle_infer.PrecisionType.Float32,
249
+ device_id=args.device_id,
250
+ disable_paddle_pass=paddle_delete_passes.get("vae_decoder", []),
251
+ tune=False,
252
+ ),
253
+ unet=create_paddle_inference_runtime(
254
+ model_dir=args.model_dir,
255
+ model_name="unet",
256
+ use_trt=args.use_trt,
257
+ precision_mode=precision_mode,
258
+ device_id=args.device_id,
259
+ disable_paddle_pass=no_need_passes,
260
+ tune=args.tune,
261
+ ),
262
+ )
263
+ pipe = PaddleInferStableDiffusionMegaPipeline.from_pretrained(
264
+ args.model_dir,
265
+ infer_configs=infer_configs,
266
+ use_optim_cache=False,
267
+ )
268
+ pipe.set_progress_bar_config(disable=False)
269
+ pipe.change_scheduler(args.scheduler)
270
+ parse_prompt_type = args.parse_prompt_type
271
+ width = args.width
272
+ height = args.height
273
+
274
+ folder = f"results-{args.backend}"
275
+ os.makedirs(folder, exist_ok=True)
276
+ if args.task_name in ["text2img", "all"]:
277
+ # text2img
278
+ prompt = "a photo of an astronaut riding a horse on mars"
279
+ time_costs = []
280
+ # warmup
281
+ pipe.text2img(
282
+ prompt,
283
+ num_inference_steps=20,
284
+ height=height,
285
+ width=width,
286
+ # parse_prompt_type=parse_prompt_type,
287
+ )
288
+ print("==> Test text2img performance.")
289
+ for step in trange(args.benchmark_steps):
290
+ start = time.time()
291
+ paddle.seed(seed)
292
+ images = pipe.text2img(
293
+ prompt,
294
+ output_type="pil",
295
+ num_inference_steps=args.inference_steps,
296
+ height=height,
297
+ width=width,
298
+ # parse_prompt_type=parse_prompt_type,
299
+ ).images
300
+ latency = time.time() - start
301
+ time_costs += [latency]
302
+ # print(f"No {step:3d} time cost: {latency:2f} s")
303
+ print(
304
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
305
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
306
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
307
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
308
+ )
309
+ images[0].save(f"{folder}/text2img.png")
310
+
311
+ if args.task_name in ["img2img", "all"]:
312
+ # img2img
313
+ img_url = (
314
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
315
+ )
316
+ init_image = load_image(img_url)
317
+ prompt = "A fantasy landscape, trending on artstation"
318
+ time_costs = []
319
+ # warmup
320
+ pipe.img2img(
321
+ prompt,
322
+ image=init_image,
323
+ num_inference_steps=20,
324
+ height=height,
325
+ width=width,
326
+ strength=args.strength,
327
+ # parse_prompt_type=parse_prompt_type,
328
+ )
329
+ print("==> Test img2img performance.")
330
+ for step in trange(args.benchmark_steps):
331
+ start = time.time()
332
+ paddle.seed(seed)
333
+ images = pipe.img2img(
334
+ prompt,
335
+ image=init_image,
336
+ num_inference_steps=args.inference_steps,
337
+ height=height,
338
+ width=width,
339
+ strength=args.strength,
340
+ # parse_prompt_type=parse_prompt_type,
341
+ ).images
342
+ latency = time.time() - start
343
+ time_costs += [latency]
344
+ # print(f"No {step:3d} time cost: {latency:2f} s")
345
+ print(
346
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
347
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
348
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
349
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
350
+ )
351
+ images[0].save(f"{folder}/img2img.png")
352
+
353
+ if args.task_name in ["inpaint", "inpaint_legacy", "all"]:
354
+ img_url = (
355
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
356
+ )
357
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
358
+ init_image = load_image(img_url)
359
+ mask_image = load_image(mask_url)
360
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
361
+ time_costs = []
362
+ # warmup
363
+ if args.task_name in ["inpaint_legacy", "all"]:
364
+ call_fn = pipe.inpaint_legacy
365
+ task_name = "inpaint_legacy"
366
+ else:
367
+ call_fn = pipe.inpaint
368
+ task_name = "inpaint"
369
+ call_fn(
370
+ prompt,
371
+ image=init_image,
372
+ mask_image=mask_image,
373
+ num_inference_steps=20,
374
+ height=height,
375
+ width=width,
376
+ strength=args.strength,
377
+ parse_prompt_type=parse_prompt_type,
378
+ )
379
+ print(f"==> Test {task_name} performance.")
380
+ for step in trange(args.benchmark_steps):
381
+ start = time.time()
382
+ paddle.seed(seed)
383
+ images = call_fn(
384
+ prompt,
385
+ image=init_image,
386
+ mask_image=mask_image,
387
+ num_inference_steps=args.inference_steps,
388
+ height=height,
389
+ width=width,
390
+ strength=args.strength,
391
+ parse_prompt_type=parse_prompt_type,
392
+ ).images
393
+ latency = time.time() - start
394
+ time_costs += [latency]
395
+ # print(f"No {step:3d} time cost: {latency:2f} s")
396
+ print(
397
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
398
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
399
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
400
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
401
+ )
402
+
403
+ images[0].save(f"{folder}/{task_name}.png")
404
+
405
+
406
+ if __name__ == "__main__":
407
+ args = parse_arguments()
408
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py ADDED
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+ import warnings
19
+
20
+ import cv2
21
+ import numpy as np
22
+ import paddle
23
+ from PIL import Image
24
+ from tqdm.auto import trange
25
+
26
+ from ppdiffusers import (
27
+ DDIMScheduler,
28
+ DDPMScheduler,
29
+ DEISMultistepScheduler,
30
+ DPMSolverMultistepScheduler,
31
+ DPMSolverSinglestepScheduler,
32
+ EulerAncestralDiscreteScheduler,
33
+ EulerDiscreteScheduler,
34
+ HeunDiscreteScheduler,
35
+ KDPM2AncestralDiscreteScheduler,
36
+ KDPM2DiscreteScheduler,
37
+ LMSDiscreteScheduler,
38
+ PNDMScheduler,
39
+ StableDiffusionImg2ImgPipeline,
40
+ StableDiffusionInpaintPipeline,
41
+ StableDiffusionPipeline,
42
+ UniPCMultistepScheduler,
43
+ )
44
+ from ppdiffusers.utils import load_image
45
+
46
+
47
+ def get_canny_image(image, args):
48
+ if isinstance(image, Image.Image):
49
+ image = np.array(image)
50
+ image = cv2.Canny(image, args.low_threshold, args.high_threshold)
51
+ image = image[:, :, None]
52
+ image = np.concatenate([image, image, image], axis=2)
53
+ canny_image = Image.fromarray(image)
54
+ return canny_image
55
+
56
+
57
+ def strtobool(v):
58
+ if isinstance(v, bool):
59
+ return v
60
+ if v.lower() in ("yes", "true", "t", "y", "1"):
61
+ return True
62
+ elif v.lower() in ("no", "false", "f", "n", "0"):
63
+ return False
64
+ else:
65
+ raise ValueError(
66
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
67
+ )
68
+
69
+
70
+ def change_scheduler(self, scheduler_type="ddim"):
71
+ self.orginal_scheduler_config = self.scheduler.config
72
+ scheduler_type = scheduler_type.lower()
73
+ if scheduler_type == "pndm":
74
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
75
+ elif scheduler_type == "lms":
76
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
77
+ elif scheduler_type == "heun":
78
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
79
+ elif scheduler_type == "euler":
80
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
81
+ elif scheduler_type == "euler-ancestral":
82
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
83
+ elif scheduler_type == "dpm-multi":
84
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
85
+ elif scheduler_type == "dpm-single":
86
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
87
+ elif scheduler_type == "kdpm2-ancestral":
88
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
89
+ elif scheduler_type == "kdpm2":
90
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
91
+ elif scheduler_type == "unipc-multi":
92
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
93
+ elif scheduler_type == "ddim":
94
+ scheduler = DDIMScheduler.from_config(
95
+ self.orginal_scheduler_config,
96
+ steps_offset=1,
97
+ clip_sample=False,
98
+ set_alpha_to_one=False,
99
+ )
100
+ elif scheduler_type == "ddpm":
101
+ scheduler = DDPMScheduler.from_config(
102
+ self.orginal_scheduler_config,
103
+ )
104
+ elif scheduler_type == "deis-multi":
105
+ scheduler = DEISMultistepScheduler.from_config(
106
+ self.orginal_scheduler_config,
107
+ )
108
+ else:
109
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
110
+ return scheduler
111
+
112
+
113
+ def parse_arguments():
114
+
115
+ parser = argparse.ArgumentParser()
116
+ parser.add_argument(
117
+ "--pretrained_model_name_or_path",
118
+ type=str,
119
+ default="runwayml/stable-diffusion-v1-5",
120
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
121
+ )
122
+ parser.add_argument(
123
+ "--inference_steps",
124
+ type=int,
125
+ default=50,
126
+ help="The number of unet inference steps.",
127
+ )
128
+ parser.add_argument(
129
+ "--benchmark_steps",
130
+ type=int,
131
+ default=10,
132
+ help="The number of performance benchmark steps.",
133
+ )
134
+ parser.add_argument(
135
+ "--task_name",
136
+ type=str,
137
+ default="all",
138
+ choices=[
139
+ "text2img",
140
+ "img2img",
141
+ "inpaint_legacy",
142
+ "all",
143
+ ],
144
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
145
+ )
146
+ parser.add_argument(
147
+ "--parse_prompt_type",
148
+ type=str,
149
+ default="raw",
150
+ choices=[
151
+ "raw",
152
+ "lpw",
153
+ ],
154
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
155
+ )
156
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
157
+ parser.add_argument(
158
+ "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
159
+ )
160
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
161
+ parser.add_argument(
162
+ "--scheduler",
163
+ type=str,
164
+ default="euler-ancestral",
165
+ choices=[
166
+ "pndm",
167
+ "lms",
168
+ "euler",
169
+ "euler-ancestral",
170
+ "dpm-multi",
171
+ "dpm-single",
172
+ "unipc-multi",
173
+ "ddim",
174
+ "ddpm",
175
+ "deis-multi",
176
+ "heun",
177
+ "kdpm2-ancestral",
178
+ "kdpm2",
179
+ ],
180
+ help="The scheduler type of stable diffusion.",
181
+ )
182
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
183
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
184
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
185
+ return parser.parse_args()
186
+
187
+
188
+ def main(args):
189
+
190
+ seed = 1024
191
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
192
+ pipe = StableDiffusionPipeline.from_pretrained(
193
+ args.pretrained_model_name_or_path,
194
+ safety_checker=None,
195
+ feature_extractor=None,
196
+ requires_safety_checker=False,
197
+ paddle_dtype=paddle_dtype,
198
+ )
199
+ scheduler = change_scheduler(pipe, args.scheduler)
200
+ pipe.scheduler = scheduler
201
+
202
+ if args.attention_type == "all":
203
+ args.attention_type = ["raw", "cutlass", "flash"]
204
+ else:
205
+ args.attention_type = [args.attention_type]
206
+
207
+ for attention_type in args.attention_type:
208
+ if attention_type == "raw":
209
+ pipe.disable_xformers_memory_efficient_attention()
210
+ else:
211
+ try:
212
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
213
+ except Exception as e:
214
+ if attention_type == "flash":
215
+ warnings.warn(
216
+ "Attention type flash is not supported on your GPU! It requires a GPU such as a 3060, 3070, 3080, 3090, 4060, 4070, 4080, 4090, A30 or A100."
217
+ )
218
+ continue
219
+ else:
220
+ raise ValueError(e)
221
+
222
+ if not args.use_fp16 and attention_type == "flash":
223
+ print("Flash attention does not support dtype=float32! Please use float16 or bfloat16. Skipping this attention type.")
224
+ continue
225
+
226
+ width = args.width
227
+ height = args.height
228
+ pipe.set_progress_bar_config(disable=False)
229
+
230
+ folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
231
+ os.makedirs(folder, exist_ok=True)
232
+ if args.task_name in ["text2img", "all"]:
233
+ init_image = load_image(
234
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
235
+ )
236
+ # text2img
237
+ prompt = "bird"
238
+ time_costs = []
239
+ # warmup
240
+ pipe(
241
+ prompt,
242
+ num_inference_steps=10,
243
+ height=height,
244
+ width=width,
245
+ )
246
+ print("==> Test text2img performance.")
247
+ for step in trange(args.benchmark_steps):
248
+ start = time.time()
249
+ paddle.seed(seed)
250
+ images = pipe(
251
+ prompt,
252
+ num_inference_steps=args.inference_steps,
253
+ height=height,
254
+ width=width,
255
+ ).images
256
+ latency = time.time() - start
257
+ time_costs += [latency]
258
+ # print(f"No {step:3d} time cost: {latency:2f} s")
259
+ print(
260
+ f"Attention type: {attention_type}, "
261
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
262
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
263
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
264
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
265
+ )
266
+ images[0].save(f"{folder}/text2img.png")
267
+
268
+ if args.task_name in ["img2img", "all"]:
269
+ pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
270
+ pipe_img2img.set_progress_bar_config(disable=False)
271
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
272
+ init_image = load_image(img_url).resize((width, height))
273
+ prompt = "A fantasy landscape, trending on artstation"
274
+ time_costs = []
275
+ # warmup
276
+ pipe_img2img(
277
+ prompt,
278
+ image=init_image,
279
+ num_inference_steps=20,
280
+ height=height,
281
+ width=width,
282
+ strength=args.strength,
283
+ )
284
+ print("==> Test img2img performance.")
285
+ for step in trange(args.benchmark_steps):
286
+ start = time.time()
287
+ paddle.seed(seed)
288
+ images = pipe_img2img(
289
+ prompt,
290
+ image=init_image,
291
+ num_inference_steps=args.inference_steps,
292
+ height=height,
293
+ width=width,
294
+ strength=args.strength,
295
+ ).images
296
+ latency = time.time() - start
297
+ time_costs += [latency]
298
+ # print(f"No {step:3d} time cost: {latency:2f} s")
299
+ print(
300
+ f"Attention type: {attention_type}, "
301
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
302
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
303
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
304
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
305
+ )
306
+ images[0].save(f"{folder}/img2img.png")
307
+
308
+ if args.task_name in ["inpaint_legacy", "all"]:
309
+ pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
310
+ pipe_inpaint.set_progress_bar_config(disable=False)
311
+ img_url = (
312
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
313
+ )
314
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
315
+ init_image = load_image(img_url).resize((width, height))
316
+ mask_image = load_image(mask_url).resize((width, height))
317
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
318
+ time_costs = []
319
+ task_name = "inpaint_legacy"
320
+ pipe_inpaint(
321
+ prompt,
322
+ image=init_image,
323
+ mask_image=mask_image,
324
+ num_inference_steps=20,
325
+ height=height,
326
+ width=width,
327
+ strength=args.strength,
328
+ )
329
+ print(f"==> Test {task_name} performance.")
330
+ for step in trange(args.benchmark_steps):
331
+ start = time.time()
332
+ paddle.seed(seed)
333
+ images = pipe_inpaint(
334
+ prompt,
335
+ image=init_image,
336
+ mask_image=mask_image,
337
+ num_inference_steps=args.inference_steps,
338
+ height=height,
339
+ width=width,
340
+ strength=args.strength,
341
+ ).images
342
+ latency = time.time() - start
343
+ time_costs += [latency]
344
+ # print(f"No {step:3d} time cost: {latency:2f} s")
345
+ print(
346
+ f"Attention type: {attention_type}, "
347
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
348
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
349
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
350
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
351
+ )
352
+ images[0].save(f"{folder}/{task_name}.png")
353
+
354
+
355
+ if __name__ == "__main__":
356
+ args = parse_arguments()
357
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py ADDED
@@ -0,0 +1,417 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ import torch
20
+
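+ # Stash torch's built-in scaled_dot_product_attention and remove it, so that the "raw"
+ # attention benchmark cannot silently fall back to SDPA; it is restored below when
+ # attention_type == "sdp".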
21
+ torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
22
+ delattr(torch.nn.functional, "scaled_dot_product_attention")
23
+
24
+ import cv2
25
+ import numpy as np
26
+ from diffusers import (
27
+ DDIMScheduler,
28
+ DDPMScheduler,
29
+ DEISMultistepScheduler,
30
+ DPMSolverMultistepScheduler,
31
+ DPMSolverSinglestepScheduler,
32
+ EulerAncestralDiscreteScheduler,
33
+ EulerDiscreteScheduler,
34
+ HeunDiscreteScheduler,
35
+ KDPM2AncestralDiscreteScheduler,
36
+ KDPM2DiscreteScheduler,
37
+ LMSDiscreteScheduler,
38
+ PNDMScheduler,
39
+ StableDiffusionImg2ImgPipeline,
40
+ StableDiffusionInpaintPipeline,
41
+ StableDiffusionPipeline,
42
+ UniPCMultistepScheduler,
43
+ )
44
+ from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
45
+ from diffusers.utils import load_image
46
+ from PIL import Image
47
+ from tqdm.auto import trange
48
+
49
+
50
+ def get_canny_image(image, args):
51
+ if isinstance(image, Image.Image):
52
+ image = np.array(image)
53
+ image = cv2.Canny(image, args.low_threshold, args.high_threshold)
54
+ image = image[:, :, None]
55
+ image = np.concatenate([image, image, image], axis=2)
56
+ canny_image = Image.fromarray(image)
57
+ return canny_image
58
+
59
+
60
+ def strtobool(v):
61
+ if isinstance(v, bool):
62
+ return v
63
+ if v.lower() in ("yes", "true", "t", "y", "1"):
64
+ return True
65
+ elif v.lower() in ("no", "false", "f", "n", "0"):
66
+ return False
67
+ else:
68
+ raise ValueError(
69
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
70
+ )
71
+
72
+
73
+ def change_scheduler(self, scheduler_type="ddim"):
74
+ self.orginal_scheduler_config = self.scheduler.config
75
+ scheduler_type = scheduler_type.lower()
76
+ if scheduler_type == "pndm":
77
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
78
+ elif scheduler_type == "lms":
79
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
80
+ elif scheduler_type == "heun":
81
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
82
+ elif scheduler_type == "euler":
83
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
84
+ elif scheduler_type == "euler-ancestral":
85
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
86
+ elif scheduler_type == "dpm-multi":
87
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
88
+ elif scheduler_type == "dpm-single":
89
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
90
+ elif scheduler_type == "kdpm2-ancestral":
91
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
92
+ elif scheduler_type == "kdpm2":
93
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
94
+ elif scheduler_type == "unipc-multi":
95
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
96
+ elif scheduler_type == "ddim":
97
+ scheduler = DDIMScheduler.from_config(
98
+ self.orginal_scheduler_config,
99
+ steps_offset=1,
100
+ clip_sample=False,
101
+ set_alpha_to_one=False,
102
+ )
103
+ elif scheduler_type == "ddpm":
104
+ scheduler = DDPMScheduler.from_config(
105
+ self.orginal_scheduler_config,
106
+ )
107
+ elif scheduler_type == "deis-multi":
108
+ scheduler = DEISMultistepScheduler.from_config(
109
+ self.orginal_scheduler_config,
110
+ )
111
+ else:
112
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
113
+ return scheduler
114
+
115
+
116
+ def parse_arguments():
117
+
118
+ parser = argparse.ArgumentParser()
119
+ parser.add_argument(
120
+ "--pretrained_model_name_or_path",
121
+ type=str,
122
+ default="runwayml/stable-diffusion-v1-5",
123
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
124
+ )
125
+ parser.add_argument(
126
+ "--inference_steps",
127
+ type=int,
128
+ default=50,
129
+ help="The number of unet inference steps.",
130
+ )
131
+ parser.add_argument(
132
+ "--benchmark_steps",
133
+ type=int,
134
+ default=10,
135
+ help="The number of performance benchmark steps.",
136
+ )
137
+ parser.add_argument(
138
+ "--task_name",
139
+ type=str,
140
+ default="all",
141
+ choices=[
142
+ "text2img",
143
+ "img2img",
144
+ "inpaint_legacy",
145
+ "all",
146
+ ],
147
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
148
+ )
149
+ parser.add_argument(
150
+ "--parse_prompt_type",
151
+ type=str,
152
+ default="raw",
153
+ choices=[
154
+ "raw",
155
+ "lpw",
156
+ ],
157
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
158
+ )
159
+ parser.add_argument(
160
+ "--channels_last",
161
+ type=strtobool,
162
+ default=False,
163
+ help="Whether to use channels_last",
164
+ )
165
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
166
+ parser.add_argument("--tf32", type=strtobool, default=True, help="tf32")
167
+ parser.add_argument("--compile", type=strtobool, default=False, help="compile")
168
+ parser.add_argument(
169
+ "--attention_type",
170
+ type=str,
171
+ default="sdp",
172
+ choices=[
173
+ "raw",
174
+ "sdp",
175
+ ],
176
+ help="attention_type.",
177
+ )
178
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
179
+ parser.add_argument(
180
+ "--scheduler",
181
+ type=str,
182
+ default="euler-ancestral",
183
+ choices=[
184
+ "pndm",
185
+ "lms",
186
+ "euler",
187
+ "euler-ancestral",
188
+ "dpm-multi",
189
+ "dpm-single",
190
+ "unipc-multi",
191
+ "ddim",
192
+ "ddpm",
193
+ "deis-multi",
194
+ "heun",
195
+ "kdpm2-ancestral",
196
+ "kdpm2",
197
+ ],
198
+ help="The scheduler type of stable diffusion.",
199
+ )
200
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
201
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
202
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
203
+ return parser.parse_args()
204
+
205
+
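+ # Standalone re-implementations of diffusers' attn_processors / set_attn_processor helpers:
+ # they recursively collect and swap the attention processors of a module's children.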
206
+ def attn_processors(self):
207
+ processors = {}
208
+
209
+ def fn_recursive_add_processors(name: str, module, processors):
210
+ if hasattr(module, "set_processor"):
211
+ processors[f"{name}.processor"] = module.processor
212
+
213
+ for sub_name, child in module.named_children():
214
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
215
+
216
+ return processors
217
+
218
+ for name, module in self.named_children():
219
+ fn_recursive_add_processors(name, module, processors)
220
+
221
+ return processors
222
+
223
+
224
+ def set_attn_processor(self, processor):
225
+ count = len(attn_processors(self).keys())
226
+
227
+ if isinstance(processor, dict) and len(processor) != count:
228
+ raise ValueError(
229
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
230
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
231
+ )
232
+
233
+ def fn_recursive_attn_processor(name: str, module, processor):
234
+ if hasattr(module, "set_processor"):
235
+ if not isinstance(processor, dict):
236
+ module.set_processor(processor)
237
+ else:
238
+ module.set_processor(processor.pop(f"{name}.processor"))
239
+
240
+ for sub_name, child in module.named_children():
241
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
242
+
243
+ for name, module in self.named_children():
244
+ fn_recursive_attn_processor(name, module, processor)
245
+
246
+
247
+ def main(args):
248
+ if args.tf32:
249
+ torch.backends.cuda.matmul.allow_tf32 = True
250
+ else:
251
+ torch.backends.cuda.matmul.allow_tf32 = False
252
+
253
+ seed = 1024
254
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
255
+ pipe = StableDiffusionPipeline.from_pretrained(
256
+ args.pretrained_model_name_or_path,
257
+ safety_checker=None,
258
+ feature_extractor=None,
259
+ requires_safety_checker=False,
260
+ torch_dtype=torch_dtype,
261
+ )
262
+ scheduler = change_scheduler(pipe, args.scheduler)
263
+ pipe.scheduler = scheduler
264
+ if args.device_id >= 0:
265
+ pipe.to(f"cuda:{args.device_id}")
266
+
267
+ if args.attention_type == "all":
268
+ args.attention_type = ["raw", "sdp"]
269
+ else:
270
+ args.attention_type = [args.attention_type]
271
+
272
+ for attention_type in args.attention_type:
273
+ attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
274
+ if attention_type == "sdp":
275
+ torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
276
+ set_attn_processor(pipe.unet, attn_prrocessor_cls())
277
+ set_attn_processor(pipe.vae, attn_prrocessor_cls())
278
+
279
+ if args.channels_last:
280
+ pipe.unet.to(memory_format=torch.channels_last)
281
+
282
+ if args.compile:
283
+ print("Run torch compile")
284
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
285
+
286
+ width = args.width
287
+ height = args.height
288
+ pipe.set_progress_bar_config(disable=False)
289
+
290
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
291
+ os.makedirs(folder, exist_ok=True)
292
+ if args.task_name in ["text2img", "all"]:
293
+ init_image = load_image(
294
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
295
+ )
296
+ # text2img
297
+ prompt = "bird"
298
+ time_costs = []
299
+ # warmup
300
+ pipe(
301
+ prompt,
302
+ num_inference_steps=10,
303
+ height=height,
304
+ width=width,
305
+ )
306
+ print("==> Test text2img performance.")
307
+ for step in trange(args.benchmark_steps):
308
+ start = time.time()
309
+ torch.cuda.manual_seed(seed)
310
+ images = pipe(
311
+ prompt,
312
+ num_inference_steps=args.inference_steps,
313
+ height=height,
314
+ width=width,
315
+ ).images
316
+ latency = time.time() - start
317
+ time_costs += [latency]
318
+ # print(f"No {step:3d} time cost: {latency:2f} s")
319
+ print(
320
+ f"Attention type: {attention_type}, "
321
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
322
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
323
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
324
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
325
+ )
326
+ images[0].save(f"{folder}/text2img.png")
327
+
328
+ if args.task_name in ["img2img", "all"]:
329
+ pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
330
+ pipe_img2img.set_progress_bar_config(disable=False)
331
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
332
+ init_image = load_image(img_url).resize((width, height))
333
+ prompt = "A fantasy landscape, trending on artstation"
334
+ time_costs = []
335
+ # warmup
336
+ pipe_img2img(
337
+ prompt,
338
+ image=init_image,
339
+ num_inference_steps=20,
340
+ height=height,
341
+ width=width,
342
+ strength=args.strength,
343
+ )
344
+ print("==> Test img2img performance.")
345
+ for step in trange(args.benchmark_steps):
346
+ start = time.time()
347
+ torch.cuda.manual_seed(seed)
348
+ images = pipe_img2img(
349
+ prompt,
350
+ image=init_image,
351
+ num_inference_steps=args.inference_steps,
352
+ height=height,
353
+ width=width,
354
+ strength=args.strength,
355
+ ).images
356
+ latency = time.time() - start
357
+ time_costs += [latency]
358
+ # print(f"No {step:3d} time cost: {latency:2f} s")
359
+ print(
360
+ f"Attention type: {attention_type}, "
361
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
362
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
363
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
364
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
365
+ )
366
+ images[0].save(f"{folder}/img2img.png")
367
+
368
+ if args.task_name in ["inpaint_legacy", "all"]:
369
+ pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
370
+ pipe_inpaint.set_progress_bar_config(disable=False)
371
+ img_url = (
372
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
373
+ )
374
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
375
+ init_image = load_image(img_url).resize((width, height))
376
+ mask_image = load_image(mask_url).resize((width, height))
377
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
378
+ time_costs = []
379
+ task_name = "inpaint_legacy"
380
+ pipe_inpaint(
381
+ prompt,
382
+ image=init_image,
383
+ mask_image=mask_image,
384
+ num_inference_steps=20,
385
+ height=height,
386
+ width=width,
387
+ strength=args.strength,
388
+ )
389
+ print(f"==> Test {task_name} performance.")
390
+ for step in trange(args.benchmark_steps):
391
+ start = time.time()
392
+ torch.cuda.manual_seed(seed)
393
+ images = pipe_inpaint(
394
+ prompt,
395
+ image=init_image,
396
+ mask_image=mask_image,
397
+ num_inference_steps=args.inference_steps,
398
+ height=height,
399
+ width=width,
400
+ strength=args.strength,
401
+ ).images
402
+ latency = time.time() - start
403
+ time_costs += [latency]
404
+ # print(f"No {step:3d} time cost: {latency:2f} s")
405
+ print(
406
+ f"Attention type: {attention_type}, "
407
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
408
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
409
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
410
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
411
+ )
412
+ images[0].save(f"{folder}/{task_name}.png")
413
+
414
+
415
+ if __name__ == "__main__":
416
+ args = parse_arguments()
417
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # Stable Diffusion 3 High-Performance Inference
2
+
3
+ - Paddle Inference provides a high-performance inference implementation of the Stable Diffusion 3 model, improving inference performance by 70%+.
4
+ Environment setup:
5
+ ```shell
6
+ # Install triton and adapt it for paddle
7
+ python -m pip install triton
8
+ python -m pip install git+https://github.com/zhoutianzi666/UseTritonInPaddle.git
9
+ python -c "import use_triton_in_paddle; use_triton_in_paddle.make_triton_compatible_with_paddle()"
10
+
11
+ # Install the develop version of paddle; choose the build that matches your CUDA version (CUDA 12.3 is used here)
12
+ python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/
13
+
14
+ # Install the paddlemix package to use the custom operators integrated in it.
15
+ python -m pip install paddlemix
16
+
17
+ # Specify the path to libCutlassGemmEpilogue.so
19
+ # For details, see https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/README.md
19
+ export LD_LIBRARY_PATH=/your_dir/Paddle/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/build:$LD_LIBRARY_PATH
20
+ - Note: this step lets static-graph inference use the Cutlass fused operators for extra speed, but it is optional.
21
+ If you do not use Cutlass, set `exp_enable_use_cutlass` in `./text_to_image_generation-stable_diffusion_3.py` to False.
22
+ -
23
+ ```
24
+
25
+ High-performance inference command:
26
+ ```shell
27
+ # Run FP16 inference
28
+ python text_to_image_generation-stable_diffusion_3.py --dtype float16 --height 512 --width 512 \
29
+ --num-inference-steps 50 --inference_optimize 1 \
30
+ --benchmark 1
31
+ ```
32
+ Note: --inference_optimize 1 enables inference optimization, and --benchmark 1 enables the performance benchmark.
33
+
34
+
35
+ - Performance measured on an NVIDIA A100-SXM4-40GB:
36
+
37
+ | Paddle Inference | PyTorch | Paddle dygraph |
38
+ | --------------- | ------------ | ------------ |
39
+ | 1.2 s | 1.78 s | 4.202 s |
40
+
41
+
42
+ ## Multi-GPU inference for the Paddle Stable Diffusion 3 model
43
+ ### How Data Parallel works
44
+ - In SD3, a single prompt with CFG requires generating the unconditional and text-guided branches together, so the MM-DiT blocks receive an input with batch_size=2.
45
+ In the multi-GPU scheme we therefore split this batch of 2 across two cards, so each card carries only half of the floating-point compute.
46
+ After the computation finishes, the results from both cards are gathered back together, and the output is identical to the single-card result (a minimal sketch of this idea follows below).
47
+
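A minimal sketch of the data-parallel idea (not the PaddleMIX implementation): `toy_block`, the tensor shapes, and the variable names are made up for illustration, and in practice the two halves would run on separate GPUs and be gathered with a collective op rather than computed in one process.

```python
import paddle

paddle.seed(0)
toy_block = paddle.nn.Linear(64, 64)        # illustrative stand-in for the MM-DiT blocks
latents = paddle.randn([2, 16, 64])         # CFG batch: [unconditional, text-guided]

full = toy_block(latents)                   # single-card computation

half_0 = toy_block(latents[0:1])            # would run on GPU 0
half_1 = toy_block(latents[1:2])            # would run on GPU 1
gathered = paddle.concat([half_0, half_1], axis=0)   # gather the per-card results

print(bool(paddle.allclose(full, gathered)))  # True: identical to the single-card result
```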
48
+ ### How Model Parallel works
49
+ - In SD3, the Linear and Attention layers contain a large number of GEMMs (General Matrix Multiply); when generating high-resolution images, both the GEMM compute and the size of the pretrained weights grow linearly.
50
+ We therefore split these GEMMs across two cards, so each card holds half of the compute and half of the weights, which lowers both the floating-point load and the GPU memory usage per card (see the sketch below).
51
+
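A minimal sketch of the column-wise GEMM split, again with made-up sizes; it only illustrates that concatenating the partial results reproduces the full GEMM, whereas the actual pipeline is assumed to shard the pretrained weights across cards rather than slicing a full copy at run time.

```python
import paddle

paddle.seed(0)
x = paddle.randn([2, 64])                   # activations entering a Linear layer
w = paddle.randn([64, 128])                 # full weight matrix

full = paddle.matmul(x, w)                  # single-card GEMM

w0, w1 = w[:, :64], w[:, 64:]               # each card stores half of the output columns
partial_0 = paddle.matmul(x, w0)            # would run on GPU 0
partial_1 = paddle.matmul(x, w1)            # would run on GPU 1
merged = paddle.concat([partial_0, partial_1], axis=-1)  # concatenate the column blocks

print(bool(paddle.allclose(full, merged)))  # True: the full GEMM is reproduced
```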
52
+ ### How to enable multi-GPU inference
53
+ - Paddle Inference provides multi-GPU inference for the SD3 model: set `mp_size 2` to enable Model Parallel and `dp_size 2` to enable Data Parallel.
54
+ Use `python -m paddle.distributed.launch --gpus "0,1,2,3"` to choose which cards run the inference, where `--gpus "0,1,2,3"` lists the GPU ids to enable.
55
+ If only two cards are needed, specify just those two, e.g. `python -m paddle.distributed.launch --gpus "0,1"`, and also set the parallel method and degree, e.g. `mp_size 2` or `dp_size 2`.
56
+
57
+ - Note that `mp_size` must not exceed the input batch_size, and the sum of `mp_size` and `dp_size` must not exceed the total number of cards on the machine.
58
+ - High-performance multi-GPU inference command:
59
+ ```shell
60
+ # Run multi-GPU inference
61
+ python -m paddle.distributed.launch --gpus "0,1,2,3" text_to_image_generation-stable_diffusion_3.py \
62
+ --dtype float16 \
63
+ --height 1024 \
64
+ --width 1024 \
65
+ --num-inference-steps 20 \
66
+ --inference_optimize 1 \
67
+ --mp_size 2 \
68
+ --dp_size 2 \
69
+ --benchmark 1
70
+ ```
71
+ Note: --inference_optimize 1 enables inference optimization, and --benchmark 1 enables the performance benchmark.
72
+
73
+ ## Performance measured on an NVIDIA A800-SXM4-80GB:
74
+
75
+ | Paddle mp_size=2 & dp_size=2 | Paddle mp_size=2 | Paddle dp_size=2 | Paddle Single Card | Paddle dygraph |
76
+ | ---------------------------- | ------------------- | ---------------- | ------------------ | ------------ |
77
+ | 0.99s | 1.581 s | 1.319 s | 2.376 s | 3.2 s |
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py ADDED
@@ -0,0 +1,264 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+ import warnings
19
+
20
+ import cv2
21
+ import numpy as np
22
+ import paddle
23
+ from PIL import Image
24
+ from tqdm.auto import trange
25
+
26
+ from ppdiffusers import (
27
+ FlowMatchEulerDiscreteScheduler,
28
+ DDIMScheduler,
29
+ DDPMScheduler,
30
+ DEISMultistepScheduler,
31
+ DPMSolverMultistepScheduler,
32
+ DPMSolverSinglestepScheduler,
33
+ EulerAncestralDiscreteScheduler,
34
+ EulerDiscreteScheduler,
35
+ HeunDiscreteScheduler,
36
+ KDPM2AncestralDiscreteScheduler,
37
+ KDPM2DiscreteScheduler,
38
+ LMSDiscreteScheduler,
39
+ PNDMScheduler,
40
+ StableDiffusion3Pipeline,
41
+ UniPCMultistepScheduler,
42
+ )
43
+ from ppdiffusers.utils import load_image
44
+
45
+
46
+
47
+ def strtobool(v):
48
+ if isinstance(v, bool):
49
+ return v
50
+ if v.lower() in ("yes", "true", "t", "y", "1"):
51
+ return True
52
+ elif v.lower() in ("no", "false", "f", "n", "0"):
53
+ return False
54
+ else:
55
+ raise ValueError(
56
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
57
+ )
58
+
59
+
60
+ def change_scheduler(self, scheduler_type="ddim"):
61
+ self.orginal_scheduler_config = self.scheduler.config
62
+ scheduler_type = scheduler_type.lower()
63
+ if scheduler_type == "flow":
64
+ scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
65
+ elif scheduler_type == "pndm":
66
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
67
+ elif scheduler_type == "lms":
68
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
69
+ elif scheduler_type == "heun":
70
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
71
+ elif scheduler_type == "euler":
72
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
73
+ elif scheduler_type == "euler-ancestral":
74
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
75
+ elif scheduler_type == "dpm-multi":
76
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
77
+ elif scheduler_type == "dpm-single":
78
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
79
+ elif scheduler_type == "kdpm2-ancestral":
80
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
81
+ elif scheduler_type == "kdpm2":
82
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
83
+ elif scheduler_type == "unipc-multi":
84
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
85
+ elif scheduler_type == "ddim":
86
+ scheduler = DDIMScheduler.from_config(
87
+ self.orginal_scheduler_config,
88
+ steps_offset=1,
89
+ clip_sample=False,
90
+ set_alpha_to_one=False,
91
+ )
92
+ elif scheduler_type == "ddpm":
93
+ scheduler = DDPMScheduler.from_config(
94
+ self.orginal_scheduler_config,
95
+ )
96
+ elif scheduler_type == "deis-multi":
97
+ scheduler = DEISMultistepScheduler.from_config(
98
+ self.orginal_scheduler_config,
99
+ )
100
+ else:
101
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
102
+ return scheduler
103
+
104
+
105
+ def parse_arguments():
106
+
107
+ parser = argparse.ArgumentParser()
108
+ parser.add_argument(
109
+ "--pretrained_model_name_or_path",
110
+ type=str,
111
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
112
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
113
+ )
114
+ parser.add_argument(
115
+ "--inference_steps",
116
+ type=int,
117
+ default=50,
118
+ help="The number of unet inference steps.",
119
+ )
120
+ parser.add_argument(
121
+ "--benchmark_steps",
122
+ type=int,
123
+ default=10,
124
+ help="The number of performance benchmark steps.",
125
+ )
126
+ parser.add_argument(
127
+ "--task_name",
128
+ type=str,
129
+ default="all",
130
+ choices=[
131
+ "text2img",
132
+ "img2img",
133
+ "inpaint_legacy",
134
+ "all",
135
+ ],
136
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
137
+ )
138
+ parser.add_argument(
139
+ "--parse_prompt_type",
140
+ type=str,
141
+ default="raw",
142
+ choices=[
143
+ "raw",
144
+ "lpw",
145
+ ],
146
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
147
+ )
148
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
149
+ parser.add_argument(
150
+ "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
151
+ )
152
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
153
+ parser.add_argument(
154
+ "--scheduler",
155
+ type=str,
156
+ default="euler-ancestral",
157
+ choices=[
158
+ "flow",
159
+ "pndm",
160
+ "lms",
161
+ "euler",
162
+ "euler-ancestral",
163
+ "dpm-multi",
164
+ "dpm-single",
165
+ "unipc-multi",
166
+ "ddim",
167
+ "ddpm",
168
+ "deis-multi",
169
+ "heun",
170
+ "kdpm2-ancestral",
171
+ "kdpm2",
172
+ ],
173
+ help="The scheduler type of stable diffusion.",
174
+ )
175
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
176
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
177
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
178
+ return parser.parse_args()
179
+
180
+
181
+ def main(args):
182
+
183
+ seed = 1024
184
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
185
+ pipe = StableDiffusion3Pipeline.from_pretrained(
186
+ args.pretrained_model_name_or_path,
187
+ safety_checker=None,
188
+ feature_extractor=None,
189
+ requires_safety_checker=False,
190
+ paddle_dtype=paddle_dtype,
191
+ )
192
+ scheduler = change_scheduler(pipe, args.scheduler)
193
+ pipe.scheduler = scheduler
194
+
195
+ if args.attention_type == "all":
196
+ args.attention_type = ["raw", "cutlass", "flash"]
197
+ else:
198
+ args.attention_type = [args.attention_type]
199
+
200
+ for attention_type in args.attention_type:
201
+ if attention_type == "raw":
202
+ pipe.disable_xformers_memory_efficient_attention()
203
+ else:
204
+ try:
205
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
206
+ except Exception as e:
207
+ if attention_type == "flash":
208
+ warnings.warn(
209
+ "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc."
210
+ )
211
+ continue
212
+ else:
213
+ raise ValueError(e)
214
+
215
+ if not args.use_fp16 and attention_type == "flash":
216
+ print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!")
217
+ continue
218
+
219
+ width = args.width
220
+ height = args.height
221
+ pipe.set_progress_bar_config(disable=False)
222
+
223
+ folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
224
+ os.makedirs(folder, exist_ok=True)
225
+ if args.task_name in ["text2img", "all"]:
226
+ init_image = load_image(
227
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
228
+ )
229
+ # text2img
230
+ prompt = "bird"
231
+ time_costs = []
232
+ # warmup
233
+ pipe(
234
+ prompt,
235
+ num_inference_steps=10,
236
+ height=height,
237
+ width=width,
238
+ )
239
+ print("==> Test text2img performance.")
240
+ for step in trange(args.benchmark_steps):
241
+ start = time.time()
242
+ paddle.seed(seed)
243
+ images = pipe(
244
+ prompt,
245
+ num_inference_steps=args.inference_steps,
246
+ height=height,
247
+ width=width,
248
+ ).images
249
+ latency = time.time() - start
250
+ time_costs += [latency]
251
+ # print(f"No {step:3d} time cost: {latency:2f} s")
252
+ print(
253
+ f"Attention type: {attention_type}, "
254
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
255
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
256
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
257
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
258
+ )
259
+ images[0].save(f"{folder}/text2img.png")
260
+
261
+
262
+ if __name__ == "__main__":
263
+ args = parse_arguments()
264
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py ADDED
@@ -0,0 +1,325 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ import torch
20
+
21
+ # torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
22
+ # delattr(torch.nn.functional, "scaled_dot_product_attention")
23
+
24
+ import cv2
25
+ import numpy as np
26
+ from diffusers import (
27
+ FlowMatchEulerDiscreteScheduler,
28
+ DDIMScheduler,
29
+ DDPMScheduler,
30
+ DEISMultistepScheduler,
31
+ DPMSolverMultistepScheduler,
32
+ DPMSolverSinglestepScheduler,
33
+ EulerAncestralDiscreteScheduler,
34
+ EulerDiscreteScheduler,
35
+ HeunDiscreteScheduler,
36
+ KDPM2AncestralDiscreteScheduler,
37
+ KDPM2DiscreteScheduler,
38
+ LMSDiscreteScheduler,
39
+ PNDMScheduler,
40
+ StableDiffusion3Pipeline,
41
+ UniPCMultistepScheduler,
42
+ )
43
+ from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
44
+ from diffusers.utils import load_image
45
+ from PIL import Image
46
+ from tqdm.auto import trange
47
+
48
+
49
+
50
+ def strtobool(v):
51
+ if isinstance(v, bool):
52
+ return v
53
+ if v.lower() in ("yes", "true", "t", "y", "1"):
54
+ return True
55
+ elif v.lower() in ("no", "false", "f", "n", "0"):
56
+ return False
57
+ else:
58
+ raise ValueError(
59
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
60
+ )
61
+
62
+
63
+ def change_scheduler(self, scheduler_type="ddim"):
64
+ self.orginal_scheduler_config = self.scheduler.config
65
+ scheduler_type = scheduler_type.lower()
66
+ if scheduler_type == "flow":
67
+ scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
68
+ elif scheduler_type == "pndm":
69
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
70
+ elif scheduler_type == "lms":
71
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
72
+ elif scheduler_type == "heun":
73
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
74
+ elif scheduler_type == "euler":
75
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
76
+ elif scheduler_type == "euler-ancestral":
77
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
78
+ elif scheduler_type == "dpm-multi":
79
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
80
+ elif scheduler_type == "dpm-single":
81
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
82
+ elif scheduler_type == "kdpm2-ancestral":
83
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
84
+ elif scheduler_type == "kdpm2":
85
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
86
+ elif scheduler_type == "unipc-multi":
87
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
88
+ elif scheduler_type == "ddim":
89
+ scheduler = DDIMScheduler.from_config(
90
+ self.orginal_scheduler_config,
91
+ steps_offset=1,
92
+ clip_sample=False,
93
+ set_alpha_to_one=False,
94
+ )
95
+ elif scheduler_type == "ddpm":
96
+ scheduler = DDPMScheduler.from_config(
97
+ self.orginal_scheduler_config,
98
+ )
99
+ elif scheduler_type == "deis-multi":
100
+ scheduler = DEISMultistepScheduler.from_config(
101
+ self.orginal_scheduler_config,
102
+ )
103
+ else:
104
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
105
+ return scheduler
106
+
107
+
108
+ def parse_arguments():
109
+
110
+ parser = argparse.ArgumentParser()
111
+ parser.add_argument(
112
+ "--pretrained_model_name_or_path",
113
+ type=str,
114
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
115
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
116
+ )
117
+ parser.add_argument(
118
+ "--inference_steps",
119
+ type=int,
120
+ default=50,
121
+ help="The number of unet inference steps.",
122
+ )
123
+ parser.add_argument(
124
+ "--benchmark_steps",
125
+ type=int,
126
+ default=10,
127
+ help="The number of performance benchmark steps.",
128
+ )
129
+ parser.add_argument(
130
+ "--task_name",
131
+ type=str,
132
+ default="all",
133
+ choices=[
134
+ "text2img",
135
+ "img2img",
136
+ "inpaint_legacy",
137
+ "all",
138
+ ],
139
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
140
+ )
141
+ parser.add_argument(
142
+ "--parse_prompt_type",
143
+ type=str,
144
+ default="raw",
145
+ choices=[
146
+ "raw",
147
+ "lpw",
148
+ ],
149
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
150
+ )
151
+ parser.add_argument(
152
+ "--channels_last",
153
+ type=strtobool,
154
+ default=False,
155
+ help="Wheter to use channels_last",
156
+ )
157
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
158
+ parser.add_argument("--tf32", type=strtobool, default=True, help="tf32")
159
+ parser.add_argument("--compile", type=strtobool, default=False, help="compile")
160
+ parser.add_argument(
161
+ "--attention_type",
162
+ type=str,
163
+ default="sdp",
164
+ choices=[
165
+ "raw",
166
+ "sdp",
167
+ ],
168
+ help="attention_type.",
169
+ )
170
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
171
+ parser.add_argument(
172
+ "--scheduler",
173
+ type=str,
174
+ default="euler-ancestral",
175
+ choices=[
176
+ "flow",
177
+ "pndm",
178
+ "lms",
179
+ "euler",
180
+ "euler-ancestral",
181
+ "dpm-multi",
182
+ "dpm-single",
183
+ "unipc-multi",
184
+ "ddim",
185
+ "ddpm",
186
+ "deis-multi",
187
+ "heun",
188
+ "kdpm2-ancestral",
189
+ "kdpm2",
190
+ ],
191
+ help="The scheduler type of stable diffusion.",
192
+ )
193
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
194
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
195
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
196
+ return parser.parse_args()
197
+
198
+
199
+ def attn_processors(self):
200
+ processors = {}
201
+
202
+ def fn_recursive_add_processors(name: str, module, processors):
203
+ if hasattr(module, "set_processor"):
204
+ processors[f"{name}.processor"] = module.processor
205
+
206
+ for sub_name, child in module.named_children():
207
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
208
+
209
+ return processors
210
+
211
+ for name, module in self.named_children():
212
+ fn_recursive_add_processors(name, module, processors)
213
+
214
+ return processors
215
+
216
+
217
+ def set_attn_processor(self, processor):
218
+ count = len(attn_processors(self).keys())
219
+
220
+ if isinstance(processor, dict) and len(processor) != count:
221
+ raise ValueError(
222
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
223
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
224
+ )
225
+
226
+ def fn_recursive_attn_processor(name: str, module, processor):
227
+ if hasattr(module, "set_processor"):
228
+ if not isinstance(processor, dict):
229
+ module.set_processor(processor)
230
+ else:
231
+ module.set_processor(processor.pop(f"{name}.processor"))
232
+
233
+ for sub_name, child in module.named_children():
234
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
235
+
236
+ for name, module in self.named_children():
237
+ fn_recursive_attn_processor(name, module, processor)
238
+
239
+
240
+ def main(args):
241
+ if args.tf32:
242
+ torch.backends.cuda.matmul.allow_tf32 = True
243
+ else:
244
+ torch.backends.cuda.matmul.allow_tf32 = False
245
+
246
+ seed = 1024
247
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
248
+ pipe = StableDiffusion3Pipeline.from_pretrained(
249
+ args.pretrained_model_name_or_path,
250
+ safety_checker=None,
251
+ feature_extractor=None,
252
+ requires_safety_checker=False,
253
+ torch_dtype=torch_dtype,
254
+ )
255
+ scheduler = change_scheduler(pipe, args.scheduler)
256
+ pipe.scheduler = scheduler
257
+ if args.device_id >= 0:
258
+ pipe.to(f"cuda:{args.device_id}")
259
+
260
+ if args.attention_type == "all":
261
+ args.attention_type = ["raw", "sdp"]
262
+ else:
263
+ args.attention_type = [args.attention_type]
264
+
265
+ for attention_type in args.attention_type:
266
+ # attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
267
+ # if attention_type == "sdp":
268
+ # torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
269
+ # set_attn_processor(pipe.transformer, attn_prrocessor_cls())
270
+ # set_attn_processor(pipe.vae, attn_prrocessor_cls())
271
+
272
+ # if args.channels_last:
273
+ # pipe.transformer.to(memory_format=torch.channels_last)
274
+
275
+ # if args.compile:
276
+ # print("Run torch compile")
277
+ # pipe.unet = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
278
+
279
+ width = args.width
280
+ height = args.height
281
+ pipe.set_progress_bar_config(disable=False)
282
+
283
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
284
+ os.makedirs(folder, exist_ok=True)
285
+ if args.task_name in ["text2img", "all"]:
286
+ init_image = load_image(
287
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
288
+ )
289
+ # text2img
290
+ prompt = "bird"
291
+ time_costs = []
292
+ # warmup
293
+ pipe(
294
+ prompt,
295
+ num_inference_steps=10,
296
+ height=height,
297
+ width=width,
298
+ )
299
+ print("==> Test text2img performance.")
300
+ for step in trange(args.benchmark_steps):
301
+ start = time.time()
302
+ torch.cuda.manual_seed(seed)
303
+ images = pipe(
304
+ prompt,
305
+ num_inference_steps=args.inference_steps,
306
+ height=height,
307
+ width=width,
308
+ ).images
309
+ latency = time.time() - start
310
+ time_costs += [latency]
311
+ # print(f"No {step:3d} time cost: {latency:2f} s")
312
+ print(
313
+ f"Attention type: {attention_type}, "
314
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
315
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
316
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
317
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
318
+ )
319
+ images[0].save(f"{folder}/text2img.png")
320
+
321
+
322
+
323
+ if __name__ == "__main__":
324
+ args = parse_arguments()
325
+ main(args)
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # attention raw fp16
16
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
17
+
18
+ # attention cutlass fp16
19
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
20
+
21
+ # attention flash fp16
22
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
23
+
24
+
25
+ # attention raw fp32
26
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
27
+
28
+ # attention cutlass fp32
29
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
30
+
31
+ # attention flash fp32
32
+ python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # sd3 does not support attention raw
16
+
17
+ # attention sdp
18
+ python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
19
+
20
+ # attention sdp fp32
21
+ python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import os
16
+
17
+ import paddle
18
+
19
+
20
+ def parse_args():
21
+ parser = argparse.ArgumentParser(
22
+ description=" Use PaddleMIX to accelerate the Stable Diffusion3 image generation model."
23
+ )
24
+ parser.add_argument(
25
+ "--benchmark",
26
+ type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
27
+ default=False,
28
+ help="if set to True, measure inference performance",
29
+ )
30
+ parser.add_argument(
31
+ "--inference_optimize",
32
+ type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
33
+ default=False,
34
+ help="If set to True, all optimizations except Triton are enabled.",
35
+ )
36
+
37
+ parser.add_argument("--height", type=int, default=512, help="Height of the generated image.")
38
+ parser.add_argument("--width", type=int, default=512, help="Width of the generated image.")
39
+ parser.add_argument("--num-inference-steps", type=int, default=50, help="Number of inference steps.")
40
+ parser.add_argument("--dtype", type=str, default="float32", help="Inference data types.")
41
+ parser.add_argument(
42
+ "--mp_size", type=int, default=1, help="This size refers to the degree of parallelism using model parallel."
43
+ )
44
+ parser.add_argument(
45
+ "--dp_size", type=int, default=1, help="This size refers to the degree of parallelism using data parallel."
46
+ )
47
+
48
+ return parser.parse_args()
49
+
50
+
51
+ args = parse_args()
52
+
53
+ if args.inference_optimize:
54
+ os.environ["INFERENCE_OPTIMIZE"] = "True"
55
+ os.environ["INFERENCE_OPTIMIZE_TRITON"] = "True"
56
+ os.environ["INFERENCE_MP_SIZE"] = str(args.mp_size)
57
+ os.environ["INFERENCE_DP_SIZE"] = str(args.dp_size)
58
+ if args.dtype == "float32":
59
+ inference_dtype = paddle.float32
60
+ elif args.dtype == "float16":
61
+ inference_dtype = paddle.float16
62
+
63
+
64
+ import paddle.distributed as dist
65
+ import paddle.distributed.fleet as fleet
66
+
67
+ if args.mp_size > 1 or args.dp_size > 1:
68
+ strategy = fleet.DistributedStrategy()
69
+ model_parallel_size = args.mp_size
70
+ data_parallel_size = args.dp_size
71
+ strategy.hybrid_configs = {"dp_degree": data_parallel_size, "mp_degree": model_parallel_size, "pp_degree": 1}
72
+ fleet.init(is_collective=True, strategy=strategy)
73
+ hcg = fleet.get_hybrid_communicate_group()
74
+ mp_id = hcg.get_model_parallel_rank()
75
+ dp_id = hcg.get_data_parallel_rank()
76
+ rank_id = dist.get_rank()
77
+ mp_degree = hcg.get_model_parallel_world_size()
78
+ dp_degree = hcg.get_data_parallel_world_size()
79
+ assert mp_degree == args.mp_size
80
+ assert dp_degree == args.dp_size
81
+
82
+ # this is for triton kernel cache for dynamic graph
83
+ # os.environ["TRITON_KERNEL_CACHE_DIR"] = f"./tmp/sd3_parallel/{rank_id}"
84
+
85
+ import datetime
86
+
87
+ from ppdiffusers import StableDiffusion3Pipeline
88
+
89
+ pipe = StableDiffusion3Pipeline.from_pretrained(
90
+ "stabilityai/stable-diffusion-3-medium-diffusers",
91
+ paddle_dtype=inference_dtype,
92
+ )
93
+
94
+ pipe.transformer = paddle.incubate.jit.inference(
95
+ pipe.transformer,
96
+ save_model_dir="./tmp/sd3",
97
+ enable_new_ir=True,
98
+ cache_static_model=True,
99
+ exp_enable_use_cutlass=True,
100
+ delete_pass_lists=["add_norm_fuse_pass"],
101
+ )
102
+
103
+ generator = paddle.Generator().manual_seed(42)
104
+ prompt = "A cat holding a sign that says hello world"
105
+
106
+
107
+ image = pipe(
108
+ prompt, num_inference_steps=args.num_inference_steps, width=args.width, height=args.height, generator=generator
109
+ ).images[0]
110
+
111
+ if args.benchmark:
112
+ # warmup
113
+ for i in range(3):
114
+ image = pipe(
115
+ prompt,
116
+ num_inference_steps=args.num_inference_steps,
117
+ width=args.width,
118
+ height=args.height,
119
+ generator=generator,
120
+ ).images[0]
121
+
122
+ repeat_times = 10
123
+ sumtime = 0.0
124
+ for i in range(repeat_times):
125
+ paddle.device.synchronize()
126
+ starttime = datetime.datetime.now()
127
+ image = pipe(
128
+ prompt,
129
+ num_inference_steps=args.num_inference_steps,
130
+ width=args.width,
131
+ height=args.height,
132
+ generator=generator,
133
+ ).images[0]
134
+ paddle.device.synchronize()
135
+ endtime = datetime.datetime.now()
136
+ duringtime = endtime - starttime
137
+ duringtime = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0
138
+ sumtime += duringtime
139
+ print("SD3 end to end time : ", duringtime, "ms")
140
+
141
+ print("SD3 ave end to end time : ", sumtime / repeat_times, "ms")
142
+
143
+ cuda_mem_after_used = paddle.device.cuda.max_memory_allocated() / (1024**3)
144
+ print(f"Max used CUDA memory : {cuda_mem_after_used:.3f} GiB")
145
+
146
+
147
+ rank_id = dist.get_rank()
148
+ if rank_id == 0:
149
+ image.save("text_to_image_generation-stable_diffusion_3-result.png")
VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # High-Performance Deployment of Stable Diffusion XL with PaddleInfer
2
+
3
+ **Table of Contents**
4
+ * [Requirements](#requirements)
5
+ * [Quick Start](#quick-start)
6
+ * [Text-to-Image Generation](#text-to-image-generation)
7
+ * [Image-to-Image Text-Guided Generation](#image-to-image-text-guided-generation)
8
+ * [Text-Guided Image Inpainting](#text-guided-image-inpainting)
9
+
10
+ ⚡️ [PaddleInfer] is an all-scenario, flexible, and highly efficient AI inference deployment tool that gives developers multi-hardware, multi-backend deployment capabilities; switching hardware or inference-engine backends takes a single line of code. This example shows how to use PaddleInfer to deploy a Stable Diffusion XL model trained with our PPDiffusers for high-performance inference across multiple hardware platforms and inference-engine backends.
11
+
12
+ <a name="环境依赖"></a>
13
+
14
+ ## Requirements
15
+
16
+ This example uses PaddleInfer; run the following command to install the dependencies.
17
+
18
+ ```shell
19
+ python -m pip install paddlepaddle-gpu==2.6.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
20
+ ```
21
+
22
+ <a name="快速体验"></a>
23
+
24
+ ## Static Graph Model Export
25
+ ```
26
+ export USE_PPXFORMERS=False
27
+ python export_model.py --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --output_path static_model/stable-diffusion-xl-base-1.0
28
+ ```
29
+ The exported model is saved under the static_model/stable-diffusion-xl-base-1.0 directory.
30
+
31
+ ### Text-to-Image Generation
32
+ ```
33
+ python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name text2img
34
+ ```
35
+
36
+ ### Image-to-Image Text-Guided Generation
37
+ ```
38
+ python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name img2img
39
+ ```
40
+
41
+ ### Text-Guided Image Inpainting
42
+ ```
43
+ python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name inpaint
44
+ ```
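
The steps above can also be chained from a single Python script. Below is a hedged sketch, not shipped with this example, that reuses exactly the commands and flags listed in this README.

```python
import os
import subprocess

env = dict(os.environ, USE_PPXFORMERS="False")
model_dir = "static_model/stable-diffusion-xl-base-1.0"

# static graph export, as in the "Static Graph Model Export" section
subprocess.run(
    [
        "python", "export_model.py",
        "--pretrained_model_name_or_path", "stabilityai/stable-diffusion-xl-base-1.0",
        "--output_path", model_dir,
    ],
    env=env, check=True,
)

# run the three inference tasks documented above
for task in ["text2img", "img2img", "inpaint"]:
    subprocess.run(
        [
            "python", "infer.py",
            "--model_dir", model_dir,
            "--scheduler", "preconfig-euler-ancestral",
            "--backend", "paddle",
            "--device", "gpu",
            "--task_name", task,
        ],
        check=True,
    )
```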
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .value_guided_sampling import ValueGuidedRLPipeline
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+
18
+ from ...models.unet_1d import UNet1DModel
19
+ from ...pipelines import DiffusionPipeline
20
+ from ...utils.dummy_paddle_objects import DDPMScheduler
21
+ from ...utils.paddle_utils import randn_tensor
22
+
23
+
24
+ class ValueGuidedRLPipeline(DiffusionPipeline):
25
+ r"""
26
+ Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states.
27
+
28
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
29
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
30
+
31
+ Parameters:
32
+ value_function ([`UNet1DModel`]):
33
+ A specialized UNet for fine-tuning trajectories based on reward.
34
+ unet ([`UNet1DModel`]):
35
+ UNet architecture to denoise the encoded trajectories.
36
+ scheduler ([`SchedulerMixin`]):
37
+ A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
38
+ application is [`DDPMScheduler`].
39
+ env ():
40
+ An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ value_function: UNet1DModel,
46
+ unet: UNet1DModel,
47
+ scheduler: DDPMScheduler,
48
+ env,
49
+ ):
50
+ super().__init__()
51
+ self.value_function = value_function
52
+ self.unet = unet
53
+ self.scheduler = scheduler
54
+ self.env = env
55
+ self.data = env.get_dataset()
56
+ self.means = {}
57
+ for key in self.data.keys():
58
+ try:
59
+ self.means[key] = self.data[key].mean()
60
+ except Exception:
61
+ pass
62
+ self.stds = {}
63
+ for key in self.data.keys():
64
+ try:
65
+ self.stds[key] = self.data[key].std()
66
+ except Exception:
67
+ pass
68
+ self.state_dim = env.observation_space.shape[0]
69
+ self.action_dim = env.action_space.shape[0]
70
+
71
+ def normalize(self, x_in, key):
72
+ return (x_in - self.means[key]) / self.stds[key]
73
+
74
+ def de_normalize(self, x_in, key):
75
+ return x_in * self.stds[key] + self.means[key]
76
+
77
+ def to_paddle(self, x_in):
78
+ if isinstance(x_in, dict):
79
+ return {k: self.to_paddle(v) for k, v in x_in.items()}
80
+ elif paddle.is_tensor(x_in):
81
+ return x_in
82
+ return paddle.to_tensor(x_in)
83
+
84
+ def reset_x0(self, x_in, cond, act_dim):
85
+ for key, val in cond.items():
86
+ x_in[:, key, act_dim:] = val.clone()
87
+ return x_in
88
+
89
+ def run_diffusion(self, x, conditions, n_guide_steps, scale):
90
+ batch_size = x.shape[0]
91
+ y = None
92
+ for i in self.progress_bar(self.scheduler.timesteps):
93
+ # create batch of timesteps to pass into model
94
+ timesteps = paddle.full((batch_size,), i, dtype=paddle.int64)
95
+ for _ in range(n_guide_steps):
96
+ with paddle.set_grad_enabled(True):
97
+ x.stop_gradient = False
98
+
99
+ # permute to match dimension for pre-trained models
100
+ y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample
101
+ grad = paddle.autograd.grad([y.sum()], [x])[0]
102
+
103
+ posterior_variance = self.scheduler._get_variance(i)
104
+ model_std = paddle.exp(0.5 * posterior_variance)
105
+ grad = model_std * grad
106
+
107
+ grad[timesteps < 2] = 0
108
+ x = x.detach()
109
+ x = x + scale * grad
110
+ x = self.reset_x0(x, conditions, self.action_dim)
111
+
112
+ prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1])
113
+
114
+ # TODO: verify deprecation of this kwarg
115
+ x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
116
+
117
+ # apply conditions to the trajectory (set the initial state)
118
+ x = self.reset_x0(x, conditions, self.action_dim)
119
+ x = self.to_paddle(x)
120
+ return x, y
121
+
122
+ def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
123
+ # normalize the observations and create batch dimension
124
+ obs = self.normalize(obs, "observations")
125
+ obs = obs[None].repeat(batch_size, axis=0)
126
+
127
+ conditions = {0: self.to_paddle(obs)}
128
+ shape = (batch_size, planning_horizon, self.state_dim + self.action_dim)
129
+
130
+ # generate initial noise and apply our conditions (to make the trajectories start at current state)
131
+ x1 = randn_tensor(shape, dtype=self.unet.dtype)
132
+ x = self.reset_x0(x1, conditions, self.action_dim)
133
+ x = self.to_paddle(x)
134
+
135
+ # run the diffusion process
136
+ x, y = self.run_diffusion(x, conditions, n_guide_steps, scale)
137
+
138
+ # sort output trajectories by value
139
+ sorted_idx = paddle.argsort(y, 0, descending=True).squeeze()
140
+ sorted_values = x[sorted_idx]
141
+ actions = sorted_values[:, :, : self.action_dim]
142
+ actions = actions.detach().cpu().numpy()
143
+ denorm_actions = self.de_normalize(actions, key="actions")
144
+
145
+ # select the action with the highest value
146
+ if y is not None:
147
+ selected_index = 0
148
+ else:
149
+ # if we didn't run value guiding, select a random action
150
+ selected_index = np.random.randint(0, batch_size)
151
+
152
+ denorm_actions = denorm_actions[selected_index, 0]
153
+ return denorm_actions
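
A hedged usage sketch of the pipeline above: `__call__` takes a raw environment observation and returns a single de-normalized action, so it slots directly into a rollout loop. The environment id and the pre-loaded components below are placeholders, not artifacts provided by this module.

```python
import gym  # assumption: a D4RL-style environment exposing get_dataset()

from ppdiffusers import DDPMScheduler, UNet1DModel
from ppdiffusers.experimental.rl import ValueGuidedRLPipeline

env = gym.make("hopper-medium-v2")  # hypothetical environment id

# hypothetical checkpoint paths; the pipeline only needs already-instantiated components
value_function = UNet1DModel.from_pretrained("path/to/value_function")
unet = UNet1DModel.from_pretrained("path/to/diffusion_unet")
scheduler = DDPMScheduler()

pipeline = ValueGuidedRLPipeline(value_function=value_function, unet=unet, scheduler=scheduler, env=env)

obs = env.reset()
for _ in range(100):
    # plan a trajectory, guide it with the value function, and return the first action
    action = pipeline(obs, planning_horizon=32, n_guide_steps=2, scale=0.1)
    obs, reward, done, _ = env.step(action)
    if done:
        break
```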
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Tuple
16
+
17
+ import paddle
18
+
19
+ from ppdiffusers.models.animate_anyone.motion_module import zero_module
20
+ from ppdiffusers.models.animate_anyone.resnet import InflatedConv3d
21
+ from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin
22
+
23
+
24
+ class PoseGuider(ModelMixin):
25
+ def __init__(
26
+ self,
27
+ conditioning_embedding_channels: int,
28
+ conditioning_channels: int = 3,
29
+ block_out_channels: Tuple[int] = (16, 32, 64, 128),
30
+ weight_dtype=None,
31
+ ):
32
+ super().__init__()
33
+
34
+ init_contexts = []
35
+ if weight_dtype is not None:
36
+ init_contexts.append(paddle.dtype_guard(weight_dtype))
37
+
38
+ with ContextManagers(init_contexts):
39
+ self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
40
+
41
+ self.blocks = paddle.nn.LayerList(sublayers=[])
42
+
43
+ for i in range(len(block_out_channels) - 1):
44
+ channel_in = block_out_channels[i]
45
+ channel_out = block_out_channels[i + 1]
46
+ self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1))
47
+ self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
48
+
49
+ self.conv_out = zero_module(
50
+ InflatedConv3d(
51
+ block_out_channels[-1],
52
+ conditioning_embedding_channels,
53
+ kernel_size=3,
54
+ padding=1,
55
+ )
56
+ )
57
+
58
+ def forward(self, conditioning):
59
+ embedding = self.conv_in(conditioning)
60
+ embedding = paddle.nn.functional.silu(x=embedding)
61
+
62
+ for block in self.blocks:
63
+ embedding = block(embedding)
64
+ embedding = paddle.nn.functional.silu(x=embedding)
65
+
66
+ embedding = self.conv_out(embedding)
67
+
68
+ return embedding
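
A minimal sketch, assuming the package from this diff is importable, of the shapes `PoseGuider` works with: it consumes a 5D pose video `(batch, channels, frames, height, width)` and emits a feature map whose spatial size is reduced by the three stride-2 blocks. The embedding width of 320 is an example value, not a fixed default.

```python
import paddle

from ppdiffusers.models.animate_anyone.pose_guider import PoseGuider

# defaults: 3 conditioning channels, block_out_channels (16, 32, 64, 128)
guider = PoseGuider(conditioning_embedding_channels=320)

pose = paddle.randn([1, 3, 8, 64, 64])   # one clip: RGB pose maps, 8 frames, 64x64
feature = guider(pose)

# three stride-2 InflatedConv3d blocks divide H and W by 8; conv_out maps to 320 channels
print(feature.shape)                     # [1, 320, 8, 8, 8]
```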
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/resnet.py
16
+
17
+ import paddle
18
+ from einops import rearrange
19
+
20
+
21
+ class InflatedConv3d(paddle.nn.Conv2D):
22
+ def forward(self, x):
23
+ video_length = x.shape[2]
24
+ x = rearrange(x, "b c f h w -> (b f) c h w")
25
+ x = super().forward(x)
26
+
27
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
28
+
29
+ return x
30
+
31
+
32
+ class InflatedGroupNorm(paddle.nn.GroupNorm):
33
+ def forward(self, x):
34
+ video_length = x.shape[2]
35
+
36
+ x = rearrange(x, "b c f h w -> (b f) c h w")
37
+ x = super().forward(x)
38
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
39
+
40
+ return x
41
+
42
+
43
+ class Upsample3D(paddle.nn.Layer):
44
+ def __init__(
45
+ self,
46
+ channels,
47
+ use_conv=False,
48
+ use_conv_transpose=False,
49
+ out_channels=None,
50
+ name="conv",
51
+ ):
52
+ super().__init__()
53
+ self.channels = channels
54
+ self.out_channels = out_channels or channels
55
+ self.use_conv = use_conv
56
+ self.use_conv_transpose = use_conv_transpose
57
+ self.name = name
58
+
59
+ if use_conv_transpose:
60
+ raise NotImplementedError
61
+ elif use_conv:
62
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
63
+
64
+ def forward(self, hidden_states, output_size=None):
65
+ assert hidden_states.shape[1] == self.channels
66
+
67
+ if self.use_conv_transpose:
68
+ raise NotImplementedError
69
+
70
+ # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
71
+ dtype = hidden_states.dtype
72
+ if dtype == "bfloat16":
73
+ hidden_states = hidden_states.to("float32")
74
+
75
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/ppdiffusers/issues/984
76
+ if hidden_states.shape[0] >= 64:
77
+ hidden_states = hidden_states.contiguous()
78
+
79
+ if output_size is None:
80
+ hidden_states = paddle.nn.functional.interpolate(
81
+ x=hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest", data_format="NCDHW"
82
+ )
83
+ else:
84
+ hidden_states = paddle.nn.functional.interpolate(
85
+ x=hidden_states, size=output_size, mode="nearest", data_format="NCDHW"
86
+ )
87
+
88
+ # If the input is bfloat16, we cast back to bfloat16
89
+ if dtype == "bfloat16":
90
+ hidden_states = hidden_states.to(dtype)
91
+
92
+ hidden_states = self.conv(hidden_states)
93
+
94
+ return hidden_states
95
+
96
+
97
+ class Downsample3D(paddle.nn.Layer):
98
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
99
+ super().__init__()
100
+ self.channels = channels
101
+ self.out_channels = out_channels or channels
102
+ self.use_conv = use_conv
103
+ self.padding = padding
104
+ stride = 2
105
+ self.name = name
106
+
107
+ if use_conv:
108
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
109
+ else:
110
+ raise NotImplementedError
111
+
112
+ def forward(self, hidden_states):
113
+ assert hidden_states.shape[1] == self.channels
114
+ if self.use_conv and self.padding == 0:
115
+ raise NotImplementedError
116
+
117
+ assert hidden_states.shape[1] == self.channels
118
+ hidden_states = self.conv(hidden_states)
119
+
120
+ return hidden_states
121
+
122
+
123
+ class ResnetBlock3D(paddle.nn.Layer):
124
+ def __init__(
125
+ self,
126
+ *,
127
+ in_channels,
128
+ out_channels=None,
129
+ conv_shortcut=False,
130
+ dropout=0.0,
131
+ temb_channels=512,
132
+ groups=32,
133
+ groups_out=None,
134
+ pre_norm=True,
135
+ eps=1e-6,
136
+ non_linearity="swish",
137
+ time_embedding_norm="default",
138
+ output_scale_factor=1.0,
139
+ use_in_shortcut=None,
140
+ use_inflated_groupnorm=None,
141
+ ):
142
+ super().__init__()
143
+ self.pre_norm = pre_norm
144
+ self.pre_norm = True
145
+ self.in_channels = in_channels
146
+ out_channels = in_channels if out_channels is None else out_channels
147
+ self.out_channels = out_channels
148
+ self.use_conv_shortcut = conv_shortcut
149
+ self.time_embedding_norm = time_embedding_norm
150
+ self.output_scale_factor = output_scale_factor
151
+
152
+ if groups_out is None:
153
+ groups_out = groups
154
+
155
+ assert use_inflated_groupnorm is not None
156
+ if use_inflated_groupnorm:
157
+ self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps)
158
+ else:
159
+
160
+ self.norm1 = paddle.nn.GroupNorm(
161
+ num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True
162
+ )
163
+
164
+ self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
165
+
166
+ if temb_channels is not None:
167
+ if self.time_embedding_norm == "default":
168
+ time_emb_proj_out_channels = out_channels
169
+ elif self.time_embedding_norm == "scale_shift":
170
+ time_emb_proj_out_channels = out_channels * 2
171
+ else:
172
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
173
+
174
+ self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels)
175
+ else:
176
+ self.time_emb_proj = None
177
+
178
+ if use_inflated_groupnorm:
179
+ self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps)
180
+ else:
181
+ self.norm2 = paddle.nn.GroupNorm(
182
+ num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True
183
+ )
184
+ self.dropout = paddle.nn.Dropout(p=dropout)
185
+ self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
186
+
187
+ if non_linearity == "swish":
188
+ self.nonlinearity = lambda x: paddle.nn.functional.silu(x=x)
189
+ elif non_linearity == "mish":
190
+ self.nonlinearity = Mish()
191
+ elif non_linearity == "silu":
192
+ self.nonlinearity = paddle.nn.Silu()
193
+
194
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
195
+
196
+ self.conv_shortcut = None
197
+ if self.use_in_shortcut:
198
+ self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
199
+
200
+ def forward(self, input_tensor, temb):
201
+ hidden_states = input_tensor
202
+
203
+ hidden_states = self.norm1(hidden_states)
204
+ hidden_states = self.nonlinearity(hidden_states)
205
+
206
+ hidden_states = self.conv1(hidden_states)
207
+
208
+ if temb is not None:
209
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
210
+
211
+ if temb is not None and self.time_embedding_norm == "default":
212
+ hidden_states = hidden_states + temb
213
+
214
+ hidden_states = self.norm2(hidden_states)
215
+
216
+ if temb is not None and self.time_embedding_norm == "scale_shift":
217
+ scale, shift = paddle.chunk(x=temb, chunks=2, axis=1)
218
+ hidden_states = hidden_states * (1 + scale) + shift
219
+
220
+ hidden_states = self.nonlinearity(hidden_states)
221
+
222
+ hidden_states = self.dropout(hidden_states)
223
+ hidden_states = self.conv2(hidden_states)
224
+
225
+ if self.conv_shortcut is not None:
226
+ input_tensor = self.conv_shortcut(input_tensor)
227
+
228
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
229
+
230
+ return output_tensor
231
+
232
+
233
+ class Mish(paddle.nn.Layer):
234
+ def forward(self, hidden_states):
235
+ return hidden_states * paddle.nn.functional.tanh(x=paddle.nn.functional.softplus(x=hidden_states))
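
A minimal sketch of the inflation trick used throughout this file: `InflatedConv3d` folds the frame axis into the batch, applies an ordinary 2D convolution, and unfolds again, so every frame is convolved independently with shared weights.

```python
import paddle

from ppdiffusers.models.animate_anyone.resnet import InflatedConv3d

conv = InflatedConv3d(4, 8, kernel_size=3, padding=1)   # constructed like a plain paddle.nn.Conv2D

video = paddle.randn([2, 4, 16, 32, 32])                # (batch, channels, frames, height, width)
out = conv(video)

print(out.shape)  # [2, 8, 16, 32, 32]: per-frame 2D convolution, no temporal mixing
```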
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Optional
17
+
18
+ import paddle
19
+ from einops import rearrange, repeat
20
+
21
+ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from ppdiffusers.models import ModelMixin
23
+ from ppdiffusers.utils import BaseOutput
24
+
25
+ from .attention import TemporalBasicTransformerBlock
26
+
27
+
28
+ @dataclass
29
+ class Transformer3DModelOutput(BaseOutput):
30
+ sample: paddle.Tensor
31
+
32
+
33
+ class Transformer3DModel(ModelMixin, ConfigMixin):
34
+ _supports_gradient_checkpointing = True
35
+
36
+ @register_to_config
37
+ def __init__(
38
+ self,
39
+ num_attention_heads: int = 16,
40
+ attention_head_dim: int = 88,
41
+ in_channels: Optional[int] = None,
42
+ num_layers: int = 1,
43
+ dropout: float = 0.0,
44
+ norm_num_groups: int = 32,
45
+ cross_attention_dim: Optional[int] = None,
46
+ attention_bias: bool = False,
47
+ activation_fn: str = "geglu",
48
+ num_embeds_ada_norm: Optional[int] = None,
49
+ use_linear_projection: bool = False,
50
+ only_cross_attention: bool = False,
51
+ upcast_attention: bool = False,
52
+ unet_use_cross_frame_attention=None,
53
+ unet_use_temporal_attention=None,
54
+ ):
55
+ super().__init__()
56
+ self.use_linear_projection = use_linear_projection
57
+ self.num_attention_heads = num_attention_heads
58
+ self.attention_head_dim = attention_head_dim
59
+ inner_dim = num_attention_heads * attention_head_dim
60
+
61
+ # Define input layers
62
+ self.in_channels = in_channels
63
+
64
+ self.norm = paddle.nn.GroupNorm(
65
+ num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06, weight_attr=True, bias_attr=True
66
+ )
67
+ if use_linear_projection:
68
+ self.proj_in = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)
69
+ else:
70
+ self.proj_in = paddle.nn.Conv2D(
71
+ in_channels=in_channels, out_channels=inner_dim, kernel_size=1, stride=1, padding=0
72
+ )
73
+ self.transformer_blocks = paddle.nn.LayerList(
74
+ sublayers=[
75
+ TemporalBasicTransformerBlock(
76
+ inner_dim,
77
+ num_attention_heads,
78
+ attention_head_dim,
79
+ dropout=dropout,
80
+ cross_attention_dim=cross_attention_dim,
81
+ activation_fn=activation_fn,
82
+ num_embeds_ada_norm=num_embeds_ada_norm,
83
+ attention_bias=attention_bias,
84
+ only_cross_attention=only_cross_attention,
85
+ upcast_attention=upcast_attention,
86
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
87
+ unet_use_temporal_attention=unet_use_temporal_attention,
88
+ )
89
+ for d in range(num_layers)
90
+ ]
91
+ )
92
+ if use_linear_projection:
93
+ self.proj_out = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)
94
+ else:
95
+ self.proj_out = paddle.nn.Conv2D(
96
+ in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, padding=0
97
+ )
98
+
99
+ self.gradient_checkpointing = False
100
+
101
+ def _set_gradient_checkpointing(self, module, value=False):
102
+ if hasattr(module, "gradient_checkpointing"):
103
+ module.gradient_checkpointing = value
104
+
105
+ def forward(
106
+ self,
107
+ hidden_states,
108
+ encoder_hidden_states=None,
109
+ timestep=None,
110
+ return_dict: bool = True,
111
+ ):
112
+ # Input
113
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
114
+ video_length = hidden_states.shape[2]
115
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
116
+ if encoder_hidden_states.shape[0] != hidden_states.shape[0]:
117
+ encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)
118
+
119
+ batch, channel, height, weight = hidden_states.shape
120
+ residual = hidden_states
121
+
122
+ hidden_states = self.norm(hidden_states)
123
+ if not self.use_linear_projection:
124
+ hidden_states = self.proj_in(hidden_states)
125
+ inner_dim = hidden_states.shape[1]
126
+ hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim))
127
+ else:
128
+ inner_dim = hidden_states.shape[1]
129
+ hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim))
130
+ hidden_states = self.proj_in(hidden_states)
131
+
132
+ # Blocks
133
+ for i, block in enumerate(self.transformer_blocks):
134
+ hidden_states = block(
135
+ hidden_states,
136
+ encoder_hidden_states=encoder_hidden_states,
137
+ timestep=timestep,
138
+ video_length=video_length,
139
+ )
140
+
141
+ # Output
142
+ if not self.use_linear_projection:
143
+ hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2])
144
+ hidden_states = self.proj_out(hidden_states)
145
+ else:
146
+ hidden_states = self.proj_out(hidden_states)
147
+ hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2])
148
+
149
+ output = hidden_states + residual
150
+
151
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
152
+ if not return_dict:
153
+ return (output,)
154
+
155
+ return Transformer3DModelOutput(sample=output)
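
A minimal sketch of the rearrange/repeat pattern `Transformer3DModel.forward` relies on: frames are folded into the batch so the 2D transformer blocks run per frame, and the text conditioning is repeated once per frame to match. The tensor sizes are illustrative.

```python
import paddle
from einops import rearrange, repeat

video_length = 8
hidden_states = paddle.randn([2, 320, video_length, 32, 32])   # b c f h w
encoder_hidden_states = paddle.randn([2, 77, 768])              # one text embedding per video

flat = rearrange(hidden_states, "b c f h w -> (b f) c h w")                    # frames into the batch
context = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)  # one copy per frame
print(flat.shape, context.shape)        # [16, 320, 32, 32] [16, 77, 768]

restored = rearrange(flat, "(b f) c h w -> b c f h w", f=video_length)
print(restored.shape)                   # back to [2, 320, 8, 32, 32]
```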
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet_blocks.py
16
+
17
+ from dataclasses import dataclass
18
+ from os import PathLike
19
+ from pathlib import Path
20
+ from typing import Dict, List, Optional, Tuple, Union
21
+
22
+ import paddle
23
+
24
+ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from ppdiffusers.models.attention_processor import AttentionProcessor
26
+ from ppdiffusers.models.embeddings import TimestepEmbedding, Timesteps
27
+ from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin
28
+ from ppdiffusers.utils import BaseOutput, logging
29
+
30
+ from .resnet import InflatedConv3d, InflatedGroupNorm
31
+ from .unet_3d_blocks import UNetMidBlock3DCrossAttn, get_down_block, get_up_block
32
+
33
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
34
+
35
+
36
+ @dataclass
37
+ class UNet3DConditionOutput(BaseOutput):
38
+ sample: paddle.Tensor
39
+
40
+
41
+ class UNet3DConditionModel(ModelMixin, ConfigMixin):
42
+ _supports_gradient_checkpointing = True
43
+
44
+ @register_to_config
45
+ def __init__(
46
+ self,
47
+ sample_size: Optional[int] = None,
48
+ in_channels: int = 4,
49
+ out_channels: int = 4,
50
+ center_input_sample: bool = False,
51
+ flip_sin_to_cos: bool = True,
52
+ freq_shift: int = 0,
53
+ down_block_types: Tuple[str] = (
54
+ "CrossAttnDownBlock3D",
55
+ "CrossAttnDownBlock3D",
56
+ "CrossAttnDownBlock3D",
57
+ "DownBlock3D",
58
+ ),
59
+ mid_block_type: str = "UNetMidBlock3DCrossAttn",
60
+ up_block_types: Tuple[str] = (
61
+ "UpBlock3D",
62
+ "CrossAttnUpBlock3D",
63
+ "CrossAttnUpBlock3D",
64
+ "CrossAttnUpBlock3D",
65
+ ),
66
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
67
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
68
+ layers_per_block: int = 2,
69
+ downsample_padding: int = 1,
70
+ mid_block_scale_factor: float = 1,
71
+ act_fn: str = "silu",
72
+ norm_num_groups: int = 32,
73
+ norm_eps: float = 1e-5,
74
+ cross_attention_dim: int = 1280,
75
+ attention_head_dim: Union[int, Tuple[int]] = 8,
76
+ dual_cross_attention: bool = False,
77
+ use_linear_projection: bool = False,
78
+ class_embed_type: Optional[str] = None,
79
+ num_class_embeds: Optional[int] = None,
80
+ upcast_attention: bool = False,
81
+ resnet_time_scale_shift: str = "default",
82
+ use_inflated_groupnorm=False,
83
+ # Additional
84
+ use_motion_module=False,
85
+ motion_module_resolutions=(1, 2, 4, 8),
86
+ motion_module_mid_block=False,
87
+ motion_module_decoder_only=False,
88
+ motion_module_type=None,
89
+ motion_module_kwargs={},
90
+ unet_use_cross_frame_attention=None,
91
+ unet_use_temporal_attention=None,
92
+ ):
93
+ super().__init__()
94
+
95
+ self.sample_size = sample_size
96
+ time_embed_dim = block_out_channels[0] * 4
97
+
98
+ # input
99
+ self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
100
+
101
+ # time
102
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
103
+ timestep_input_dim = block_out_channels[0]
104
+
105
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
106
+
107
+ # class embedding
108
+ if class_embed_type is None and num_class_embeds is not None:
109
+ self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
110
+ elif class_embed_type == "timestep":
111
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
112
+ elif class_embed_type == "identity":
113
+ self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim)
114
+ else:
115
+ self.class_embedding = None
116
+
117
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
118
+ self.mid_block = None
119
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
120
+
121
+ if isinstance(only_cross_attention, bool):
122
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
123
+
124
+ if isinstance(attention_head_dim, int):
125
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
126
+
127
+ # down
128
+ output_channel = block_out_channels[0]
129
+ for i, down_block_type in enumerate(down_block_types):
130
+ res = 2**i
131
+ input_channel = output_channel
132
+ output_channel = block_out_channels[i]
133
+ is_final_block = i == len(block_out_channels) - 1
134
+
135
+ down_block = get_down_block(
136
+ down_block_type,
137
+ num_layers=layers_per_block,
138
+ in_channels=input_channel,
139
+ out_channels=output_channel,
140
+ temb_channels=time_embed_dim,
141
+ add_downsample=not is_final_block,
142
+ resnet_eps=norm_eps,
143
+ resnet_act_fn=act_fn,
144
+ resnet_groups=norm_num_groups,
145
+ cross_attention_dim=cross_attention_dim,
146
+ attn_num_head_channels=attention_head_dim[i],
147
+ downsample_padding=downsample_padding,
148
+ dual_cross_attention=dual_cross_attention,
149
+ use_linear_projection=use_linear_projection,
150
+ only_cross_attention=only_cross_attention[i],
151
+ upcast_attention=upcast_attention,
152
+ resnet_time_scale_shift=resnet_time_scale_shift,
153
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
154
+ unet_use_temporal_attention=unet_use_temporal_attention,
155
+ use_inflated_groupnorm=use_inflated_groupnorm,
156
+ use_motion_module=use_motion_module
157
+ and (res in motion_module_resolutions)
158
+ and (not motion_module_decoder_only),
159
+ motion_module_type=motion_module_type,
160
+ motion_module_kwargs=motion_module_kwargs,
161
+ )
162
+ self.down_blocks.append(down_block)
163
+
164
+ # mid
165
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
166
+ self.mid_block = UNetMidBlock3DCrossAttn(
167
+ in_channels=block_out_channels[-1],
168
+ temb_channels=time_embed_dim,
169
+ resnet_eps=norm_eps,
170
+ resnet_act_fn=act_fn,
171
+ output_scale_factor=mid_block_scale_factor,
172
+ resnet_time_scale_shift=resnet_time_scale_shift,
173
+ cross_attention_dim=cross_attention_dim,
174
+ attn_num_head_channels=attention_head_dim[-1],
175
+ resnet_groups=norm_num_groups,
176
+ dual_cross_attention=dual_cross_attention,
177
+ use_linear_projection=use_linear_projection,
178
+ upcast_attention=upcast_attention,
179
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
180
+ unet_use_temporal_attention=unet_use_temporal_attention,
181
+ use_inflated_groupnorm=use_inflated_groupnorm,
182
+ use_motion_module=use_motion_module and motion_module_mid_block,
183
+ motion_module_type=motion_module_type,
184
+ motion_module_kwargs=motion_module_kwargs,
185
+ )
186
+ else:
187
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
188
+
189
+ # count how many layers upsample the videos
190
+ self.num_upsamplers = 0
191
+
192
+ # up
193
+ reversed_block_out_channels = list(reversed(block_out_channels))
194
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
195
+ only_cross_attention = list(reversed(only_cross_attention))
196
+ output_channel = reversed_block_out_channels[0]
197
+ for i, up_block_type in enumerate(up_block_types):
198
+ res = 2 ** (3 - i)
199
+ is_final_block = i == len(block_out_channels) - 1
200
+
201
+ prev_output_channel = output_channel
202
+ output_channel = reversed_block_out_channels[i]
203
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
204
+
205
+ # add upsample block for all BUT final layer
206
+ if not is_final_block:
207
+ add_upsample = True
208
+ self.num_upsamplers += 1
209
+ else:
210
+ add_upsample = False
211
+
212
+ up_block = get_up_block(
213
+ up_block_type,
214
+ num_layers=layers_per_block + 1,
215
+ in_channels=input_channel,
216
+ out_channels=output_channel,
217
+ prev_output_channel=prev_output_channel,
218
+ temb_channels=time_embed_dim,
219
+ add_upsample=add_upsample,
220
+ resnet_eps=norm_eps,
221
+ resnet_act_fn=act_fn,
222
+ resnet_groups=norm_num_groups,
223
+ cross_attention_dim=cross_attention_dim,
224
+ attn_num_head_channels=reversed_attention_head_dim[i],
225
+ dual_cross_attention=dual_cross_attention,
226
+ use_linear_projection=use_linear_projection,
227
+ only_cross_attention=only_cross_attention[i],
228
+ upcast_attention=upcast_attention,
229
+ resnet_time_scale_shift=resnet_time_scale_shift,
230
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
231
+ unet_use_temporal_attention=unet_use_temporal_attention,
232
+ use_inflated_groupnorm=use_inflated_groupnorm,
233
+ use_motion_module=use_motion_module and (res in motion_module_resolutions),
234
+ motion_module_type=motion_module_type,
235
+ motion_module_kwargs=motion_module_kwargs,
236
+ )
237
+ self.up_blocks.append(up_block)
238
+ prev_output_channel = output_channel
239
+
240
+ # out
241
+ if use_inflated_groupnorm:
242
+ self.conv_norm_out = InflatedGroupNorm(
243
+ num_channels=block_out_channels[0],
244
+ num_groups=norm_num_groups,
245
+ epsilon=norm_eps,
246
+ )
247
+ else:
248
+
249
+ self.conv_norm_out = paddle.nn.GroupNorm(
250
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
251
+ )
252
+ self.conv_act = paddle.nn.Silu()
253
+ self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
254
+
255
+ @property
256
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
257
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
258
+ r"""
259
+ Returns:
260
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
261
+ indexed by its weight name.
262
+ """
263
+ # set recursively
264
+ processors = {}
265
+
266
+ def fn_recursive_add_processors(
267
+ name: str,
268
+ module: paddle.nn.Layer,
269
+ processors: Dict[str, AttentionProcessor],
270
+ ):
271
+ if hasattr(module, "set_processor"):
272
+ processors[f"{name}.processor"] = module.processor
273
+
274
+ for sub_name, child in module.named_children():
275
+ if "temporal_transformer" not in sub_name:
276
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
277
+
278
+ return processors
279
+
280
+ for name, module in self.named_children():
281
+ if "temporal_transformer" not in name:
282
+ fn_recursive_add_processors(name, module, processors)
283
+
284
+ return processors
285
+
286
+ def set_attention_slice(self, slice_size):
287
+ r"""
288
+ Enable sliced attention computation.
289
+
290
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
291
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
292
+
293
+ Args:
294
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
295
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
296
+ `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
297
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
298
+ must be a multiple of `slice_size`.
299
+ """
300
+ sliceable_head_dims = []
301
+
302
+ def fn_recursive_retrieve_slicable_dims(module: paddle.nn.Layer):
303
+ if hasattr(module, "set_attention_slice"):
304
+ sliceable_head_dims.append(module.sliceable_head_dim)
305
+
306
+ for child in module.children():
307
+ fn_recursive_retrieve_slicable_dims(child)
308
+
309
+ # retrieve number of attention layers
310
+ for module in self.children():
311
+ fn_recursive_retrieve_slicable_dims(module)
312
+
313
+ num_slicable_layers = len(sliceable_head_dims)
314
+
315
+ if slice_size == "auto":
316
+ # half the attention head size is usually a good trade-off between
317
+ # speed and memory
318
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
319
+ elif slice_size == "max":
320
+ # make smallest slice possible
321
+ slice_size = num_slicable_layers * [1]
322
+
323
+ slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
324
+
325
+ if len(slice_size) != len(sliceable_head_dims):
326
+ raise ValueError(
327
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
328
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
329
+ )
330
+
331
+ for i in range(len(slice_size)):
332
+ size = slice_size[i]
333
+ dim = sliceable_head_dims[i]
334
+ if size is not None and size > dim:
335
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
336
+
337
+ # Recursively walk through all the children.
338
+ # Any children which exposes the set_attention_slice method
339
+ # gets the message
340
+ def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]):
341
+ if hasattr(module, "set_attention_slice"):
342
+ module.set_attention_slice(slice_size.pop())
343
+
344
+ for child in module.children():
345
+ fn_recursive_set_attention_slice(child, slice_size)
346
+
347
+ reversed_slice_size = list(reversed(slice_size))
348
+ for module in self.children():
349
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
350
+
351
+ def _set_gradient_checkpointing(self, module, value=False):
352
+ if hasattr(module, "gradient_checkpointing"):
353
+ module.gradient_checkpointing = value
354
+
355
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
356
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
357
+ r"""
358
+ Sets the attention processor to use to compute attention.
359
+
360
+ Parameters:
361
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
362
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
363
+ for **all** `Attention` layers.
364
+
365
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
366
+ processor. This is strongly recommended when setting trainable attention processors.
367
+
368
+ """
369
+ count = len(self.attn_processors.keys())
370
+
371
+ if isinstance(processor, dict) and len(processor) != count:
372
+ raise ValueError(
373
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
374
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
375
+ )
376
+
377
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
378
+ if hasattr(module, "set_processor"):
379
+ if not isinstance(processor, dict):
380
+ module.set_processor(processor)
381
+ else:
382
+ module.set_processor(processor.pop(f"{name}.processor"))
383
+
384
+ for sub_name, child in module.named_children():
385
+ if "temporal_transformer" not in sub_name:
386
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
387
+
388
+ for name, module in self.named_children():
389
+ if "temporal_transformer" not in name:
390
+ fn_recursive_attn_processor(name, module, processor)
391
+
392
+ def forward(
393
+ self,
394
+ sample: paddle.Tensor,
395
+ timestep: Union[paddle.Tensor, float, int],
396
+ encoder_hidden_states: paddle.Tensor,
397
+ class_labels: Optional[paddle.Tensor] = None,
398
+ pose_cond_fea: Optional[paddle.Tensor] = None,
399
+ attention_mask: Optional[paddle.Tensor] = None,
400
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
401
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
402
+ return_dict: bool = True,
403
+ ) -> Union[UNet3DConditionOutput, Tuple]:
404
+ r"""
405
+ Args:
406
+ sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor
407
+ timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps
408
+ encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states
409
+ return_dict (`bool`, *optional*, defaults to `True`):
410
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
411
+
412
+ Returns:
413
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
414
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
415
+ returning a tuple, the first element is the sample tensor.
416
+ """
417
+ # By default samples have to be at least a multiple of the overall upsampling factor.
418
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
419
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
420
+ # on the fly if necessary.
421
+ default_overall_up_factor = 2**self.num_upsamplers
422
+
423
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
424
+ forward_upsample_size = False
425
+ upsample_size = None
426
+
427
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
428
+ logger.info("Forward upsample size to force interpolation output size.")
429
+ forward_upsample_size = True
430
+
431
+ # prepare attention_mask
432
+ if attention_mask is not None:
433
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
434
+ attention_mask = attention_mask.unsqueeze(1)
435
+
436
+ # center input if necessary
437
+ if self.config.center_input_sample:
438
+ sample = 2 * sample - 1.0
439
+
440
+ # time
441
+ timesteps = timestep
442
+ if not paddle.is_tensor(timesteps):
443
+ # This would be a good case for the `match` statement (Python 3.10+)
444
+ # Paddle tensors have no torch-style `device.type`, so the MPS special case does not apply here.
445
+ if isinstance(timestep, float):
446
+ dtype = "float64"
447
+ else:
448
+ dtype = "int64"
449
+ timesteps = paddle.to_tensor([timesteps], dtype=dtype)
450
+ elif len(timesteps.shape) == 0:
451
+ timesteps = timesteps[None]
452
+
453
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
454
+ timesteps = timesteps.expand(sample.shape[0])
455
+
456
+ t_emb = self.time_proj(timesteps)
457
+
458
+ # timesteps does not contain any weights and will always return f32 tensors
459
+ # but time_embedding might actually be running in fp16. so we need to cast here.
460
+ # there might be better ways to encapsulate this.
461
+ t_emb = t_emb.to(dtype=self.dtype)
462
+ emb = self.time_embedding(t_emb)
463
+
464
+ if self.class_embedding is not None:
465
+ if class_labels is None:
466
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
467
+
468
+ if self.config.class_embed_type == "timestep":
469
+ class_labels = self.time_proj(class_labels)
470
+
471
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
472
+ emb = emb + class_emb
473
+
474
+ # pre-process
475
+
476
+         sample = self.conv_in(sample)
+
+         if pose_cond_fea is not None:
+             sample = sample + pose_cond_fea
+
+         # down
+         down_block_res_samples = (sample,)
+         for downsample_block in self.down_blocks:
+             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+
+                 sample, res_samples = downsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     encoder_hidden_states=encoder_hidden_states,
+                     attention_mask=attention_mask,
+                 )
+
+             else:
+                 sample, res_samples = downsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     encoder_hidden_states=encoder_hidden_states,
+                 )
+
+             down_block_res_samples += res_samples
+
+         if down_block_additional_residuals is not None:
+             new_down_block_res_samples = ()
+
+             for down_block_res_sample, down_block_additional_residual in zip(
+                 down_block_res_samples, down_block_additional_residuals
+             ):
+                 down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                 new_down_block_res_samples += (down_block_res_sample,)
+
+             down_block_res_samples = new_down_block_res_samples
+
+         # mid
+         sample = self.mid_block(
+             sample,
+             emb,
+             encoder_hidden_states=encoder_hidden_states,
+             attention_mask=attention_mask,
+         )
+
+         if mid_block_additional_residual is not None:
+             sample = sample + mid_block_additional_residual
+
+         # up
+         for i, upsample_block in enumerate(self.up_blocks):
+             is_final_block = i == len(self.up_blocks) - 1
+
+             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+             # if we have not reached the final block and need to forward the
+             # upsample size, we do it here
+             if not is_final_block and forward_upsample_size:
+                 upsample_size = down_block_res_samples[-1].shape[2:]
+
+             if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                 sample = upsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     res_hidden_states_tuple=res_samples,
+                     encoder_hidden_states=encoder_hidden_states,
+                     upsample_size=upsample_size,
+                     attention_mask=attention_mask,
+                 )
+             else:
+                 sample = upsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     res_hidden_states_tuple=res_samples,
+                     upsample_size=upsample_size,
+                     encoder_hidden_states=encoder_hidden_states,
+                 )
+
+         # post-process
+         sample = self.conv_norm_out(sample)
+         sample = self.conv_act(sample)
+         sample = self.conv_out(sample)
+
+         if not return_dict:
+             return (sample,)
+
+         return UNet3DConditionOutput(sample=sample)
+
+     @classmethod
+     def from_pretrained_2d(
+         cls,
+         denoising_unet_config_path: Optional[Union[str, PathLike]],
+         base_model_path: Optional[Union[str, PathLike]] = None,
+         motion_module_path: Optional[Union[str, PathLike]] = None,
+         weight_dtype=None,
+         unet_additional_kwargs=None,
+     ):
+
+         config_file = denoising_unet_config_path
+         if not (Path(config_file).exists() and Path(config_file).is_file()):
+             raise RuntimeError(f"{config_file} does not exist or is not a file")
+
+         unet_config = cls.load_config(config_file)
+         unet_config["_class_name"] = cls.__name__
+         unet_config["down_block_types"] = [
+             "CrossAttnDownBlock3D",
+             "CrossAttnDownBlock3D",
+             "CrossAttnDownBlock3D",
+             "DownBlock3D",
+         ]
+         unet_config["up_block_types"] = [
+             "UpBlock3D",
+             "CrossAttnUpBlock3D",
+             "CrossAttnUpBlock3D",
+             "CrossAttnUpBlock3D",
+         ]
+         unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
+
+         init_contexts = []
+         if weight_dtype is not None:
+             init_contexts.append(paddle.dtype_guard(weight_dtype))
+
+         with ContextManagers(init_contexts):
+             model = cls.from_config(unet_config, **unet_additional_kwargs)
+
+         state_dict = paddle.load(base_model_path)
+
+         # motion module updating
+         if motion_module_path is not None:
+             motion_state_dict = paddle.load(motion_module_path)
+             state_dict.update(motion_state_dict)
+
+         if weight_dtype is not None:
+             for k in state_dict.keys():
+                 state_dict[k] = state_dict[k].astype(weight_dtype)
+
+         m, u = model.set_state_dict(state_dict)
+         print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
+
+         return model
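For orientation, a minimal usage sketch of `from_pretrained_2d` as defined above. It assumes the enclosing class is this module's `UNet3DConditionModel` and that the module path mirrors this file's location in the diff; the checkpoint paths are hypothetical placeholders, and an empty dict is passed for `unet_additional_kwargs` because the method expands that argument with `**`.

from ppdiffusers.models.animate_anyone.unet_3d import UNet3DConditionModel  # module path assumed from this diff

# Hypothetical local files; replace with real AnimateAnyone assets.
denoising_unet = UNet3DConditionModel.from_pretrained_2d(
    denoising_unet_config_path="./pretrained_weights/denoising_unet/config.json",
    base_model_path="./pretrained_weights/denoising_unet.pdparams",    # 2D UNet weights
    motion_module_path="./pretrained_weights/motion_module.pdparams",  # merged on top of the 2D weights
    unet_additional_kwargs={},  # must be dict-like, since it is expanded with **
)
denoising_unet.eval()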
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py ADDED
@@ -0,0 +1,28 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass
+ from typing import Union
+
+ import numpy as np
+ import paddle
+
+ import ppdiffusers
+
+ from .unet import UNet3DConditionModel  # noqa: *
+
+
+ @dataclass
+ class HotshotPipelineXLOutput(ppdiffusers.utils.BaseOutput):
+     videos: Union[paddle.Tensor, np.ndarray]
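A short sketch of the output container defined above; the video tensor shape is only an illustrative assumption.

import paddle

from ppdiffusers.models.hotshot_xl import HotshotPipelineXLOutput  # module path assumed from this diff

dummy_videos = paddle.rand([1, 3, 8, 64, 64])  # assumed (batch, channels, frames, height, width)
out = HotshotPipelineXLOutput(videos=dummy_videos)
print(type(out.videos), out.videos.shape)  # BaseOutput also supports dict-style access: out["videos"]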
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py ADDED
@@ -0,0 +1,124 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import paddle
+ from einops import rearrange
+
+ import ppdiffusers
+ from ppdiffusers.models import resnet
+
+
+ class Upsample3D(resnet.Upsample2D):
+     def forward(self, hidden_states, output_size=None, scale: float = 1.0):
+         f = tuple(hidden_states.shape)[2]
+         hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+         hidden_states = super(Upsample3D, self).forward(hidden_states, output_size, scale)
+         return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+ class Downsample3D(ppdiffusers.models.resnet.Downsample2D):
+     def forward(self, hidden_states, scale: float = 1.0):
+         f = tuple(hidden_states.shape)[2]
+         hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+         hidden_states = super(Downsample3D, self).forward(hidden_states, scale)
+         return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+ class Conv3d(ppdiffusers.models.resnet.LoRACompatibleConv):
+     def forward(self, hidden_states, scale: float = 1.0):
+         f = tuple(hidden_states.shape)[2]
+         hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+         hidden_states = super().forward(hidden_states, scale)
+         return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+ class ResnetBlock3D(paddle.nn.Layer):
+     def __init__(
+         self,
+         *,
+         in_channels,
+         out_channels=None,
+         conv_shortcut=False,
+         dropout=0.0,
+         temb_channels=512,
+         groups=32,
+         groups_out=None,
+         pre_norm=True,
+         eps=1e-06,
+         non_linearity="silu",
+         time_embedding_norm="default",
+         output_scale_factor=1.0,
+         use_in_shortcut=None,
+         conv_shortcut_bias: bool = True
+     ):
+         super().__init__()
+         self.pre_norm = pre_norm
+         self.pre_norm = True
+         self.in_channels = in_channels
+         out_channels = in_channels if out_channels is None else out_channels
+         self.out_channels = out_channels
+         self.use_conv_shortcut = conv_shortcut
+         self.time_embedding_norm = time_embedding_norm
+         self.output_scale_factor = output_scale_factor
+         if groups_out is None:
+             groups_out = groups
+         self.norm1 = paddle.nn.GroupNorm(
+             num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True
+         )
+         self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+         if temb_channels is not None:
+             if self.time_embedding_norm == "default":
+                 time_emb_proj_out_channels = out_channels
+             elif self.time_embedding_norm == "scale_shift":
+                 time_emb_proj_out_channels = out_channels * 2
+             else:
+                 raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+             self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels)
+         else:
+             self.time_emb_proj = None
+         self.norm2 = paddle.nn.GroupNorm(
+             num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True
+         )
+         self.dropout = paddle.nn.Dropout(p=dropout)
+         self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+         assert non_linearity == "silu"
+         self.nonlinearity = paddle.nn.Silu()
+         self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
+         self.conv_shortcut = None
+         if self.use_in_shortcut:
+             self.conv_shortcut = Conv3d(
+                 in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias
+             )
+
+     def forward(self, input_tensor, temb):
+         hidden_states = input_tensor
+         hidden_states = self.norm1(hidden_states)
+         hidden_states = self.nonlinearity(hidden_states)
+         hidden_states = self.conv1(hidden_states)
+         if temb is not None:
+             temb = self.nonlinearity(temb)
+             temb = self.time_emb_proj(temb)[:, :, None, None, None]
+         if temb is not None and self.time_embedding_norm == "default":
+             hidden_states = hidden_states + temb
+         hidden_states = self.norm2(hidden_states)
+         if temb is not None and self.time_embedding_norm == "scale_shift":
+             scale, shift = paddle.chunk(x=temb, chunks=2, axis=1)
+             hidden_states = hidden_states * (1 + scale) + shift
+         hidden_states = self.nonlinearity(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         hidden_states = self.conv2(hidden_states)
+         if self.conv_shortcut is not None:
+             input_tensor = self.conv_shortcut(input_tensor)
+         output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+         return output_tensor
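All of the pseudo-3D layers above rely on the same trick: fold the frame axis into the batch axis, run the inherited 2D operator, then unfold. A standalone sketch of that pattern with a plain `Conv2D` (sizes are illustrative, and it assumes einops with the Paddle backend, as used throughout these files):

import paddle
from einops import rearrange

b, c, f, h, w = 2, 8, 16, 32, 32      # illustrative sizes
video = paddle.rand([b, c, f, h, w])  # (batch, channels, frames, height, width)

conv2d = paddle.nn.Conv2D(in_channels=c, out_channels=c, kernel_size=3, padding=1)

x = rearrange(video, "b c f h w -> (b f) c h w")   # frames become extra batch items
x = conv2d(x)                                      # any 2D layer can be reused unchanged
x = rearrange(x, "(b f) c h w -> b c f h w", f=f)  # restore the frame axis
print(x.shape)  # [2, 8, 16, 32, 32]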
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py ADDED
@@ -0,0 +1,77 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, Optional
+
+ import paddle
+ from einops import rearrange, repeat
+
+ import ppdiffusers
+
+
+ @dataclass
+ class Transformer3DModelOutput(ppdiffusers.utils.BaseOutput):
+     """
+     The output of [`Transformer3DModel`].
+
+     Args:
+         sample (`paddle.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+             The hidden states output conditioned on the `encoder_hidden_states` input.
+     """
+
+     sample: paddle.Tensor
+
+
+ class Transformer3DModel(ppdiffusers.models.transformer_2d.Transformer2DModel):
+     def __init__(self, *args, **kwargs):
+         super(Transformer3DModel, self).__init__(*args, **kwargs)
+         init_Constant = paddle.nn.initializer.Constant(value=0.0)
+         init_Constant(self.proj_out.weight.data)
+         init_Constant = paddle.nn.initializer.Constant(value=0.0)
+         init_Constant(self.proj_out.bias.data)
+
+     def forward(
+         self,
+         hidden_states: paddle.Tensor,
+         encoder_hidden_states: Optional[paddle.Tensor] = None,
+         timestep: Optional[int] = None,
+         class_labels: Optional[int] = None,
+         cross_attention_kwargs: Dict[str, Any] = None,
+         attention_mask: Optional[paddle.Tensor] = None,
+         encoder_attention_mask: Optional[paddle.Tensor] = None,
+         enable_temporal_layers: bool = True,
+         positional_embedding: Optional[paddle.Tensor] = None,
+         return_dict: bool = True,
+     ):
+         is_video = len(tuple(hidden_states.shape)) == 5
+         if is_video:
+             f = tuple(hidden_states.shape)[2]
+             hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+             encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=f)
+         hidden_states = super(Transformer3DModel, self).forward(
+             hidden_states,
+             encoder_hidden_states,
+             timestep,
+             class_labels,
+             cross_attention_kwargs,
+             attention_mask,
+             encoder_attention_mask,
+             return_dict=False,
+         )[0]
+         if is_video:
+             hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+         if not return_dict:
+             return (hidden_states,)
+         return Transformer3DModelOutput(sample=hidden_states)
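Zero-initialising a projection that feeds a residual connection, as the constructor above does for `proj_out`, is a common trick for newly added layers: the block starts out as an identity map, so behaviour is unchanged until the new weights are trained or overwritten by a loaded checkpoint. A toy sketch of the idea (names and sizes are illustrative):

import paddle


class ZeroInitResidual(paddle.nn.Layer):
    """Toy residual block whose learned branch starts as a no-op."""

    def __init__(self, dim: int):
        super().__init__()
        self.proj = paddle.nn.Linear(dim, dim)
        # Zero the projection so that forward(x) == x at initialisation.
        self.proj.weight.set_value(paddle.zeros_like(self.proj.weight))
        self.proj.bias.set_value(paddle.zeros_like(self.proj.bias))

    def forward(self, x):
        return x + self.proj(x)


x = paddle.rand([4, 32])
block = ZeroInitResidual(32)
print(bool(paddle.allclose(block(x), x)))  # True until proj is trained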
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py ADDED
@@ -0,0 +1,778 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+
21
+ import ppdiffusers
22
+ from ppdiffusers import loaders, transformers # noqa: *
23
+
24
+ from .resnet import Conv3d
25
+ from .unet_blocks import (
26
+ CrossAttnDownBlock3D,
27
+ CrossAttnUpBlock3D,
28
+ DownBlock3D,
29
+ UNetMidBlock3DCrossAttn,
30
+ UpBlock3D,
31
+ get_down_block,
32
+ get_up_block,
33
+ )
34
+
35
+ logger = ppdiffusers.utils.logging.get_logger(__name__)
36
+
37
+
38
+ @dataclass
39
+ class UNet3DConditionOutput(ppdiffusers.utils.BaseOutput):
40
+ """
41
+ The output of [`UNet3DConditionModel`].
42
+
43
+ Args:
44
+ sample (`paddle.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
45
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
46
+ """
47
+
48
+ sample: paddle.Tensor = None
49
+
50
+
51
+ class UNet3DConditionModel(
52
+ ppdiffusers.models.modeling_utils.ModelMixin,
53
+ ppdiffusers.configuration_utils.ConfigMixin,
54
+ loaders.UNet2DConditionLoadersMixin,
55
+ ):
56
+ _supports_gradient_checkpointing = True
57
+
58
+ @ppdiffusers.configuration_utils.register_to_config
59
+ def __init__(
60
+ self,
61
+ sample_size: Optional[int] = None,
62
+ in_channels: int = 4,
63
+ out_channels: int = 4,
64
+ center_input_sample: bool = False,
65
+ flip_sin_to_cos: bool = True,
66
+ freq_shift: int = 0,
67
+ down_block_types: Tuple[str] = ("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
68
+ mid_block_type: Optional[str] = "UNetMidBlock3DCrossAttn",
69
+ up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
70
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
71
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
72
+ layers_per_block: Union[int, Tuple[int]] = 2,
73
+ downsample_padding: int = 1,
74
+ mid_block_scale_factor: float = 1,
75
+ act_fn: str = "silu",
76
+ norm_num_groups: Optional[int] = 32,
77
+ norm_eps: float = 1e-05,
78
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
79
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
80
+ encoder_hid_dim: Optional[int] = None,
81
+ encoder_hid_dim_type: Optional[str] = None,
82
+ attention_head_dim: Union[int, Tuple[int]] = 8,
83
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
84
+ dual_cross_attention: bool = False,
85
+ use_linear_projection: bool = False,
86
+ class_embed_type: Optional[str] = None,
87
+ addition_embed_type: Optional[str] = None,
88
+ addition_time_embed_dim: Optional[int] = None,
89
+ num_class_embeds: Optional[int] = None,
90
+ upcast_attention: bool = False,
91
+ resnet_time_scale_shift: str = "default",
92
+ resnet_skip_time_act: bool = False,
93
+ resnet_out_scale_factor: int = 1.0,
94
+ time_embedding_type: str = "positional",
95
+ time_embedding_dim: Optional[int] = None,
96
+ time_embedding_act_fn: Optional[str] = None,
97
+ timestep_post_act: Optional[str] = None,
98
+ time_cond_proj_dim: Optional[int] = None,
99
+ conv_in_kernel: int = 3,
100
+ conv_out_kernel: int = 3,
101
+ projection_class_embeddings_input_dim: Optional[int] = None,
102
+ class_embeddings_concat: bool = False,
103
+ mid_block_only_cross_attention: Optional[bool] = None,
104
+ cross_attention_norm: Optional[str] = None,
105
+ addition_embed_type_num_heads=64,
106
+ ):
107
+ super().__init__()
108
+ self.sample_size = sample_size
109
+ if num_attention_heads is not None:
110
+ raise ValueError(
111
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
112
+ )
113
+ num_attention_heads = num_attention_heads or attention_head_dim
114
+ if len(down_block_types) != len(up_block_types):
115
+ raise ValueError(
116
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
117
+ )
118
+ if len(block_out_channels) != len(down_block_types):
119
+ raise ValueError(
120
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
121
+ )
122
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
123
+ raise ValueError(
124
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
125
+ )
126
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
127
+ raise ValueError(
128
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
129
+ )
130
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
131
+ raise ValueError(
132
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
133
+ )
134
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
135
+ raise ValueError(
136
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
137
+ )
138
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
139
+ raise ValueError(
140
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
141
+ )
142
+ conv_in_padding = (conv_in_kernel - 1) // 2
143
+ self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)
144
+ if time_embedding_type == "fourier":
145
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
146
+ if time_embed_dim % 2 != 0:
147
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
148
+ self.time_proj = ppdiffusers.models.embeddings.GaussianFourierProjection(
149
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
150
+ )
151
+ timestep_input_dim = time_embed_dim
152
+ elif time_embedding_type == "positional":
153
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
154
+ self.time_proj = ppdiffusers.models.embeddings.Timesteps(
155
+ block_out_channels[0], flip_sin_to_cos, freq_shift
156
+ )
157
+ timestep_input_dim = block_out_channels[0]
158
+ else:
159
+ raise ValueError(
160
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
161
+ )
162
+ self.time_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
163
+ timestep_input_dim,
164
+ time_embed_dim,
165
+ act_fn=act_fn,
166
+ post_act_fn=timestep_post_act,
167
+ cond_proj_dim=time_cond_proj_dim,
168
+ )
169
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
170
+ encoder_hid_dim_type = "text_proj"
171
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
172
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
173
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
174
+ raise ValueError(
175
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
176
+ )
177
+ if encoder_hid_dim_type == "text_proj":
178
+ self.encoder_hid_proj = paddle.nn.Linear(in_features=encoder_hid_dim, out_features=cross_attention_dim)
179
+ elif encoder_hid_dim_type == "text_image_proj":
180
+ self.encoder_hid_proj = ppdiffusers.models.embeddings.TextImageProjection(
181
+ text_embed_dim=encoder_hid_dim,
182
+ image_embed_dim=cross_attention_dim,
183
+ cross_attention_dim=cross_attention_dim,
184
+ )
185
+ elif encoder_hid_dim_type == "image_proj":
186
+ self.encoder_hid_proj = ppdiffusers.models.embeddings.ImageProjection(
187
+ image_embed_dim=encoder_hid_dim, cross_attention_dim=cross_attention_dim
188
+ )
189
+ elif encoder_hid_dim_type is not None:
190
+ raise ValueError(
191
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
192
+ )
193
+ else:
194
+ self.encoder_hid_proj = None
195
+ if class_embed_type is None and num_class_embeds is not None:
196
+ self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
197
+ elif class_embed_type == "timestep":
198
+ self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
199
+ timestep_input_dim, time_embed_dim, act_fn=act_fn
200
+ )
201
+ elif class_embed_type == "identity":
202
+ self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim)
203
+ elif class_embed_type == "projection":
204
+ if projection_class_embeddings_input_dim is None:
205
+ raise ValueError(
206
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
207
+ )
208
+ self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
209
+ projection_class_embeddings_input_dim, time_embed_dim
210
+ )
211
+ elif class_embed_type == "simple_projection":
212
+ if projection_class_embeddings_input_dim is None:
213
+ raise ValueError(
214
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
215
+ )
216
+ self.class_embedding = paddle.nn.Linear(
217
+ in_features=projection_class_embeddings_input_dim, out_features=time_embed_dim
218
+ )
219
+ else:
220
+ self.class_embedding = None
221
+ if addition_embed_type == "text":
222
+ if encoder_hid_dim is not None:
223
+ text_time_embedding_from_dim = encoder_hid_dim
224
+ else:
225
+ text_time_embedding_from_dim = cross_attention_dim
226
+ self.add_embedding = ppdiffusers.models.embeddings.TextTimeEmbedding(
227
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
228
+ )
229
+ elif addition_embed_type == "text_image":
230
+ self.add_embedding = ppdiffusers.models.embeddings.TextImageTimeEmbedding(
231
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
232
+ )
233
+ elif addition_embed_type == "text_time":
234
+ self.add_time_proj = ppdiffusers.models.embeddings.Timesteps(
235
+ addition_time_embed_dim, flip_sin_to_cos, freq_shift
236
+ )
237
+ self.add_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
238
+ projection_class_embeddings_input_dim, time_embed_dim
239
+ )
240
+ elif addition_embed_type == "image":
241
+ self.add_embedding = ppdiffusers.models.embeddings.ImageTimeEmbedding(
242
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
243
+ )
244
+ elif addition_embed_type == "image_hint":
245
+ self.add_embedding = ppdiffusers.models.embeddings.ImageHintTimeEmbedding(
246
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
247
+ )
248
+ elif addition_embed_type is not None:
249
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
250
+ if time_embedding_act_fn is None:
251
+ self.time_embed_act = None
252
+ else:
253
+ self.time_embed_act = ppdiffusers.models.activations.get_activation(time_embedding_act_fn)
254
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
255
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
256
+ if isinstance(only_cross_attention, bool):
257
+ if mid_block_only_cross_attention is None:
258
+ mid_block_only_cross_attention = only_cross_attention
259
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
260
+ if mid_block_only_cross_attention is None:
261
+ mid_block_only_cross_attention = False
262
+ if isinstance(num_attention_heads, int):
263
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
264
+ if isinstance(attention_head_dim, int):
265
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
266
+ if isinstance(cross_attention_dim, int):
267
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
268
+ if isinstance(layers_per_block, int):
269
+ layers_per_block = [layers_per_block] * len(down_block_types)
270
+ if isinstance(transformer_layers_per_block, int):
271
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
272
+ if class_embeddings_concat:
273
+ blocks_time_embed_dim = time_embed_dim * 2
274
+ else:
275
+ blocks_time_embed_dim = time_embed_dim
276
+ output_channel = block_out_channels[0]
277
+ for i, down_block_type in enumerate(down_block_types):
278
+ res = 2**i
279
+ input_channel = output_channel
280
+ output_channel = block_out_channels[i]
281
+ is_final_block = i == len(block_out_channels) - 1
282
+ down_block = get_down_block(
283
+ down_block_type,
284
+ num_layers=layers_per_block[i],
285
+ transformer_layers_per_block=transformer_layers_per_block[i],
286
+ in_channels=input_channel,
287
+ out_channels=output_channel,
288
+ temb_channels=blocks_time_embed_dim,
289
+ add_downsample=not is_final_block,
290
+ resnet_eps=norm_eps,
291
+ resnet_act_fn=act_fn,
292
+ resnet_groups=norm_num_groups,
293
+ cross_attention_dim=cross_attention_dim[i],
294
+ num_attention_heads=num_attention_heads[i],
295
+ downsample_padding=downsample_padding,
296
+ dual_cross_attention=dual_cross_attention,
297
+ use_linear_projection=use_linear_projection,
298
+ only_cross_attention=only_cross_attention[i],
299
+ upcast_attention=upcast_attention,
300
+ resnet_time_scale_shift=resnet_time_scale_shift,
301
+ resnet_skip_time_act=resnet_skip_time_act,
302
+ resnet_out_scale_factor=resnet_out_scale_factor,
303
+ cross_attention_norm=cross_attention_norm,
304
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
305
+ )
306
+ self.down_blocks.append(down_block)
307
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
308
+ self.mid_block = UNetMidBlock3DCrossAttn(
309
+ transformer_layers_per_block=transformer_layers_per_block[-1],
310
+ in_channels=block_out_channels[-1],
311
+ temb_channels=blocks_time_embed_dim,
312
+ resnet_eps=norm_eps,
313
+ resnet_act_fn=act_fn,
314
+ output_scale_factor=mid_block_scale_factor,
315
+ resnet_time_scale_shift=resnet_time_scale_shift,
316
+ cross_attention_dim=cross_attention_dim[-1],
317
+ num_attention_heads=num_attention_heads[-1],
318
+ resnet_groups=norm_num_groups,
319
+ dual_cross_attention=dual_cross_attention,
320
+ use_linear_projection=use_linear_projection,
321
+ upcast_attention=upcast_attention,
322
+ )
323
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
324
+ raise ValueError("UNetMidBlock2DSimpleCrossAttn not supported")
325
+ elif mid_block_type is None:
326
+ self.mid_block = None
327
+ else:
328
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
329
+ self.num_upsamplers = 0
330
+ reversed_block_out_channels = list(reversed(block_out_channels))
331
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
332
+ reversed_layers_per_block = list(reversed(layers_per_block))
333
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
334
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
335
+ only_cross_attention = list(reversed(only_cross_attention))
336
+ output_channel = reversed_block_out_channels[0]
337
+ for i, up_block_type in enumerate(up_block_types):
338
+ res = 2 ** (len(up_block_types) - 1 - i) # noqa: *
339
+ is_final_block = i == len(block_out_channels) - 1
340
+ prev_output_channel = output_channel
341
+ output_channel = reversed_block_out_channels[i]
342
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
343
+ if not is_final_block:
344
+ add_upsample = True
345
+ self.num_upsamplers += 1
346
+ else:
347
+ add_upsample = False
348
+ up_block = get_up_block(
349
+ up_block_type,
350
+ num_layers=reversed_layers_per_block[i] + 1,
351
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
352
+ in_channels=input_channel,
353
+ out_channels=output_channel,
354
+ prev_output_channel=prev_output_channel,
355
+ temb_channels=blocks_time_embed_dim,
356
+ add_upsample=add_upsample,
357
+ resnet_eps=norm_eps,
358
+ resnet_act_fn=act_fn,
359
+ resnet_groups=norm_num_groups,
360
+ cross_attention_dim=reversed_cross_attention_dim[i],
361
+ num_attention_heads=reversed_num_attention_heads[i],
362
+ dual_cross_attention=dual_cross_attention,
363
+ use_linear_projection=use_linear_projection,
364
+ only_cross_attention=only_cross_attention[i],
365
+ upcast_attention=upcast_attention,
366
+ resnet_time_scale_shift=resnet_time_scale_shift,
367
+ resnet_skip_time_act=resnet_skip_time_act,
368
+ resnet_out_scale_factor=resnet_out_scale_factor,
369
+ cross_attention_norm=cross_attention_norm,
370
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
371
+ )
372
+ self.up_blocks.append(up_block)
373
+ prev_output_channel = output_channel
374
+ if norm_num_groups is not None:
375
+ self.conv_norm_out = paddle.nn.GroupNorm(
376
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
377
+ )
378
+ self.conv_act = ppdiffusers.models.activations.get_activation(act_fn)
379
+ else:
380
+ self.conv_norm_out = None
381
+ self.conv_act = None
382
+ conv_out_padding = (conv_out_kernel - 1) // 2
383
+ self.conv_out = Conv3d(
384
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
385
+ )
386
+
387
+ def temporal_parameters(self) -> list:
388
+ output = []
389
+ all_blocks = list(self.down_blocks) + list(self.up_blocks) + [self.mid_block]
390
+ for block in all_blocks:
391
+ output.extend(block.temporal_parameters())
392
+ return output
393
+
394
+ @property
395
+ def attn_processors(self) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]:
396
+ return self.get_attn_processors(include_temporal_layers=False)
397
+
398
+ def get_attn_processors(
399
+ self, include_temporal_layers=True
400
+ ) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]:
401
+ """
402
+ Returns:
403
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
+ indexed by their weight names.
405
+ """
406
+ processors = {}
407
+
408
+ def fn_recursive_add_processors(
409
+ name: str,
410
+ module: paddle.nn.Layer,
411
+ processors: Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor],
412
+ ):
413
+ if not include_temporal_layers:
414
+ if "temporal" in name:
415
+ return processors
416
+ if hasattr(module, "set_processor"):
417
+ processors[f"{name}.processor"] = module.processor
418
+ for sub_name, child in module.named_children():
419
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
420
+ return processors
421
+
422
+ for name, module in self.named_children():
423
+ fn_recursive_add_processors(name, module, processors)
424
+ return processors
425
+
426
+ def set_attn_processor(
427
+ self,
428
+ processor: Union[
429
+ ppdiffusers.models.attention_processor.AttentionProcessor,
430
+ Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor],
431
+ ],
432
+ include_temporal_layers=False,
433
+ ):
434
+ """
435
+ Sets the attention processor to use to compute attention.
436
+
437
+ Parameters:
438
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
439
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
440
+ for **all** `Attention` layers.
441
+
442
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
443
+ processor. This is strongly recommended when setting trainable attention processors.
444
+
445
+ """
446
+ count = len(self.get_attn_processors(include_temporal_layers=include_temporal_layers).keys())
447
+ if isinstance(processor, dict) and len(processor) != count:
448
+ raise ValueError(
449
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the number of attention layers: {count}. Please make sure to pass {count} processor classes."
450
+ )
451
+
452
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
453
+ if not include_temporal_layers:
454
+ if "temporal" in name:
455
+ return
456
+ if hasattr(module, "set_processor"):
457
+ if not isinstance(processor, dict):
458
+ module.set_processor(processor)
459
+ else:
460
+ module.set_processor(processor.pop(f"{name}.processor"))
461
+ for sub_name, child in module.named_children():
462
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
463
+
464
+ for name, module in self.named_children():
465
+ fn_recursive_attn_processor(name, module, processor)
466
+
467
+ def set_default_attn_processor(self):
468
+ """
469
+ Disables custom attention processors and sets the default attention implementation.
470
+ """
471
+ self.set_attn_processor(ppdiffusers.models.attention_processor.AttnProcessor())
472
+
473
+ def set_attention_slice(self, slice_size):
474
+ """
475
+ Enable sliced attention computation.
476
+
477
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
478
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
479
+
480
+ Args:
481
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
482
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
483
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
484
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
485
+ must be a multiple of `slice_size`.
486
+ """
487
+ sliceable_head_dims = []
488
+
489
+ def fn_recursive_retrieve_sliceable_dims(module: paddle.nn.Layer):
490
+ if hasattr(module, "set_attention_slice"):
491
+ sliceable_head_dims.append(module.sliceable_head_dim)
492
+ for child in module.children():
493
+ fn_recursive_retrieve_sliceable_dims(child)
494
+
495
+ for module in self.children():
496
+ fn_recursive_retrieve_sliceable_dims(module)
497
+ num_sliceable_layers = len(sliceable_head_dims)
498
+ if slice_size == "auto":
499
+ slice_size = [(dim // 2) for dim in sliceable_head_dims]
500
+ elif slice_size == "max":
501
+ slice_size = num_sliceable_layers * [1]
502
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
503
+ if len(slice_size) != len(sliceable_head_dims):
504
+ raise ValueError(
505
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
506
+ )
507
+ for i in range(len(slice_size)):
508
+ size = slice_size[i]
509
+ dim = sliceable_head_dims[i]
510
+ if size is not None and size > dim:
511
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
512
+
513
+ def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]):
514
+ if hasattr(module, "set_attention_slice"):
515
+ module.set_attention_slice(slice_size.pop())
516
+ for child in module.children():
517
+ fn_recursive_set_attention_slice(child, slice_size)
518
+
519
+ reversed_slice_size = list(reversed(slice_size))
520
+ for module in self.children():
521
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
522
+
523
+ def _set_gradient_checkpointing(self, module, value=False):
524
+ if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
525
+ module.gradient_checkpointing = value
526
+
527
+ def forward(
528
+ self,
529
+ sample: paddle.Tensor,
530
+ timestep: Union[paddle.Tensor, float, int],
531
+ encoder_hidden_states: paddle.Tensor,
532
+ class_labels: Optional[paddle.Tensor] = None,
533
+ timestep_cond: Optional[paddle.Tensor] = None,
534
+ attention_mask: Optional[paddle.Tensor] = None,
535
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
536
+ added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
537
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
538
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
539
+ encoder_attention_mask: Optional[paddle.Tensor] = None,
540
+ return_dict: bool = True,
541
+ enable_temporal_attentions: bool = True,
542
+ ) -> Union[UNet3DConditionOutput, Tuple]:
543
+ """
544
+ The [`UNet3DConditionModel`] forward method.
545
+
546
+ Args:
547
+ sample (`paddle.FloatTensor`):
548
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
549
+ timestep (`paddle.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
550
+ encoder_hidden_states (`paddle.FloatTensor`):
551
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
552
+ encoder_attention_mask (`paddle.Tensor`):
553
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
554
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
555
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
556
+ return_dict (`bool`, *optional*, defaults to `True`):
557
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
558
+ tuple.
559
+ cross_attention_kwargs (`dict`, *optional*):
560
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
561
+ added_cond_kwargs: (`dict`, *optional*):
562
+ A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
563
+ are passed along to the UNet blocks.
564
+
565
+ Returns:
566
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
567
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
568
+ a `tuple` is returned where the first element is the sample tensor.
569
+ """
570
+ default_overall_up_factor = 2**self.num_upsamplers
571
+ forward_upsample_size = False
572
+ upsample_size = None
573
+ if any(s % default_overall_up_factor != 0 for s in tuple(sample.shape)[-2:]):
574
+ logger.info("Forward upsample size to force interpolation output size.")
575
+ forward_upsample_size = True
576
+ if attention_mask is not None:
577
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
578
+ attention_mask = attention_mask.unsqueeze(axis=1)
579
+ if encoder_attention_mask is not None:
580
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
581
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(axis=1)
582
+ if self.config.center_input_sample:
583
+ sample = 2 * sample - 1.0
584
+ timesteps = timestep
585
+ if not paddle.is_tensor(x=timesteps):
+ # paddle has no MPS backend, so pick the dtype from the Python scalar type directly
+ dtype = "float64" if isinstance(timestep, float) else "int64"
+ timesteps = paddle.to_tensor(data=[timesteps], dtype=dtype, place=sample.place)
592
+ elif len(tuple(timesteps.shape)) == 0:
593
+ timesteps = timesteps[None].to(sample.place)
594
+ timesteps = timesteps.expand(shape=tuple(sample.shape)[0])
595
+ t_emb = self.time_proj(timesteps)
596
+ t_emb = t_emb.to(dtype=sample.dtype)
597
+ emb = self.time_embedding(t_emb, timestep_cond)
598
+ aug_emb = None
599
+ if self.class_embedding is not None:
600
+ if class_labels is None:
601
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
602
+ if self.config.class_embed_type == "timestep":
603
+ class_labels = self.time_proj(class_labels)
604
+ class_labels = class_labels.to(dtype=sample.dtype)
605
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
606
+ if self.config.class_embeddings_concat:
607
+ emb = paddle.concat(x=[emb, class_emb], axis=-1)
608
+ else:
609
+ emb = emb + class_emb
610
+ if self.config.addition_embed_type == "text":
611
+ aug_emb = self.add_embedding(encoder_hidden_states)
612
+ elif self.config.addition_embed_type == "text_image":
613
+ if "image_embeds" not in added_cond_kwargs:
614
+ raise ValueError(
615
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
616
+ )
617
+ image_embs = added_cond_kwargs.get("image_embeds")
618
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
619
+ aug_emb = self.add_embedding(text_embs, image_embs)
620
+ elif self.config.addition_embed_type == "text_time":
621
+ if "text_embeds" not in added_cond_kwargs:
622
+ raise ValueError(
623
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
624
+ )
625
+ text_embeds = added_cond_kwargs.get("text_embeds")
626
+ if "time_ids" not in added_cond_kwargs:
627
+ raise ValueError(
628
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
629
+ )
630
+ time_ids = added_cond_kwargs.get("time_ids")
631
+ time_embeds = self.add_time_proj(time_ids.flatten())
632
+ time_embeds = time_embeds.reshape((tuple(text_embeds.shape)[0], -1))
633
+ add_embeds = paddle.concat(x=[text_embeds, time_embeds], axis=-1)
634
+ add_embeds = add_embeds.to(emb.dtype)
635
+ aug_emb = self.add_embedding(add_embeds)
636
+ elif self.config.addition_embed_type == "image":
637
+ if "image_embeds" not in added_cond_kwargs:
638
+ raise ValueError(
639
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
640
+ )
641
+ image_embs = added_cond_kwargs.get("image_embeds")
642
+ aug_emb = self.add_embedding(image_embs)
643
+ elif self.config.addition_embed_type == "image_hint":
644
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
645
+ raise ValueError(
646
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
647
+ )
648
+ image_embs = added_cond_kwargs.get("image_embeds")
649
+ hint = added_cond_kwargs.get("hint")
650
+ aug_emb, hint = self.add_embedding(image_embs, hint)
651
+ sample = paddle.concat(x=[sample, hint], axis=1)
652
+ emb = emb + aug_emb if aug_emb is not None else emb
653
+ if self.time_embed_act is not None:
654
+ emb = self.time_embed_act(emb)
655
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
656
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
657
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
658
+ if "image_embeds" not in added_cond_kwargs:
659
+ raise ValueError(
660
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
661
+ )
662
+ image_embeds = added_cond_kwargs.get("image_embeds")
663
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
664
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
665
+ if "image_embeds" not in added_cond_kwargs:
666
+ raise ValueError(
667
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
668
+ )
669
+ image_embeds = added_cond_kwargs.get("image_embeds")
670
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
671
+ sample = self.conv_in(sample)
672
+ down_block_res_samples = (sample,)
673
+ for downsample_block in self.down_blocks:
674
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
675
+ sample, res_samples = downsample_block(
676
+ hidden_states=sample,
677
+ temb=emb,
678
+ encoder_hidden_states=encoder_hidden_states,
679
+ attention_mask=attention_mask,
680
+ cross_attention_kwargs=cross_attention_kwargs,
681
+ enable_temporal_attentions=enable_temporal_attentions,
682
+ )
683
+ else:
684
+ sample, res_samples = downsample_block(
685
+ hidden_states=sample,
686
+ temb=emb,
687
+ encoder_hidden_states=encoder_hidden_states,
688
+ enable_temporal_attentions=enable_temporal_attentions,
689
+ )
690
+ down_block_res_samples += res_samples
691
+ if down_block_additional_residuals is not None:
692
+ new_down_block_res_samples = ()
693
+ for down_block_res_sample, down_block_additional_residual in zip(
694
+ down_block_res_samples, down_block_additional_residuals
695
+ ):
696
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
697
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
698
+ down_block_res_samples = new_down_block_res_samples
699
+ if self.mid_block is not None:
700
+ sample = self.mid_block(
701
+ sample,
702
+ emb,
703
+ encoder_hidden_states=encoder_hidden_states,
704
+ attention_mask=attention_mask,
705
+ cross_attention_kwargs=cross_attention_kwargs,
706
+ enable_temporal_attentions=enable_temporal_attentions,
707
+ )
708
+ if mid_block_additional_residual is not None:
709
+ sample = sample + mid_block_additional_residual
710
+ for i, upsample_block in enumerate(self.up_blocks):
711
+ is_final_block = i == len(self.up_blocks) - 1
712
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
713
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
714
+ if not is_final_block and forward_upsample_size:
715
+ upsample_size = tuple(down_block_res_samples[-1].shape)[2:]
716
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
717
+ sample = upsample_block(
718
+ hidden_states=sample,
719
+ temb=emb,
720
+ res_hidden_states_tuple=res_samples,
721
+ encoder_hidden_states=encoder_hidden_states,
722
+ cross_attention_kwargs=cross_attention_kwargs,
723
+ upsample_size=upsample_size,
724
+ attention_mask=attention_mask,
725
+ enable_temporal_attentions=enable_temporal_attentions,
726
+ )
727
+ else:
728
+ sample = upsample_block(
729
+ hidden_states=sample,
730
+ temb=emb,
731
+ res_hidden_states_tuple=res_samples,
732
+ upsample_size=upsample_size,
733
+ encoder_hidden_states=encoder_hidden_states,
734
+ enable_temporal_attentions=enable_temporal_attentions,
735
+ )
736
+ if self.conv_norm_out:
737
+ sample = self.conv_norm_out(sample)
738
+ sample = self.conv_act(sample)
739
+ sample = self.conv_out(sample)
740
+ if not return_dict:
741
+ return (sample,)
742
+ return UNet3DConditionOutput(sample=sample)
743
+
744
+ @classmethod
745
+ def from_pretrained_spatial(cls, pretrained_model_path, subfolder=None):
746
+ import json
747
+
748
+ if subfolder is not None:
749
+ pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
750
+ config_file = os.path.join(pretrained_model_path, "config.json")
751
+ with open(config_file, "r") as f:
752
+ config = json.load(f)
753
+ config["_class_name"] = "UNet3DConditionModel"
754
+ config["down_block_types"] = ["DownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D"]
755
+ config["up_block_types"] = ["CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "UpBlock3D"]
756
+ config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
757
+ model = cls.from_config(config)
758
+ model_files = [
759
+ os.path.join(pretrained_model_path, "diffusion_paddle_model.bin"),
760
+ os.path.join(pretrained_model_path, "diffusion_paddle_model.safetensors"),
761
+ ]
762
+ model_file = None
763
+ for fp in model_files:
764
+ if os.path.exists(fp):
765
+ model_file = fp
766
+ if not model_file:
767
+ raise RuntimeError(f"{model_file} does not exist")
768
+ if model_file.split(".")[-1] == "safetensors":
769
+ from safetensors import safe_open
770
+
771
+ state_dict = {}
772
+ with safe_open(model_file, framework="pt", device="cuda") as f:
773
+ for key in f.keys():
774
+ state_dict[key] = f.get_tensor(key)
775
+ else:
776
+ state_dict = paddle.load(path=model_file)
777
+ model.set_state_dict(state_dict=state_dict, use_structured_name=False)
778
+ return model
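A minimal sketch of loading spatial (2D) weights into this 3D UNet through `from_pretrained_spatial`. The checkpoint directory is a hypothetical placeholder; per the method above it must contain a `config.json` plus either `diffusion_paddle_model.bin` or `diffusion_paddle_model.safetensors`.

from ppdiffusers.models.hotshot_xl.unet import UNet3DConditionModel  # module path assumed from this diff

# Hypothetical local checkpoint laid out like an SDXL UNet export for Paddle.
unet3d = UNet3DConditionModel.from_pretrained_spatial(
    "./hotshot-xl-checkpoint",  # placeholder directory
    subfolder="unet",
)
unet3d.eval()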
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py ADDED
@@ -0,0 +1,717 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ from paddle.distributed.fleet.utils import recompute
17
+
18
+ from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
19
+ from .transformer_3d import Transformer3DModel
20
+ from .transformer_temporal import TransformerTemporal
21
+
22
+
23
+ def get_down_block(
24
+ down_block_type,
25
+ num_layers,
26
+ in_channels,
27
+ out_channels,
28
+ temb_channels,
29
+ add_downsample,
30
+ resnet_eps,
31
+ resnet_act_fn,
32
+ transformer_layers_per_block=1,
33
+ num_attention_heads=None,
34
+ resnet_groups=None,
35
+ cross_attention_dim=None,
36
+ downsample_padding=None,
37
+ dual_cross_attention=False,
38
+ use_linear_projection=False,
39
+ only_cross_attention=False,
40
+ upcast_attention=False,
41
+ resnet_time_scale_shift="default",
42
+ resnet_skip_time_act=False,
43
+ resnet_out_scale_factor=1.0,
44
+ cross_attention_norm=None,
45
+ attention_head_dim=None,
46
+ downsample_type=None,
47
+ ):
48
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
49
+ if down_block_type == "DownBlock3D":
50
+ return DownBlock3D(
51
+ num_layers=num_layers,
52
+ in_channels=in_channels,
53
+ out_channels=out_channels,
54
+ temb_channels=temb_channels,
55
+ add_downsample=add_downsample,
56
+ resnet_eps=resnet_eps,
57
+ resnet_act_fn=resnet_act_fn,
58
+ resnet_groups=resnet_groups,
59
+ downsample_padding=downsample_padding,
60
+ resnet_time_scale_shift=resnet_time_scale_shift,
61
+ )
62
+ elif down_block_type == "CrossAttnDownBlock3D":
63
+ if cross_attention_dim is None:
64
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
65
+ return CrossAttnDownBlock3D(
66
+ num_layers=num_layers,
67
+ in_channels=in_channels,
68
+ out_channels=out_channels,
69
+ transformer_layers_per_block=transformer_layers_per_block,
70
+ temb_channels=temb_channels,
71
+ add_downsample=add_downsample,
72
+ resnet_eps=resnet_eps,
73
+ resnet_act_fn=resnet_act_fn,
74
+ resnet_groups=resnet_groups,
75
+ downsample_padding=downsample_padding,
76
+ cross_attention_dim=cross_attention_dim,
77
+ num_attention_heads=num_attention_heads,
78
+ dual_cross_attention=dual_cross_attention,
79
+ use_linear_projection=use_linear_projection,
80
+ only_cross_attention=only_cross_attention,
81
+ upcast_attention=upcast_attention,
82
+ resnet_time_scale_shift=resnet_time_scale_shift,
83
+ )
84
+ raise ValueError(f"{down_block_type} does not exist.")
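A small sketch of how the factory above might be called to build a single block in isolation. The channel and group sizes are illustrative, and it assumes the `DownBlock3D` defined later in this file accepts the forwarded arguments in the same way the 2D blocks do.

# Illustrative sizes only; a real UNet3DConditionModel derives these from its config.
down_block = get_down_block(
    "DownBlock3D",
    num_layers=2,
    in_channels=320,
    out_channels=320,
    temb_channels=1280,
    add_downsample=True,
    resnet_eps=1e-5,
    resnet_act_fn="silu",
    resnet_groups=32,
    downsample_padding=1,
)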
85
+
86
+
87
+ def get_up_block(
88
+ up_block_type,
89
+ num_layers,
90
+ in_channels,
91
+ out_channels,
92
+ prev_output_channel,
93
+ temb_channels,
94
+ add_upsample,
95
+ resnet_eps,
96
+ resnet_act_fn,
97
+ transformer_layers_per_block=1,
98
+ num_attention_heads=None,
99
+ resnet_groups=None,
100
+ cross_attention_dim=None,
101
+ dual_cross_attention=False,
102
+ use_linear_projection=False,
103
+ only_cross_attention=False,
104
+ upcast_attention=False,
105
+ resnet_time_scale_shift="default",
106
+ resnet_skip_time_act=False,
107
+ resnet_out_scale_factor=1.0,
108
+ cross_attention_norm=None,
109
+ attention_head_dim=None,
110
+ upsample_type=None,
111
+ ):
112
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
113
+ if up_block_type == "UpBlock3D":
114
+ return UpBlock3D(
115
+ num_layers=num_layers,
116
+ in_channels=in_channels,
117
+ out_channels=out_channels,
118
+ prev_output_channel=prev_output_channel,
119
+ temb_channels=temb_channels,
120
+ add_upsample=add_upsample,
121
+ resnet_eps=resnet_eps,
122
+ resnet_act_fn=resnet_act_fn,
123
+ resnet_groups=resnet_groups,
124
+ resnet_time_scale_shift=resnet_time_scale_shift,
125
+ )
126
+ elif up_block_type == "CrossAttnUpBlock3D":
127
+ if cross_attention_dim is None:
128
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
129
+ return CrossAttnUpBlock3D(
130
+ num_layers=num_layers,
131
+ in_channels=in_channels,
132
+ transformer_layers_per_block=transformer_layers_per_block,
133
+ out_channels=out_channels,
134
+ prev_output_channel=prev_output_channel,
135
+ temb_channels=temb_channels,
136
+ add_upsample=add_upsample,
137
+ resnet_eps=resnet_eps,
138
+ resnet_act_fn=resnet_act_fn,
139
+ resnet_groups=resnet_groups,
140
+ cross_attention_dim=cross_attention_dim,
141
+ num_attention_heads=num_attention_heads,
142
+ dual_cross_attention=dual_cross_attention,
143
+ use_linear_projection=use_linear_projection,
144
+ only_cross_attention=only_cross_attention,
145
+ upcast_attention=upcast_attention,
146
+ resnet_time_scale_shift=resnet_time_scale_shift,
147
+ )
148
+ raise ValueError(f"{up_block_type} does not exist.")
149
+
150
+
151
+ class UNetMidBlock3DCrossAttn(paddle.nn.Layer):
152
+ def __init__(
153
+ self,
154
+ in_channels: int,
155
+ temb_channels: int,
156
+ dropout: float = 0.0,
157
+ num_layers: int = 1,
158
+ transformer_layers_per_block: int = 1,
159
+ resnet_eps: float = 1e-06,
160
+ resnet_time_scale_shift: str = "default",
161
+ resnet_act_fn: str = "swish",
162
+ resnet_groups: int = 32,
163
+ resnet_pre_norm: bool = True,
164
+ num_attention_heads=1,
165
+ output_scale_factor=1.0,
166
+ cross_attention_dim=1280,
167
+ dual_cross_attention=False,
168
+ use_linear_projection=False,
169
+ upcast_attention=False,
170
+ ):
171
+ super().__init__()
172
+ self.has_cross_attention = True
173
+ self.num_attention_heads = num_attention_heads
174
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
175
+ resnets = [
176
+ ResnetBlock3D(
177
+ in_channels=in_channels,
178
+ out_channels=in_channels,
179
+ temb_channels=temb_channels,
180
+ eps=resnet_eps,
181
+ groups=resnet_groups,
182
+ dropout=dropout,
183
+ time_embedding_norm=resnet_time_scale_shift,
184
+ non_linearity=resnet_act_fn,
185
+ output_scale_factor=output_scale_factor,
186
+ pre_norm=resnet_pre_norm,
187
+ )
188
+ ]
189
+ attentions = []
190
+ for _ in range(num_layers):
191
+ if dual_cross_attention:
192
+ raise NotImplementedError
193
+ attentions.append(
194
+ Transformer3DModel(
195
+ num_attention_heads,
196
+ in_channels // num_attention_heads,
197
+ in_channels=in_channels,
198
+ num_layers=transformer_layers_per_block,
199
+ cross_attention_dim=cross_attention_dim,
200
+ norm_num_groups=resnet_groups,
201
+ use_linear_projection=use_linear_projection,
202
+ upcast_attention=upcast_attention,
203
+ )
204
+ )
205
+ resnets.append(
206
+ ResnetBlock3D(
207
+ in_channels=in_channels,
208
+ out_channels=in_channels,
209
+ temb_channels=temb_channels,
210
+ eps=resnet_eps,
211
+ groups=resnet_groups,
212
+ dropout=dropout,
213
+ time_embedding_norm=resnet_time_scale_shift,
214
+ non_linearity=resnet_act_fn,
215
+ output_scale_factor=output_scale_factor,
216
+ pre_norm=resnet_pre_norm,
217
+ )
218
+ )
219
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
220
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
221
+
222
+ def forward(
223
+ self,
224
+ hidden_states,
225
+ temb=None,
226
+ encoder_hidden_states=None,
227
+ attention_mask=None,
228
+ cross_attention_kwargs=None,
229
+ enable_temporal_attentions: bool = True,
230
+ ):
231
+ hidden_states = self.resnets[0](hidden_states, temb)
232
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
233
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
234
+ hidden_states = resnet(hidden_states, temb)
235
+ return hidden_states
236
+
237
+ def temporal_parameters(self) -> list:
238
+ return []
239
+
240
+
241
+ class CrossAttnDownBlock3D(paddle.nn.Layer):
242
+ def __init__(
243
+ self,
244
+ in_channels: int,
245
+ out_channels: int,
246
+ temb_channels: int,
247
+ dropout: float = 0.0,
248
+ num_layers: int = 1,
249
+ transformer_layers_per_block: int = 1,
250
+ resnet_eps: float = 1e-06,
251
+ resnet_time_scale_shift: str = "default",
252
+ resnet_act_fn: str = "swish",
253
+ resnet_groups: int = 32,
254
+ resnet_pre_norm: bool = True,
255
+ num_attention_heads=1,
256
+ cross_attention_dim=1280,
257
+ output_scale_factor=1.0,
258
+ downsample_padding=1,
259
+ add_downsample=True,
260
+ dual_cross_attention=False,
261
+ use_linear_projection=False,
262
+ only_cross_attention=False,
263
+ upcast_attention=False,
264
+ ):
265
+ super().__init__()
266
+ resnets = []
267
+ attentions = []
268
+ temporal_attentions = []
269
+ self.has_cross_attention = True
270
+ self.num_attention_heads = num_attention_heads
271
+ for i in range(num_layers):
272
+ in_channels = in_channels if i == 0 else out_channels
273
+ resnets.append(
274
+ ResnetBlock3D(
275
+ in_channels=in_channels,
276
+ out_channels=out_channels,
277
+ temb_channels=temb_channels,
278
+ eps=resnet_eps,
279
+ groups=resnet_groups,
280
+ dropout=dropout,
281
+ time_embedding_norm=resnet_time_scale_shift,
282
+ non_linearity=resnet_act_fn,
283
+ output_scale_factor=output_scale_factor,
284
+ pre_norm=resnet_pre_norm,
285
+ )
286
+ )
287
+ if dual_cross_attention:
288
+ raise NotImplementedError
289
+ attentions.append(
290
+ Transformer3DModel(
291
+ num_attention_heads,
292
+ out_channels // num_attention_heads,
293
+ in_channels=out_channels,
294
+ num_layers=transformer_layers_per_block,
295
+ cross_attention_dim=cross_attention_dim,
296
+ norm_num_groups=resnet_groups,
297
+ use_linear_projection=use_linear_projection,
298
+ only_cross_attention=only_cross_attention,
299
+ upcast_attention=upcast_attention,
300
+ )
301
+ )
302
+ temporal_attentions.append(
303
+ TransformerTemporal(
304
+ num_attention_heads=8,
305
+ attention_head_dim=out_channels // 8,
306
+ in_channels=out_channels,
307
+ cross_attention_dim=None,
308
+ )
309
+ )
310
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
311
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
312
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
313
+ if add_downsample:
314
+ self.downsamplers = paddle.nn.LayerList(
315
+ sublayers=[
316
+ Downsample3D(
317
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
318
+ )
319
+ ]
320
+ )
321
+ else:
322
+ self.downsamplers = None
323
+ self.gradient_checkpointing = False
324
+
325
+ def forward(
326
+ self,
327
+ hidden_states,
328
+ temb=None,
329
+ encoder_hidden_states=None,
330
+ attention_mask=None,
331
+ cross_attention_kwargs=None,
332
+ enable_temporal_attentions: bool = True,
333
+ ):
334
+ output_states = ()
335
+ for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions):
336
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
337
+
338
+ def create_custom_forward(module, return_dict=None):
339
+ def custom_forward(*inputs):
340
+ if return_dict is not None:
341
+ return module(*inputs, return_dict=return_dict)
342
+ else:
343
+ return module(*inputs)
344
+
345
+ return custom_forward
346
+
347
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
348
+ hidden_states = recompute(
349
+ create_custom_forward(attn, return_dict=False),
350
+ hidden_states,
351
+ encoder_hidden_states,
352
+ use_reentrant=False,
353
+ )[0]
354
+ if enable_temporal_attentions and temporal_attention is not None:
355
+ hidden_states = recompute(
356
+ create_custom_forward(temporal_attention),
357
+ hidden_states,
358
+ encoder_hidden_states,
359
+ use_reentrant=False,
360
+ )
361
+ else:
362
+ hidden_states = resnet(hidden_states, temb)
363
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
364
+ if temporal_attention and enable_temporal_attentions:
365
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
366
+ output_states += (hidden_states,)
367
+ if self.downsamplers is not None:
368
+ for downsampler in self.downsamplers:
369
+ hidden_states = downsampler(hidden_states)
370
+ output_states += (hidden_states,)
371
+ return hidden_states, output_states
372
+
373
+ def temporal_parameters(self) -> list:
374
+ output = []
375
+ for block in self.temporal_attentions:
376
+ if block:
377
+ output.extend(block.parameters())
378
+ return output
379
+
380
+
381
+ class DownBlock3D(paddle.nn.Layer):
382
+ def __init__(
383
+ self,
384
+ in_channels: int,
385
+ out_channels: int,
386
+ temb_channels: int,
387
+ dropout: float = 0.0,
388
+ num_layers: int = 1,
389
+ resnet_eps: float = 1e-06,
390
+ resnet_time_scale_shift: str = "default",
391
+ resnet_act_fn: str = "swish",
392
+ resnet_groups: int = 32,
393
+ resnet_pre_norm: bool = True,
394
+ output_scale_factor=1.0,
395
+ add_downsample=True,
396
+ downsample_padding=1,
397
+ ):
398
+ super().__init__()
399
+ resnets = []
400
+ temporal_attentions = []
401
+ for i in range(num_layers):
402
+ in_channels = in_channels if i == 0 else out_channels
403
+ resnets.append(
404
+ ResnetBlock3D(
405
+ in_channels=in_channels,
406
+ out_channels=out_channels,
407
+ temb_channels=temb_channels,
408
+ eps=resnet_eps,
409
+ groups=resnet_groups,
410
+ dropout=dropout,
411
+ time_embedding_norm=resnet_time_scale_shift,
412
+ non_linearity=resnet_act_fn,
413
+ output_scale_factor=output_scale_factor,
414
+ pre_norm=resnet_pre_norm,
415
+ )
416
+ )
417
+ temporal_attentions.append(
418
+ TransformerTemporal(
419
+ num_attention_heads=8,
420
+ attention_head_dim=out_channels // 8,
421
+ in_channels=out_channels,
422
+ cross_attention_dim=None,
423
+ )
424
+ )
425
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
426
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
427
+ if add_downsample:
428
+ self.downsamplers = paddle.nn.LayerList(
429
+ sublayers=[
430
+ Downsample3D(
431
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
432
+ )
433
+ ]
434
+ )
435
+ else:
436
+ self.downsamplers = None
437
+ self.gradient_checkpointing = False
438
+
439
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, enable_temporal_attentions: bool = True):
440
+ output_states = ()
441
+ for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions):
442
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
443
+
444
+ def create_custom_forward(module):
445
+ def custom_forward(*inputs):
446
+ return module(*inputs)
447
+
448
+ return custom_forward
449
+
450
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
451
+ if enable_temporal_attentions and temporal_attention is not None:
452
+ hidden_states = recompute(
453
+ create_custom_forward(temporal_attention),
454
+ hidden_states,
455
+ encoder_hidden_states,
456
+ use_reentrant=False,
457
+ )
458
+ else:
459
+ hidden_states = resnet(hidden_states, temb)
460
+ if enable_temporal_attentions and temporal_attention:
461
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
462
+ output_states += (hidden_states,)
463
+ if self.downsamplers is not None:
464
+ for downsampler in self.downsamplers:
465
+ hidden_states = downsampler(hidden_states)
466
+ output_states += (hidden_states,)
467
+ return hidden_states, output_states
468
+
469
+ def temporal_parameters(self) -> list:
470
+ output = []
471
+ for block in self.temporal_attentions:
472
+ if block:
473
+ output.extend(block.parameters())
474
+ return output
475
+
476
+
477
+ class CrossAttnUpBlock3D(paddle.nn.Layer):
478
+ def __init__(
479
+ self,
480
+ in_channels: int,
481
+ out_channels: int,
482
+ prev_output_channel: int,
483
+ temb_channels: int,
484
+ dropout: float = 0.0,
485
+ num_layers: int = 1,
486
+ transformer_layers_per_block: int = 1,
487
+ resnet_eps: float = 1e-06,
488
+ resnet_time_scale_shift: str = "default",
489
+ resnet_act_fn: str = "swish",
490
+ resnet_groups: int = 32,
491
+ resnet_pre_norm: bool = True,
492
+ num_attention_heads=1,
493
+ cross_attention_dim=1280,
494
+ output_scale_factor=1.0,
495
+ add_upsample=True,
496
+ dual_cross_attention=False,
497
+ use_linear_projection=False,
498
+ only_cross_attention=False,
499
+ upcast_attention=False,
500
+ ):
501
+ super().__init__()
502
+ resnets = []
503
+ attentions = []
504
+ temporal_attentions = []
505
+ self.has_cross_attention = True
506
+ self.num_attention_heads = num_attention_heads
507
+ for i in range(num_layers):
508
+ res_skip_channels = in_channels if i == num_layers - 1 else out_channels
509
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
510
+ resnets.append(
511
+ ResnetBlock3D(
512
+ in_channels=resnet_in_channels + res_skip_channels,
513
+ out_channels=out_channels,
514
+ temb_channels=temb_channels,
515
+ eps=resnet_eps,
516
+ groups=resnet_groups,
517
+ dropout=dropout,
518
+ time_embedding_norm=resnet_time_scale_shift,
519
+ non_linearity=resnet_act_fn,
520
+ output_scale_factor=output_scale_factor,
521
+ pre_norm=resnet_pre_norm,
522
+ )
523
+ )
524
+ if dual_cross_attention:
525
+ raise NotImplementedError
526
+ attentions.append(
527
+ Transformer3DModel(
528
+ num_attention_heads,
529
+ out_channels // num_attention_heads,
530
+ in_channels=out_channels,
531
+ num_layers=transformer_layers_per_block,
532
+ cross_attention_dim=cross_attention_dim,
533
+ norm_num_groups=resnet_groups,
534
+ use_linear_projection=use_linear_projection,
535
+ only_cross_attention=only_cross_attention,
536
+ upcast_attention=upcast_attention,
537
+ )
538
+ )
539
+ temporal_attentions.append(
540
+ TransformerTemporal(
541
+ num_attention_heads=8,
542
+ attention_head_dim=out_channels // 8,
543
+ in_channels=out_channels,
544
+ cross_attention_dim=None,
545
+ )
546
+ )
547
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
548
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
549
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
550
+ if add_upsample:
551
+ self.upsamplers = paddle.nn.LayerList(
552
+ sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
553
+ )
554
+ else:
555
+ self.upsamplers = None
556
+ self.gradient_checkpointing = False
557
+
558
+ def forward(
559
+ self,
560
+ hidden_states,
561
+ res_hidden_states_tuple,
562
+ temb=None,
563
+ encoder_hidden_states=None,
564
+ upsample_size=None,
565
+ cross_attention_kwargs=None,
566
+ attention_mask=None,
567
+ enable_temporal_attentions: bool = True,
568
+ ):
569
+ for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions):
570
+ res_hidden_states = res_hidden_states_tuple[-1]
571
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
572
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
573
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
574
+
575
+ def create_custom_forward(module, return_dict=None):
576
+ def custom_forward(*inputs):
577
+ if return_dict is not None:
578
+ return module(*inputs, return_dict=return_dict)
579
+ else:
580
+ return module(*inputs)
581
+
582
+ return custom_forward
583
+
584
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
585
+ hidden_states = recompute(
586
+ create_custom_forward(attn, return_dict=False),
587
+ hidden_states,
588
+ encoder_hidden_states,
589
+ use_reentrant=False,
590
+ )[0]
591
+ if enable_temporal_attentions and temporal_attention is not None:
592
+ hidden_states = recompute(
593
+ create_custom_forward(temporal_attention),
594
+ hidden_states,
595
+ encoder_hidden_states,
596
+ use_reentrant=False,
597
+ )
598
+ else:
599
+ hidden_states = resnet(hidden_states, temb)
600
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
601
+ if enable_temporal_attentions and temporal_attention:
602
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
603
+ if self.upsamplers is not None:
604
+ for upsampler in self.upsamplers:
605
+ hidden_states = upsampler(hidden_states, upsample_size)
606
+ return hidden_states
607
+
608
+ def temporal_parameters(self) -> list:
609
+ output = []
610
+ for block in self.temporal_attentions:
611
+ if block:
612
+ output.extend(block.parameters())
613
+ return output
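
The up block consumes the skip connections produced by the down blocks in reverse order: it pops the last entry of res_hidden_states_tuple and concatenates it on the channel axis before each resnet. A standalone sketch of just that plumbing with toy 5-D (batch, channels, frames, height, width) tensors; the channel sizes and the slicing stand-in for the resnet's channel projection are illustrative:

import paddle

skips = tuple(paddle.randn(shape=[1, 320, 2, 8, 8]) for _ in range(3))  # from the down path
hidden_states = paddle.randn(shape=[1, 320, 2, 8, 8])                   # from the previous block

res_tuple = skips
for _ in range(3):  # num_layers of the up block
    res_hidden_states = res_tuple[-1]
    res_tuple = res_tuple[:-1]
    hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)  # 640 channels
    hidden_states = hidden_states[:, :320]  # placeholder for ResnetBlock3D's projection back to 320
print(hidden_states.shape)  # [1, 320, 2, 8, 8]
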
614
+
615
+
616
+ class UpBlock3D(paddle.nn.Layer):
617
+ def __init__(
618
+ self,
619
+ in_channels: int,
620
+ prev_output_channel: int,
621
+ out_channels: int,
622
+ temb_channels: int,
623
+ dropout: float = 0.0,
624
+ num_layers: int = 1,
625
+ resnet_eps: float = 1e-06,
626
+ resnet_time_scale_shift: str = "default",
627
+ resnet_act_fn: str = "swish",
628
+ resnet_groups: int = 32,
629
+ resnet_pre_norm: bool = True,
630
+ output_scale_factor=1.0,
631
+ add_upsample=True,
632
+ ):
633
+ super().__init__()
634
+ resnets = []
635
+ temporal_attentions = []
636
+ for i in range(num_layers):
637
+ res_skip_channels = in_channels if i == num_layers - 1 else out_channels
638
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
639
+ resnets.append(
640
+ ResnetBlock3D(
641
+ in_channels=resnet_in_channels + res_skip_channels,
642
+ out_channels=out_channels,
643
+ temb_channels=temb_channels,
644
+ eps=resnet_eps,
645
+ groups=resnet_groups,
646
+ dropout=dropout,
647
+ time_embedding_norm=resnet_time_scale_shift,
648
+ non_linearity=resnet_act_fn,
649
+ output_scale_factor=output_scale_factor,
650
+ pre_norm=resnet_pre_norm,
651
+ )
652
+ )
653
+ temporal_attentions.append(
654
+ TransformerTemporal(
655
+ num_attention_heads=8,
656
+ attention_head_dim=out_channels // 8,
657
+ in_channels=out_channels,
658
+ cross_attention_dim=None,
659
+ )
660
+ )
661
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
662
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
663
+ if add_upsample:
664
+ self.upsamplers = paddle.nn.LayerList(
665
+ sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
666
+ )
667
+ else:
668
+ self.upsamplers = None
669
+ self.gradient_checkpointing = False
670
+
671
+ def forward(
672
+ self,
673
+ hidden_states,
674
+ res_hidden_states_tuple,
675
+ temb=None,
676
+ upsample_size=None,
677
+ encoder_hidden_states=None,
678
+ enable_temporal_attentions: bool = True,
679
+ ):
680
+ for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions):
681
+ res_hidden_states = res_hidden_states_tuple[-1]
682
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
683
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
684
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
685
+
686
+ def create_custom_forward(module):
687
+ def custom_forward(*inputs):
688
+ return module(*inputs)
689
+
690
+ return custom_forward
691
+
692
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
693
+ if enable_temporal_attentions and temporal_attention is not None:
694
+ hidden_states = recompute(
695
+ create_custom_forward(temporal_attention),
696
+ hidden_states,
697
+ encoder_hidden_states,
698
+ use_reentrant=False,
699
+ )
700
+ else:
701
+ hidden_states = resnet(hidden_states, temb)
702
+ hidden_states = (
703
+ temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
704
+ if enable_temporal_attentions and temporal_attention is not None
705
+ else hidden_states
706
+ )
707
+ if self.upsamplers is not None:
708
+ for upsampler in self.upsamplers:
709
+ hidden_states = upsampler(hidden_states, upsample_size)
710
+ return hidden_states
711
+
712
+ def temporal_parameters(self) -> list:
713
+ output = []
714
+ for block in self.temporal_attentions:
715
+ if block:
716
+ output.extend(block.parameters())
717
+ return output
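
Every block above exposes temporal_parameters(), which suggests the usual recipe of freezing the pretrained spatial weights and training only the temporal attention layers. A hypothetical sketch; the down_blocks/mid_block/up_blocks attribute names are assumptions about the surrounding UNet, not taken from this diff:

import paddle

def collect_temporal_parameters(unet):
    # Gather the parameters of every TransformerTemporal module in the UNet.
    params = []
    for block in list(unet.down_blocks) + [unet.mid_block] + list(unet.up_blocks):
        if hasattr(block, "temporal_parameters"):
            params.extend(block.temporal_parameters())
    return params

# Usage (commented out because it needs a constructed UNet):
# for p in unet.parameters():
#     p.stop_gradient = True            # freeze everything ...
# temporal_params = collect_temporal_parameters(unet)
# for p in temporal_params:
#     p.stop_gradient = False           # ... except the temporal layers
# opt = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=temporal_params)
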
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import sys
16
+
17
+ import paddle
18
+ import paddle_aux
19
+
20
+ import ppdiffusers
21
+
22
+ from .loss_weights import *
23
+ from .noise_conditions import *
24
+ from .samplers import *
25
+ from .scalers import *
26
+ from .schedulers import *
27
+ from .targets import *
28
+
29
+
30
+ class GDF:
31
+ def __init__(self, schedule, input_scaler, target, noise_cond, loss_weight, offset_noise=0):
32
+ self.schedule = schedule
33
+ self.input_scaler = input_scaler
34
+ self.target = target
35
+ self.noise_cond = noise_cond
36
+ self.loss_weight = loss_weight
37
+ self.offset_noise = offset_noise
38
+
39
+ def setup_limits(self, stretch_max=True, stretch_min=True, shift=1):
40
+ stretched_limits = self.input_scaler.setup_limits(
41
+ self.schedule, self.input_scaler, stretch_max, stretch_min, shift
42
+ )
43
+ return stretched_limits
44
+
45
+ def diffuse(self, x0, epsilon=None, t=None, shift=1, loss_shift=1, offset=None):
46
+ if epsilon is None:
47
+ epsilon = paddle.randn(shape=x0.shape, dtype=x0.dtype)
48
+
49
+ if self.offset_noise > 0:
50
+ if offset is None:
51
+ offset = paddle.randn(
52
+ shape=[x0.shape[0], x0.shape[1]] + [1] * (len(x0.shape) - 2),
53
+ )
54
+ epsilon = epsilon + offset * self.offset_noise
55
+ logSNR = self.schedule(x0.shape[0] if t is None else t, shift=shift)
56
+ a, b = self.input_scaler(logSNR)
57
+ if len(a.shape) == 1:
58
+ a, b = a.reshape([-1, *([1] * (len(x0.shape) - 1))]), b.reshape([-1, *([1] * (len(x0.shape) - 1))])
59
+ target = self.target(x0, epsilon, logSNR, a, b)
60
+ return (
61
+ x0 * a + epsilon * b,
62
+ epsilon,
63
+ target,
64
+ logSNR,
65
+ self.noise_cond(logSNR),
66
+ self.loss_weight(logSNR, shift=loss_shift),
67
+ )
68
+
69
+ def undiffuse(self, x, logSNR, pred):
70
+ a, b = self.input_scaler(logSNR)
71
+ if len(a.shape) == 1:
72
+ a, b = a.reshape([-1, *([1] * (len(x.shape) - 1))]), b.reshape([-1, *([1] * (len(x.shape) - 1))])
73
+ return self.target.x0(x, pred, logSNR, a, b), self.target.epsilon(x, pred, logSNR, a, b)
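
diffuse() builds the noised sample as x0 * a + epsilon * b, and undiffuse() inverts that mix through the chosen target parameterization. A standalone numeric check of the identity for the epsilon target, without using the GDF class; the a/b pair below is an arbitrary VP-style example with a**2 + b**2 == 1:

import paddle

a, b = paddle.to_tensor(0.8), paddle.to_tensor(0.6)
x0 = paddle.randn(shape=[2, 4])
epsilon = paddle.randn(shape=[2, 4])

x_t = x0 * a + epsilon * b                 # what diffuse() returns as the noised input
x0_recovered = (x_t - epsilon * b) / a     # what EpsilonTarget.x0 computes inside undiffuse()
print(float((x0 - x0_recovered).abs().max()))  # ~0 up to float32 rounding
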
74
+
75
+ def sample(
76
+ self,
77
+ model,
78
+ model_inputs,
79
+ shape,
80
+ unconditional_inputs=None,
81
+ sampler=None,
82
+ schedule=None,
83
+ t_start=1.0,
84
+ t_end=0.0,
85
+ timesteps=20,
86
+ x_init=None,
87
+ cfg=3.0,
88
+ cfg_t_stop=None,
89
+ cfg_t_start=None,
90
+ cfg_rho=0.7,
91
+ sampler_params=None,
92
+ shift=1,
93
+ device="cpu",
94
+ ):
95
+ sampler_params = {} if sampler_params is None else sampler_params
96
+ if sampler is None:
97
+ sampler = DDPMSampler(self) # noqa
98
+ r_range = paddle.linspace(start=t_start, stop=t_end, num=timesteps + 1)
99
+ schedule = self.schedule if schedule is None else schedule
100
+ logSNR_range = (
101
+ schedule(r_range, shift=shift)[:, None]
102
+ .expand(shape=[-1, shape[0] if x_init is None else x_init.shape[0]])
103
+ .to(device)
104
+ )
105
+ x = sampler.init_x(shape).to(device) if x_init is None else x_init.clone()
106
+ if cfg is not None:
107
+ if unconditional_inputs is None:
108
+ unconditional_inputs = {k: paddle.zeros_like(x=v) for k, v in model_inputs.items()}
109
+ model_inputs = {
110
+ k: (
111
+ paddle.concat(x=[v, v_u], axis=0)
112
+ if isinstance(v, paddle.Tensor)
113
+ else [
114
+ (
115
+ paddle.concat(x=[vi, vi_u], axis=0)
116
+ if isinstance(vi, paddle.Tensor) and isinstance(vi_u, paddle.Tensor)
117
+ else None
118
+ )
119
+ for vi, vi_u in zip(v, v_u)
120
+ ]
121
+ if isinstance(v, list)
122
+ else {vk: paddle.concat(x=[v[vk], v_u.get(vk, paddle.zeros_like(x=v[vk]))], axis=0) for vk in v}
123
+ if isinstance(v, dict)
124
+ else None
125
+ )
126
+ for (k, v), (k_u, v_u) in zip(model_inputs.items(), unconditional_inputs.items())
127
+ }
128
+ for i in range(0, timesteps):
129
+ noise_cond = self.noise_cond(logSNR_range[i])
130
+ if (
131
+ cfg is not None
132
+ and (cfg_t_stop is None or r_range[i].item() >= cfg_t_stop)
133
+ and (cfg_t_start is None or r_range[i].item() <= cfg_t_start)
134
+ ):
135
+ cfg_val = cfg
136
+ if isinstance(cfg_val, (list, tuple)):
137
+ assert len(cfg_val) == 2, "cfg must be a float or a list/tuple of length 2"
138
+ cfg_val = cfg_val[0] * r_range[i].item() + cfg_val[1] * (1 - r_range[i].item())
139
+
140
+ pred, pred_unconditional = model(
141
+ paddle.concat(x=[x, x], axis=0), noise_cond.repeat(2), **model_inputs
142
+ ).chunk(chunks=2)
143
+
144
+ pred_cfg = paddle.lerp(pred_unconditional, pred, paddle.to_tensor(cfg_val, dtype=paddle.float32))
145
+ if cfg_rho > 0:
146
+ std_pos, std_cfg = pred.std(), pred_cfg.std()
147
+ pred = cfg_rho * (pred_cfg * std_pos / (std_cfg + 1e-9)) + pred_cfg * (1 - cfg_rho)
148
+ else:
149
+ pred = pred_cfg
150
+ else:
151
+ pred = model(x, noise_cond, **model_inputs)
152
+
153
+ x0, epsilon = self.undiffuse(x, logSNR_range[i], pred)
154
+ x = sampler(x, x0, epsilon, logSNR_range[i], logSNR_range[i + 1], **sampler_params)
155
+ altered_vars = yield x0, x, pred
156
+ if altered_vars is not None:
157
+ cfg = altered_vars.get("cfg", cfg)
158
+ cfg_rho = altered_vars.get("cfg_rho", cfg_rho)
159
+ sampler = altered_vars.get("sampler", sampler)
160
+ model_inputs = altered_vars.get("model_inputs", model_inputs)
161
+ x = altered_vars.get("x", x)
162
+ x_init = altered_vars.get("x_init", x_init)
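
Note that sample() is a generator: it yields (x0, x, pred) after every step and can be steered mid-run through .send(), whose value lands in altered_vars above. A tiny stand-in showing just that yield/send protocol (it does not call the real sampler or model):

def sampling_protocol(steps):
    cfg = 4.0
    for i in range(steps):
        altered = yield i, cfg          # GDF.sample yields (x0, x, pred) at this point
        if altered is not None:
            cfg = altered.get("cfg", cfg)

loop = sampling_protocol(3)
print(next(loop))                # (0, 4.0)
print(loop.send({"cfg": 2.0}))   # (1, 2.0) -- guidance scale changed between steps
print(next(loop))                # (2, 2.0)
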
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+ import paddle_aux # noqa
18
+
19
+
20
+ class BaseLossWeight:
21
+ def weight(self, logSNR):
22
+ raise NotImplementedError("this method needs to be overridden")
23
+
24
+ def __call__(self, logSNR, *args, shift=1, clamp_range=None, **kwargs):
25
+ clamp_range = [-1000000000.0, 1000000000.0] if clamp_range is None else clamp_range
26
+ if shift != 1:
27
+ logSNR = logSNR.clone() + 2 * np.log(shift)
28
+ return self.weight(logSNR, *args, **kwargs).clip(*clamp_range)
29
+
30
+
31
+ class ComposedLossWeight(BaseLossWeight):
32
+ def __init__(self, div, mul):
33
+ self.mul = [mul] if isinstance(mul, BaseLossWeight) else mul
34
+ self.div = [div] if isinstance(div, BaseLossWeight) else div
35
+
36
+ def weight(self, logSNR):
37
+ prod, div = 1, 1
38
+ for m in self.mul:
39
+ prod *= m.weight(logSNR)
40
+ for d in self.div:
41
+ div *= d.weight(logSNR)
42
+ return prod / div
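
ComposedLossWeight multiplies the weights listed in mul and divides by those in div. A standalone numeric sketch of what, for example, ComposedLossWeight(div=SNRLossWeight(), mul=P2LossWeight()) evaluates to, computed directly from logSNR using the same formulas defined below in this file:

import paddle

logSNR = paddle.to_tensor([-4.0, 0.0, 4.0])
snr = logSNR.exp()                 # SNRLossWeight
p2 = (1.0 + snr) ** -1.0           # P2LossWeight with k=1, gamma=1, s=1
composed = p2 / snr                # mul=[P2] divided by div=[SNR]
print([float(v) for v in composed])
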
43
+
44
+
45
+ class ConstantLossWeight(BaseLossWeight):
46
+ def __init__(self, v=1):
47
+ self.v = v
48
+
49
+ def weight(self, logSNR):
50
+ return paddle.ones_like(x=logSNR) * self.v
51
+
52
+
53
+ class SNRLossWeight(BaseLossWeight):
54
+ def weight(self, logSNR):
55
+ return logSNR.exp()
56
+
57
+
58
+ class P2LossWeight(BaseLossWeight):
59
+ def __init__(self, k=1.0, gamma=1.0, s=1.0):
60
+ self.k, self.gamma, self.s = k, gamma, s
61
+
62
+ def weight(self, logSNR):
63
+ return (self.k + (logSNR * self.s).exp()) ** -self.gamma
64
+
65
+
66
+ class SNRPlusOneLossWeight(BaseLossWeight):
67
+ def weight(self, logSNR):
68
+ return logSNR.exp() + 1
69
+
70
+
71
+ class MinSNRLossWeight(BaseLossWeight):
72
+ def __init__(self, max_snr=5):
73
+ self.max_snr = max_snr
74
+
75
+ def weight(self, logSNR):
76
+ return logSNR.exp().clip(max=self.max_snr)
77
+
78
+
79
+ class MinSNRPlusOneLossWeight(BaseLossWeight):
80
+ def __init__(self, max_snr=5):
81
+ self.max_snr = max_snr
82
+
83
+ def weight(self, logSNR):
84
+ return (logSNR.exp() + 1).clip(max=self.max_snr)
85
+
86
+
87
+ class TruncatedSNRLossWeight(BaseLossWeight):
88
+ def __init__(self, min_snr=1):
89
+ self.min_snr = min_snr
90
+
91
+ def weight(self, logSNR):
92
+ return logSNR.exp().clip(min=self.min_snr)
93
+
94
+
95
+ class SechLossWeight(BaseLossWeight):
96
+ def __init__(self, div=2):
97
+ self.div = div
98
+
99
+ def weight(self, logSNR):
100
+ return 1 / (logSNR / self.div).cosh()
101
+
102
+
103
+ class DebiasedLossWeight(BaseLossWeight):
104
+ def weight(self, logSNR):
105
+ return 1 / logSNR.exp().sqrt()
106
+
107
+
108
+ class SigmoidLossWeight(BaseLossWeight):
109
+ def __init__(self, s=1):
110
+ self.s = s
111
+
112
+ def weight(self, logSNR):
113
+ return (logSNR * self.s).sigmoid()
114
+
115
+
116
+ class AdaptiveLossWeight(BaseLossWeight):
117
+ def __init__(self, logsnr_range=[-10, 10], buckets=300, weight_range=[1e-07, 10000000.0]):
118
+ self.bucket_ranges = paddle.linspace(start=logsnr_range[0], stop=logsnr_range[1], num=buckets - 1)
119
+ self.bucket_losses = paddle.ones(shape=buckets)
120
+ self.weight_range = weight_range
121
+
122
+ def weight(self, logSNR):
123
+ indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR)
124
+ return (1 / self.bucket_losses.to(logSNR.place)[indices]).clip(*self.weight_range)
125
+
126
+ def update_buckets(self, logSNR, loss, beta=0.99):
127
+ indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR).cpu()
128
+ self.bucket_losses[indices] = self.bucket_losses[indices] * beta + loss.detach().cpu() * (1 - beta)
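
AdaptiveLossWeight keeps a running, bucketed estimate of the loss as a function of logSNR and weights new samples by the inverse of that estimate. A hedged usage sketch inside a hypothetical training step; gdf, model and latents are placeholders for objects built elsewhere in the pipeline:

# weighter = AdaptiveLossWeight(logsnr_range=[-10, 10], buckets=300)
# noised, epsilon, target, logSNR, noise_cond, _ = gdf.diffuse(latents, shift=1)
# pred = model(noised, noise_cond)
# per_sample_loss = ((pred - target) ** 2).mean(axis=[1, 2, 3])
# loss = (per_sample_loss * weighter(logSNR)).mean()
# weighter.update_buckets(logSNR, per_sample_loss)   # EMA update of the bucketed loss estimate
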
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+
17
+
18
+ class BaseScaler:
19
+ def __init__(self):
20
+ self.stretched_limits = None
21
+
22
+ def setup_limits(self, schedule, input_scaler, stretch_max=True, stretch_min=True, shift=1):
23
+ min_logSNR = schedule(paddle.ones(shape=[1]), shift=shift)
24
+ max_logSNR = schedule(paddle.zeros(shape=[1]), shift=shift)
25
+ min_a, max_b = [v.item() for v in input_scaler(min_logSNR)] if stretch_max else [0, 1]
26
+ max_a, min_b = [v.item() for v in input_scaler(max_logSNR)] if stretch_min else [1, 0]
27
+ self.stretched_limits = [min_a, max_a, min_b, max_b]
28
+ return self.stretched_limits
29
+
30
+ def stretch_limits(self, a, b):
31
+ min_a, max_a, min_b, max_b = self.stretched_limits
32
+ return (a - min_a) / (max_a - min_a), (b - min_b) / (max_b - min_b)
33
+
34
+ def scalers(self, logSNR):
35
+ raise NotImplementedError("this method needs to be overridden")
36
+
37
+ def __call__(self, logSNR):
38
+ a, b = self.scalers(logSNR)
39
+ if self.stretched_limits is not None:
40
+ a, b = self.stretch_limits(a, b)
41
+ return a, b
42
+
43
+
44
+ class VPScaler(BaseScaler):
45
+ def scalers(self, logSNR):
46
+ a_squared = logSNR.sigmoid()
47
+ a = a_squared.sqrt()
48
+ b = (1 - a_squared).sqrt()
49
+ return a, b
50
+
51
+
52
+ class LERPScaler(BaseScaler):
53
+ def scalers(self, logSNR):
54
+ _a = logSNR.exp() - 1
55
+ _a[_a == 0] = 0.001
56
+ a = 1 + (2 - (2**2 + 4 * _a) ** 0.5) / (2 * _a)
57
+ b = 1 - a
58
+ return a, b
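
For VPScaler the two coefficients satisfy a**2 + b**2 == 1 (the variance-preserving condition), while LERPScaler instead satisfies a + b == 1 by construction. A quick standalone check using the same formulas as the classes above:

import paddle

logSNR = paddle.to_tensor([-6.0, 0.0, 6.0])

a_sq = logSNR.sigmoid()                        # VPScaler
a_vp, b_vp = a_sq.sqrt(), (1 - a_sq).sqrt()
print(a_vp**2 + b_vp**2)                       # ~[1., 1., 1.]

_a = logSNR.exp() - 1                          # LERPScaler
_a[_a == 0] = 0.001
a_lerp = 1 + (2 - (2**2 + 4 * _a) ** 0.5) / (2 * _a)
b_lerp = 1 - a_lerp
print(a_lerp + b_lerp)                         # exactly [1., 1., 1.]
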
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ class EpsilonTarget:
17
+ def __call__(self, x0, epsilon, logSNR, a, b):
18
+ return epsilon
19
+
20
+ def x0(self, noised, pred, logSNR, a, b):
21
+ return (noised - pred * b) / a
22
+
23
+ def epsilon(self, noised, pred, logSNR, a, b):
24
+ return pred
25
+
26
+
27
+ class X0Target:
28
+ def __call__(self, x0, epsilon, logSNR, a, b):
29
+ return x0
30
+
31
+ def x0(self, noised, pred, logSNR, a, b):
32
+ return pred
33
+
34
+ def epsilon(self, noised, pred, logSNR, a, b):
35
+ return (noised - pred * a) / b
36
+
37
+
38
+ class VTarget:
39
+ def __call__(self, x0, epsilon, logSNR, a, b):
40
+ return a * epsilon - b * x0
41
+
42
+ def x0(self, noised, pred, logSNR, a, b):
43
+ squared_sum = a**2 + b**2
44
+ return a / squared_sum * noised - b / squared_sum * pred
45
+
46
+ def epsilon(self, noised, pred, logSNR, a, b):
47
+ squared_sum = a**2 + b**2
48
+ return b / squared_sum * noised + a / squared_sum * pred
49
+
50
+
51
+ class RectifiedFlowsTarget:
52
+ def __call__(self, x0, epsilon, logSNR, a, b):
53
+ return epsilon - x0
54
+
55
+ def x0(self, noised, pred, logSNR, a, b):
56
+ return noised - pred * b
57
+
58
+ def epsilon(self, noised, pred, logSNR, a, b):
59
+ return noised + pred * a
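
Each target class defines the quantity the network predicts together with the inverse maps back to x0 and epsilon. A standalone round-trip check for VTarget (v-prediction): build pred from known x0/epsilon, then recover both; a and b are chosen with a**2 + b**2 == 1, as VPScaler would produce:

import paddle

a, b = paddle.to_tensor(0.8), paddle.to_tensor(0.6)
x0 = paddle.randn(shape=[2, 4])
epsilon = paddle.randn(shape=[2, 4])

noised = x0 * a + epsilon * b
pred = a * epsilon - b * x0                    # VTarget.__call__
s = a**2 + b**2
x0_rec = a / s * noised - b / s * pred         # VTarget.x0
eps_rec = b / s * noised + a / s * pred        # VTarget.epsilon
print(float((x0 - x0_rec).abs().max()), float((epsilon - eps_rec).abs().max()))  # both ~0
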
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .effnet import EfficientNetEncoder
16
+ from .previewer import Previewer
17
+ from .stage_c import AttnBlock, FeedForwardBlock, ResBlock, StageC, TimestepBlock
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+ import paddle.nn as nn
18
+
19
+
20
+ def load(path="../x.npy"):
21
+ return paddle.to_tensor(np.load(path))
22
+
23
+
24
+ def diff(a, b):
25
+ return (a - b).abs().mean()
26
+
27
+
28
+ class Linear(nn.Linear):
29
+ def reset_parameters(self):
30
+ return None
31
+
32
+
33
+ class Conv2d(nn.Conv2D):
34
+ def reset_parameters(self):
35
+ return None
36
+
37
+
38
+ class Attention2D(nn.Layer):
39
+ def __init__(self, c, nhead, dropout=0.0):
40
+ super().__init__()
41
+ self.attn = nn.MultiHeadAttention(c, nhead, dropout=dropout)
42
+
43
+ def forward(self, x, kv, self_attn=False):
44
+ orig_shape = x.shape
45
+ x = x.reshape([x.shape[0], x.shape[1], -1]).transpose([0, 2, 1])
46
+ if self_attn:
47
+ kv = paddle.concat([x, kv], axis=1)
48
+ x = self.attn(x, kv, kv)
49
+ x = x.transpose([0, 2, 1]).reshape(orig_shape)
50
+ return x
51
+
52
+
53
+ class LayerNorm2d(nn.LayerNorm):
54
+ def __init__(self, *args, **kwargs):
55
+ super().__init__(*args, **kwargs)
56
+
57
+ def forward(self, x):
58
+ return super().forward(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
59
+
60
+
61
+ class GlobalResponseNorm(nn.Layer):
62
+ def __init__(self, dim):
63
+ super(GlobalResponseNorm, self).__init__()
64
+ self.gamma = self.create_parameter(
65
+ shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0)
66
+ )
67
+ self.beta = self.create_parameter(
68
+ shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0)
69
+ )
70
+ self.gamma.stop_gradient = False
71
+ self.beta.stop_gradient = False
72
+
73
+ def forward(self, x):
74
+ Gx = paddle.norm(x, p=2, axis=(1, 2), keepdim=True)
75
+ Nx = Gx / (paddle.mean(Gx, axis=-1, keepdim=True) + 1e-6)
76
+ x = self.gamma * (x * Nx) + self.beta + x
77
+ return x
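
GlobalResponseNorm follows the ConvNeXt-V2 GRN formulation: per-channel spatial L2 norms are divided by their mean across channels, and the result gates the input with a learned residual scale. A standalone numeric sketch of the normalization itself, in the channels-last layout used when ResBlock below calls it:

import paddle

x = paddle.randn(shape=[1, 8, 8, 16])                       # NHWC, 16 channels
Gx = paddle.norm(x, p=2, axis=(1, 2), keepdim=True)         # per-channel spatial L2 norm
Nx = Gx / (paddle.mean(Gx, axis=-1, keepdim=True) + 1e-6)   # relative to the channel mean
print(Gx.shape, float(Nx.mean()))                           # [1, 1, 1, 16], mean ~1.0
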
78
+
79
+
80
+ class ResBlock(nn.Layer):
81
+ def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
82
+ super().__init__()
83
+ self.depthwise = Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
84
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
85
+ self.channelwise = nn.Sequential(
86
+ Linear(c + c_skip, c * 4),
87
+ nn.GELU(),
88
+ GlobalResponseNorm(c * 4),
89
+ nn.Dropout(p=dropout),
90
+ Linear(c * 4, c),
91
+ )
92
+
93
+ def forward(self, x, x_skip=None):
94
+ x_res = x
95
+ x = self.depthwise(x)
96
+ x = self.norm(x)
97
+ if x_skip is not None:
98
+ x = paddle.concat(x=[x, x_skip], axis=1)
99
+
100
+ x = self.channelwise(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
101
+ return x + x_res
102
+
103
+
104
+ class AttnBlock(nn.Layer):
105
+ def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
106
+ super().__init__()
107
+ self.self_attn = self_attn
108
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
109
+ self.attention = Attention2D(c, nhead, dropout)
110
+ self.kv_mapper = nn.Sequential(nn.Silu(), Linear(c_cond, c))
111
+
112
+ def forward(self, x, kv):
113
+ kv = self.kv_mapper(kv)
114
+ x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn)
115
+ return x
116
+
117
+
118
+ class FeedForwardBlock(nn.Layer):
119
+ def __init__(self, c, dropout=0.0):
120
+ super().__init__()
121
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
122
+ self.channelwise = nn.Sequential(
123
+ Linear(c, c * 4),
124
+ nn.GELU(),
125
+ GlobalResponseNorm(c * 4),
126
+ nn.Dropout(p=dropout),
127
+ Linear(c * 4, c),
128
+ )
129
+
130
+ def forward(self, x):
131
+ x = x + self.channelwise(self.norm(x).transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
132
+ return x
133
+
134
+
135
+ class TimestepBlock(nn.Layer):
136
+ def __init__(self, c, c_timestep, conds=["sca"], trainable=True):
137
+ super(TimestepBlock, self).__init__()
138
+ self.mapper = nn.Linear(c_timestep, c * 2, bias_attr=trainable)
139
+ self.conds = conds
140
+ for cname in conds:
141
+ setattr(self, f"mapper_{cname}", nn.Linear(c_timestep, c * 2, bias_attr=trainable))
142
+
143
+ def forward(self, x, t):
144
+ t = paddle.split(t, num_or_sections=len(self.conds) + 1, axis=1)
145
+ a_b = self.mapper(t[0])
146
+ a, b = a_b[:, : a_b.shape[1] // 2, None, None], a_b[:, a_b.shape[1] // 2 :, None, None]
147
+ for i, c in enumerate(self.conds):
148
+ ac_bc = getattr(self, f"mapper_{c}")(t[i + 1])
149
+ ac, bc = ac_bc[:, : ac_bc.shape[1] // 2, None, None], ac_bc[:, ac_bc.shape[1] // 2 :, None, None]
150
+ a, b = a + ac, b + bc
151
+ return x * (1 + a) + b
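
TimestepBlock splits the conditioning vector into one chunk per condition (the timestep embedding plus each extra cond such as "sca"), maps every chunk to a scale/shift pair, sums the pairs, and applies x * (1 + a) + b. A standalone sketch of that modulation with plain tensors; the sizes and the random stand-in for the per-cond Linear mapper are illustrative:

import paddle

c, c_timestep, n_conds = 8, 4, 2                  # feature channels, embed size, number of chunks
x = paddle.randn(shape=[1, c, 4, 4])
t = paddle.randn(shape=[1, c_timestep * n_conds]) # concatenated embeddings, split inside forward()

chunks = paddle.split(t, num_or_sections=n_conds, axis=1)
a = paddle.zeros(shape=[1, c, 1, 1])
b = paddle.zeros(shape=[1, c, 1, 1])
for chunk in chunks:
    ab = paddle.randn(shape=[1, 2 * c])           # stand-in for mapper / mapper_<cond>(chunk)
    a = a + ab[:, :c, None, None]
    b = b + ab[:, c:, None, None]
out = x * (1 + a) + b
print(out.shape)                                  # [1, 8, 4, 4]
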
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py ADDED
@@ -0,0 +1,561 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import math
17
+ from dataclasses import dataclass
18
+ from functools import partial
19
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
20
+
21
+ import paddle
22
+ import paddle.nn as nn
23
+ from paddle import Tensor
24
+ from paddle.nn import (
25
+ AdaptiveAvgPool2D,
26
+ BatchNorm,
27
+ BatchNorm2D,
28
+ Conv2D,
29
+ Dropout,
30
+ GroupNorm,
31
+ Layer,
32
+ Linear,
33
+ ReLU,
34
+ Sequential,
35
+ Sigmoid,
36
+ Silu,
37
+ )
38
+ from paddle.nn.initializer import Constant, KaimingNormal, Uniform
39
+ from paddle.utils.download import get_weights_path_from_url
40
+
41
+ __all__ = ["EfficientNet", "EfficientNet_V2_S_Weights", "efficientnet_v2_s"]
42
+
43
+
44
+ class SqueezeExcitation(paddle.nn.Layer):
45
+ """
46
+ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
47
+ Parameters ``activation`` and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
48
+
49
+ Args:
50
+ input_channels (int): Number of channels in the input feature maps
51
+ squeeze_channels (int): Number of squeeze channels
52
+ activation (Callable[[Tensor], Tensor], optional): ``delta`` activation. Default: ReLU
53
+ scale_activation (Callable[[Tensor], Tensor], optional): ``sigma`` activation. Default: Sigmoid
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ input_channels: int,
59
+ squeeze_channels: int,
60
+ activation: Callable[[Tensor], Tensor] = ReLU(),
61
+ scale_activation: Callable[[Tensor], Tensor] = Sigmoid(),
62
+ ) -> None:
63
+ super(SqueezeExcitation, self).__init__()
64
+ self.avgpool = AdaptiveAvgPool2D(1)
65
+ self.fc1 = Conv2D(in_channels=input_channels, out_channels=squeeze_channels, kernel_size=1)
66
+ self.fc2 = Conv2D(in_channels=squeeze_channels, out_channels=input_channels, kernel_size=1)
67
+ self.activation = activation
68
+ self.scale_activation = scale_activation
69
+
70
+ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
71
+ scale = self.avgpool(input)
72
+ scale = self.fc1(scale)
73
+ scale = self.activation(scale)
74
+ scale = self.fc2(scale)
75
+ scale = self.scale_activation(scale)
76
+ return scale * input
77
+
78
+
79
+ def stochastic_depth(input, p, mode, training=True):
80
+ """
81
+ Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth"
82
+ <https://arxiv.org/abs/1603.09382>`_ used for randomly dropping residual
83
+ branches of residual architectures.
84
+
85
+ Args:
86
+ input (paddle.Tensor): The input tensor or arbitrary dimensions with the first one
87
+ being its batch i.e. a batch with ``N`` rows.
88
+ p (float): probability of the input to be zeroed.
89
+ mode (str): ``"batch"`` or ``"row"``.
90
+ ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes
91
+ randomly selected rows from the batch.
92
+ training (bool): apply stochastic depth if is ``True``. Default: ``True``
93
+
94
+ Returns:
95
+ paddle.Tensor: The randomly zeroed tensor.
96
+ """
97
+ if p < 0.0 or p > 1.0:
98
+ raise ValueError(f"drop probability has to be between 0 and 1, but got {p}")
99
+ if mode not in ["batch", "row"]:
100
+ raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}")
101
+ if not training or p == 0.0:
102
+ return input
103
+
104
+ survival_rate = 1.0 - p
105
+ if mode == "row":
106
+ size = [input.shape[0]] + [1] * (input.ndim - 1)
107
+ else:
108
+ size = [1] * input.ndim
109
+ noise = paddle.empty(size, dtype=input.dtype)
110
+ survival_rate = paddle.to_tensor(survival_rate, dtype=input.dtype)
111
+ paddle.assign(paddle.bernoulli(paddle.broadcast_to(survival_rate, noise.shape)), noise)
112
+ if survival_rate > 0.0:
113
+ noise /= survival_rate
114
+ return input * noise
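
In "row" mode each sample's residual branch is kept with probability 1 - p and, when kept, rescaled by 1 / (1 - p), so the expected value of the branch is unchanged. A quick standalone check of that expectation, using an all-ones tensor so the mean is easy to read:

import paddle

p = 0.2
x = paddle.ones(shape=[10000, 4])
keep = paddle.bernoulli(paddle.full(shape=[10000, 1], fill_value=1.0 - p))
out = x * keep / (1.0 - p)                # same scaling stochastic_depth applies per row
print(float(out.mean()))                  # ~1.0: the expectation is preserved
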
115
+
116
+
117
+ class StochasticDepth(Layer):
118
+ """
119
+ See :func:`stochastic_depth`.
120
+ """
121
+
122
+ def __init__(self, p: float, mode: str) -> None:
123
+ super(StochasticDepth, self).__init__()
124
+ self.p = p
125
+ self.mode = mode
126
+
127
+ def forward(self, input):
128
+ return stochastic_depth(input, self.p, self.mode, self.training)
129
+
130
+ def __repr__(self):
131
+ s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})"
132
+ return s
133
+
134
+
135
+ def _make_ntuple(value, n):
136
+ """Helper function to create a tuple of size n with the given value."""
137
+ if isinstance(value, int):
138
+ return (value,) * n
139
+ return value
140
+
141
+
142
+ class ConvNormActivation(Sequential):
143
+ def __init__(
144
+ self,
145
+ in_channels: int,
146
+ out_channels: int,
147
+ kernel_size: Union[int, Sequence[int]] = 3,
148
+ stride: Union[int, Sequence[int]] = 1,
149
+ padding: Optional[Union[int, Sequence[int], str]] = None,
150
+ groups: int = 1,
151
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm,
152
+ activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU,
153
+ dilation: Union[int, Sequence[int]] = 1,
154
+ inplace: Optional[bool] = True,
155
+ bias: Optional[bool] = None,
156
+ conv_layer: Callable[..., Conv2D] = Conv2D,
157
+ ) -> None:
158
+ if padding is None:
159
+ padding = (kernel_size - 1) // 2 * dilation
160
+ else:
161
+ padding = _make_ntuple(padding, len(kernel_size))
162
+
163
+ layers = [
164
+ conv_layer(
165
+ in_channels,
166
+ out_channels,
167
+ kernel_size,
168
+ stride,
169
+ padding,
170
+ dilation=dilation,
171
+ groups=groups,
172
+ bias_attr=False if bias is None else bias,
173
+ )
174
+ ]
175
+
176
+ if norm_layer is not None:
177
+ norm_layer_instance = norm_layer(out_channels, use_global_stats=True)
178
+ layers.append(norm_layer_instance)
179
+
180
+ if activation_layer is not None:
181
+ layers.append(activation_layer)
182
+
183
+ super(ConvNormActivation, self).__init__(*layers)
184
+ self.out_channels = out_channels
185
+
186
+
187
+ class Conv2DNormActivation(ConvNormActivation):
188
+ def __init__(
189
+ self,
190
+ in_channels: int,
191
+ out_channels: int,
192
+ kernel_size: Union[int, Tuple[int, int]] = 3,
193
+ stride: Union[int, Tuple[int, int]] = 1,
194
+ padding: Optional[Union[int, Tuple[int, int], str]] = None,
195
+ groups: int = 1,
196
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm,
197
+ activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU,
198
+ dilation: Union[int, Tuple[int, int]] = 1,
199
+ inplace: Optional[bool] = True,
200
+ bias: Optional[bool] = None,
201
+ ) -> None:
202
+ super().__init__(
203
+ in_channels,
204
+ out_channels,
205
+ kernel_size,
206
+ stride,
207
+ padding,
208
+ groups,
209
+ norm_layer,
210
+ activation_layer,
211
+ dilation,
212
+ inplace,
213
+ bias,
214
+ Conv2D,
215
+ )
216
+
217
+
218
+ class EfficientNet_V2_S_Weights:
219
+ IMAGENET1K_V1 = "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth"
220
+
221
+ def __init__(self, url: str, transforms: Callable[..., Any], meta: Dict[str, Any]) -> None:
222
+ self.url = url
223
+ self.transforms = transforms
224
+ self.meta = meta
225
+
226
+ def state_dict(self, progress: bool = True, check_hash: bool = False) -> Dict[str, Any]:
227
+ path = get_weights_path_from_url(self.url, progress=progress, check_hash=check_hash)
228
+ return paddle.load(path)
229
+
230
+ @classmethod
231
+ def verify(cls, weights):
232
+ if weights is None:
233
+ return None
234
+ if not isinstance(weights, EfficientNet_V2_S_Weights):
235
+ raise ValueError(f"weights must be an instance of EfficientNet_V2_S_Weights, but got {type(weights)}")
236
+ return weights
237
+
238
+
239
+ @dataclass
240
+ class _MBConvConfig:
241
+ expand_ratio: float
242
+ kernel: int
243
+ stride: int
244
+ input_channels: int
245
+ out_channels: int
246
+ num_layers: int
247
+ block: Callable[..., paddle.nn.Layer]
248
+
249
+ @staticmethod
250
+ def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int:
251
+ return _make_divisible(channels * width_mult, 8, min_value)
252
+
253
+
254
+ class MBConvConfig(_MBConvConfig):
255
+ def __init__(
256
+ self,
257
+ expand_ratio: float,
258
+ kernel: int,
259
+ stride: int,
260
+ input_channels: int,
261
+ out_channels: int,
262
+ num_layers: int,
263
+ width_mult: float = 1.0,
264
+ depth_mult: float = 1.0,
265
+ block: Optional[Callable[..., paddle.nn.Layer]] = None,
266
+ ) -> None:
267
+ input_channels = self.adjust_channels(input_channels, width_mult)
268
+ out_channels = self.adjust_channels(out_channels, width_mult)
269
+ num_layers = self.adjust_depth(num_layers, depth_mult)
270
+ if block is None:
271
+ block = MBConv
272
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
273
+
274
+ @staticmethod
275
+ def adjust_depth(num_layers: int, depth_mult: float):
276
+ return int(math.ceil(num_layers * depth_mult))
277
+
278
+
279
+ class FusedMBConvConfig(_MBConvConfig):
280
+ def __init__(
281
+ self,
282
+ expand_ratio: float,
283
+ kernel: int,
284
+ stride: int,
285
+ input_channels: int,
286
+ out_channels: int,
287
+ num_layers: int,
288
+ block: Optional[Callable[..., paddle.nn.Layer]] = None,
289
+ ) -> None:
290
+ if block is None:
291
+ block = FusedMBConv
292
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
293
+
294
+
295
+ class MBConv(Layer):
296
+ def __init__(
297
+ self,
298
+ cnf,
299
+ stochastic_depth_prob: float,
300
+ norm_layer: Callable[..., Layer],
301
+ se_layer: Callable[..., Layer] = SqueezeExcitation,
302
+ ) -> None:
303
+ super(MBConv, self).__init__()
304
+
305
+ if not (1 <= cnf.stride <= 2):
306
+ raise ValueError("illegal stride value")
307
+
308
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
309
+
310
+ layers = []
311
+ activation_layer = nn.Silu()
312
+
313
+ # expand
314
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
315
+ if expanded_channels != cnf.input_channels:
316
+ layers.append(
317
+ Conv2DNormActivation(
318
+ cnf.input_channels,
319
+ expanded_channels,
320
+ kernel_size=1,
321
+ norm_layer=norm_layer,
322
+ activation_layer=activation_layer,
323
+ )
324
+ )
325
+
326
+ # depthwise
327
+ layers.append(
328
+ Conv2DNormActivation(
329
+ expanded_channels,
330
+ expanded_channels,
331
+ kernel_size=cnf.kernel,
332
+ stride=cnf.stride,
333
+ groups=expanded_channels,
334
+ norm_layer=norm_layer,
335
+ activation_layer=activation_layer,
336
+ )
337
+ )
338
+
339
+ # squeeze and excitation
340
+ squeeze_channels = max(1, cnf.input_channels // 4)
341
+ layers.append(se_layer(expanded_channels, squeeze_channels, activation=nn.Silu()))
342
+
343
+ # project
344
+ layers.append(
345
+ Conv2DNormActivation(
346
+ expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
347
+ )
348
+ )
349
+
350
+ self.block = Sequential(*layers)
351
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
352
+ self.out_channels = cnf.out_channels
353
+
354
+ def forward(self, input) -> paddle.Tensor:
355
+ result = self.block(input)
356
+ if self.use_res_connect:
357
+ result = self.stochastic_depth(result)
358
+ result += input
359
+ return result
360
+
361
+
362
+ class FusedMBConv(Layer):
363
+ def __init__(
364
+ self,
365
+ cnf: "FusedMBConvConfig",
366
+ stochastic_depth_prob: float,
367
+ norm_layer: Callable[..., Layer],
368
+ ) -> None:
369
+ super(FusedMBConv, self).__init__()
370
+
371
+ if not (1 <= cnf.stride <= 2):
372
+ raise ValueError("illegal stride value")
373
+
374
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
375
+
376
+ layers: List[Layer] = []
377
+ activation_layer = nn.Silu()
378
+
379
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
380
+ if expanded_channels != cnf.input_channels:
381
+ # fused expand and project
382
+ layers.append(
383
+ Conv2DNormActivation(
384
+ cnf.input_channels,
385
+ expanded_channels,
386
+ kernel_size=cnf.kernel,
387
+ stride=cnf.stride,
388
+ norm_layer=norm_layer,
389
+ activation_layer=activation_layer,
390
+ )
391
+ )
392
+ # project
393
+ layers.append(
394
+ Conv2DNormActivation(
395
+ expanded_channels,
396
+ cnf.out_channels,
397
+ kernel_size=1,
398
+ norm_layer=norm_layer,
399
+ activation_layer=None,
400
+ )
401
+ )
402
+ else:
403
+ layers.append(
404
+ Conv2DNormActivation(
405
+ cnf.input_channels,
406
+ cnf.out_channels,
407
+ kernel_size=cnf.kernel,
408
+ stride=cnf.stride,
409
+ norm_layer=norm_layer,
410
+ activation_layer=activation_layer,
411
+ )
412
+ )
413
+
414
+ self.block = Sequential(*layers)
415
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
416
+ self.out_channels = cnf.out_channels
417
+
418
+ def forward(self, input: Tensor) -> Tensor:
419
+ result = self.block(input)
420
+ if self.use_res_connect:
421
+ result = self.stochastic_depth(result)
422
+ result += input
423
+ return result
424
+
425
+
426
+ class EfficientNet(Layer):
427
+ def __init__(
428
+ self,
429
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
430
+ dropout: float,
431
+ stochastic_depth_prob: float = 0.2,
432
+ num_classes: int = 1000,
433
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = None,
434
+ last_channel: Optional[int] = None,
435
+ ) -> None:
436
+ super().__init__()
437
+ if not inverted_residual_setting:
438
+ raise ValueError("The inverted_residual_setting should not be empty")
439
+ elif not (
440
+ isinstance(inverted_residual_setting, Sequence)
441
+ and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
442
+ ):
443
+ raise TypeError("The inverted_residual_setting should be List[MBConvConfig]")
444
+ if norm_layer is None:
445
+ norm_layer = BatchNorm2D
446
+ layers: List[paddle.nn.Layer] = []
447
+ firstconv_output_channels = inverted_residual_setting[0].input_channels
448
+ layers.append(
449
+ Conv2DNormActivation(
450
+ 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=Silu()
451
+ )
452
+ )
453
+ total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
454
+ stage_block_id = 0
455
+ for cnf in inverted_residual_setting:
456
+ stage: List[paddle.nn.Layer] = []
457
+ for _ in range(cnf.num_layers):
458
+ block_cnf = copy.copy(cnf)
459
+ if stage:
460
+ block_cnf.input_channels = block_cnf.out_channels
461
+ block_cnf.stride = 1
462
+ sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
463
+ stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
464
+ stage_block_id += 1
465
+ layers.append(Sequential(*stage))
466
+ lastconv_input_channels = inverted_residual_setting[-1].out_channels
467
+ lastconv_output_channels = last_channel if last_channel is not None else 4 * lastconv_input_channels
468
+ layers.append(
469
+ Conv2DNormActivation(
470
+ lastconv_input_channels,
471
+ lastconv_output_channels,
472
+ kernel_size=1,
473
+ norm_layer=norm_layer,
474
+ activation_layer=Silu(),
475
+ )
476
+ )
477
+ self.features = Sequential(*layers)
478
+ self.avgpool = AdaptiveAvgPool2D(output_size=1)
479
+ self.classifier = Sequential(
480
+ Dropout(p=dropout), Linear(in_features=lastconv_output_channels, out_features=num_classes)
481
+ )
482
+
483
+ for m in self.sublayers():
484
+ if isinstance(m, Conv2D):
485
+ KaimingNormal()(m.weight)
486
+ if m.bias is not None:
487
+ Constant(value=0.0)(m.bias)
488
+ elif isinstance(m, (BatchNorm2D, GroupNorm)):
489
+ Constant(value=1.0)(m.weight)
490
+ Constant(value=0.0)(m.bias)
491
+ elif isinstance(m, Linear):
492
+ init_range = 1.0 / math.sqrt(m.weight.shape[1])
493
+ Uniform(low=-init_range, high=init_range)(m.weight)
494
+ Constant(value=0.0)(m.bias)
495
+
496
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
497
+ x = self.features(x)
498
+ x = self.avgpool(x)
499
+ x = paddle.flatten(x=x, start_axis=1)
500
+ x = self.classifier(x)
501
+ return x
502
+
503
+
504
+ def _make_divisible(value: float, divisor: int, min_value: Optional[int] = None) -> int:
505
+ if min_value is None:
506
+ min_value = divisor
507
+ new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
508
+ if new_value < 0.9 * value:
509
+ new_value += divisor
510
+ return new_value
511
+
512
+
513
+ def _efficientnet(
514
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
515
+ dropout: float,
516
+ last_channel: Optional[int],
517
+ weights: Optional[EfficientNet_V2_S_Weights],
518
+ progress: bool,
519
+ **kwargs: Any
520
+ ) -> EfficientNet:
521
+ if weights is not None:
522
+ kwargs["num_classes"] = len(weights.meta["categories"])
523
+ model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs)
524
+ if weights is not None:
525
+ model.set_state_dict(weights.state_dict(progress=progress, check_hash=True))
526
+ return model
527
+
528
+
529
+ def _efficientnet_conf(
530
+ arch: str, **kwargs: Any
531
+ ) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
532
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
533
+ if arch.startswith("efficientnet_v2_s"):
534
+ inverted_residual_setting = [
535
+ FusedMBConvConfig(1, 3, 1, 24, 24, 2),
536
+ FusedMBConvConfig(4, 3, 2, 24, 48, 4),
537
+ FusedMBConvConfig(4, 3, 2, 48, 64, 4),
538
+ MBConvConfig(4, 3, 2, 64, 128, 6),
539
+ MBConvConfig(6, 3, 1, 128, 160, 9),
540
+ MBConvConfig(6, 3, 2, 160, 256, 15),
541
+ ]
542
+ last_channel = 1280
543
+ else:
544
+ raise ValueError(f"Unsupported model type {arch}")
545
+ return inverted_residual_setting, last_channel
546
+
547
+
548
+ def efficientnet_v2_s(
549
+ *, weights: Optional[EfficientNet_V2_S_Weights] = None, progress: bool = True, **kwargs: Any
550
+ ) -> EfficientNet:
551
+ weights = EfficientNet_V2_S_Weights.verify(weights)
552
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s")
553
+ return _efficientnet(
554
+ inverted_residual_setting,
555
+ kwargs.pop("dropout", 0.2),
556
+ last_channel,
557
+ weights,
558
+ progress,
559
+ norm_layer=partial(BatchNorm2D, epsilon=0.001),
560
+ **kwargs,
561
+ )
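
For a quick sanity check of the EfficientNetV2-S port above, a minimal sketch that pushes a dummy batch through a randomly initialized network. The import path is an assumption that mirrors this file's location in the ppdiffusers package; adjust it to your installation.

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.efficientnet_v2_s import efficientnet_v2_s

model = efficientnet_v2_s()            # random init; num_classes defaults to 1000
model.eval()

x = paddle.randn([2, 3, 224, 224])     # dummy image batch
with paddle.no_grad():
    logits = model(x)
print(logits.shape)                    # expected: [2, 1000]
```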
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ import paddle.nn as nn
17
+
18
+ from .efficientnet_v2_s import efficientnet_v2_s
19
+
20
+
21
+ class BatchNorm2D(nn.Layer):
22
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True):
23
+ super(BatchNorm2D, self).__init__()
24
+ self.num_features = num_features
25
+ self.eps = eps
26
+ self.momentum = momentum
27
+ self.affine = affine
28
+ self.track_running_stats = track_running_stats
29
+
30
+ if self.affine:
31
+ self.weight = self.create_parameter(
32
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0)
33
+ )
34
+ self.bias = self.create_parameter(
35
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0)
36
+ )
37
+ else:
38
+ self.weight = None
39
+ self.bias = None
40
+
41
+ if self.track_running_stats:
42
+ self._mean = self.create_parameter(
43
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0), is_bias=False
44
+ )
45
+ self._variance = self.create_parameter(
46
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0), is_bias=False
47
+ )
48
+ self._mean.stop_gradient = True
49
+ self._variance.stop_gradient = True
50
+ else:
51
+ self._mean = None
52
+ self._variance = None
53
+
54
+ def forward(self, input):
55
+ mean = self._mean
56
+ variance = self._variance
57
+
58
+ output = (input - paddle.unsqueeze(mean, axis=[0, 2, 3])) / paddle.unsqueeze(
59
+ paddle.sqrt(variance + self.eps), axis=[0, 2, 3]
60
+ )
61
+ if self.affine:
62
+ output = output * paddle.unsqueeze(self.weight, axis=[0, 2, 3]) + paddle.unsqueeze(
63
+ self.bias, axis=[0, 2, 3]
64
+ )
65
+ return output
66
+
67
+
68
+ class EfficientNetEncoder(nn.Layer):
69
+ def __init__(self, c_latent=16):
70
+ super().__init__()
71
+ self.backbone = efficientnet_v2_s().features
72
+ self.backbone.eval()
73
+ self.mapper = nn.Sequential(
74
+ nn.Conv2D(1280, c_latent, kernel_size=1, bias_attr=False),
75
+ BatchNorm2D(c_latent, affine=False),
76
+ )
77
+ self.mapper.eval()
78
+
79
+ def forward(self, x):
80
+
81
+ x = self.backbone(x)
82
+ x = self.mapper(x)
83
+ return x
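
`EfficientNetEncoder` above wraps the EfficientNetV2-S feature extractor (1280 output channels, 32x total downsampling) and projects it to a `c_latent`-channel latent with a frozen, affine-free normalization. A minimal usage sketch, again assuming the import path implied by this file's location:

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.effnet import EfficientNetEncoder

encoder = EfficientNetEncoder(c_latent=16)
encoder.eval()

x = paddle.randn([1, 3, 768, 768])     # dummy image batch
with paddle.no_grad():
    latent = encoder(x)
print(latent.shape)                    # 32x downsampling -> [1, 16, 24, 24]
```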
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+
17
+
18
+ class Previewer(paddle.nn.Layer):
19
+ def __init__(self, c_in=16, c_hidden=512, c_out=3):
20
+ super().__init__()
21
+ self.blocks = paddle.nn.Sequential(
22
+ paddle.nn.Conv2D(in_channels=c_in, out_channels=c_hidden, kernel_size=1),
23
+ paddle.nn.GELU(),
24
+ paddle.nn.BatchNorm2D(num_features=c_hidden),
25
+ paddle.nn.Conv2D(in_channels=c_hidden, out_channels=c_hidden, kernel_size=3, padding=1),
26
+ paddle.nn.GELU(),
27
+ paddle.nn.BatchNorm2D(num_features=c_hidden),
28
+ paddle.nn.Conv2DTranspose(
29
+ in_channels=c_hidden,
30
+ out_channels=c_hidden // 2,
31
+ kernel_size=2,
32
+ stride=2,
33
+ ),
34
+ paddle.nn.GELU(),
35
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 2),
36
+ paddle.nn.Conv2D(
37
+ in_channels=c_hidden // 2,
38
+ out_channels=c_hidden // 2,
39
+ kernel_size=3,
40
+ padding=1,
41
+ ),
42
+ paddle.nn.GELU(),
43
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 2),
44
+ paddle.nn.Conv2DTranspose(
45
+ in_channels=c_hidden // 2,
46
+ out_channels=c_hidden // 4,
47
+ kernel_size=2,
48
+ stride=2,
49
+ ),
50
+ paddle.nn.GELU(),
51
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
52
+ paddle.nn.Conv2D(
53
+ in_channels=c_hidden // 4,
54
+ out_channels=c_hidden // 4,
55
+ kernel_size=3,
56
+ padding=1,
57
+ ),
58
+ paddle.nn.GELU(),
59
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
60
+ paddle.nn.Conv2DTranspose(
61
+ in_channels=c_hidden // 4,
62
+ out_channels=c_hidden // 4,
63
+ kernel_size=2,
64
+ stride=2,
65
+ ),
66
+ paddle.nn.GELU(),
67
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
68
+ paddle.nn.Conv2D(
69
+ in_channels=c_hidden // 4,
70
+ out_channels=c_hidden // 4,
71
+ kernel_size=3,
72
+ padding=1,
73
+ ),
74
+ paddle.nn.GELU(),
75
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
76
+ paddle.nn.Conv2D(in_channels=c_hidden // 4, out_channels=c_out, kernel_size=1),
77
+ )
78
+
79
+ def forward(self, x):
80
+ return self.blocks(x)
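
`Previewer` decodes a 16-channel latent into an RGB preview through three stride-2 transposed convolutions, i.e. 8x spatial upsampling. A minimal sketch under the same assumed import path:

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.previewer import Previewer

previewer = Previewer(c_in=16, c_hidden=512, c_out=3)
previewer.eval()

latent = paddle.randn([1, 16, 24, 24])
with paddle.no_grad():
    preview = previewer(latent)
print(preview.shape)                   # three stride-2 transposed convs -> [1, 3, 192, 192]
```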
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py ADDED
@@ -0,0 +1,206 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ from torchtools.nn import VectorQuantize
17
+
18
+
19
+ class ResBlock(paddle.nn.Layer):
20
+ def __init__(self, c, c_hidden):
21
+ super().__init__()
22
+ self.norm1 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06)
23
+ self.depthwise = paddle.nn.Sequential(
24
+ paddle.nn.Pad2D(padding=1, mode="replicate"),
25
+ paddle.nn.Conv2D(in_channels=c, out_channels=c, kernel_size=3, groups=c),
26
+ )
27
+ self.norm2 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06)
28
+ self.channelwise = paddle.nn.Sequential(
29
+ paddle.nn.Linear(in_features=c, out_features=c_hidden),
30
+ paddle.nn.GELU(),
31
+ paddle.nn.Linear(in_features=c_hidden, out_features=c),
32
+ )
33
+ out_19 = paddle.create_parameter(
34
+ shape=paddle.zeros(shape=[6]).shape,
35
+ dtype=paddle.zeros(shape=[6]).numpy().dtype,
36
+ default_initializer=paddle.nn.initializer.Assign(paddle.zeros(shape=[6])),
37
+ )
38
+ out_19.stop_gradient = False  # keep the gammas trainable
39
+ self.gammas = out_19
40
+
41
+ def _basic_init(module):
42
+ if isinstance(module, paddle.nn.Linear) or isinstance(module, paddle.nn.Conv2D):
43
+ init_XavierUniform = paddle.nn.initializer.XavierUniform()
44
+ init_XavierUniform(module.weight)
45
+ if module.bias is not None:
46
+ init_Constant = paddle.nn.initializer.Constant(value=0)
47
+ init_Constant(module.bias)
48
+
49
+ self.apply(_basic_init)
50
+
51
+ def _norm(self, x, norm):
52
+ return norm(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
53
+
54
+ def forward(self, x):
55
+ mods = self.gammas
56
+ x_temp = self._norm(x, self.norm1) * (1 + mods[0]) + mods[1]
57
+ x = x + self.depthwise(x_temp) * mods[2]
58
+ x_temp = self._norm(x, self.norm2) * (1 + mods[3]) + mods[4]
59
+ x = x + self.channelwise(x_temp.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) * mods[5]
60
+ return x
61
+
62
+
63
+ class StageA(paddle.nn.Layer):
64
+ def __init__(
65
+ self,
66
+ levels=2,
67
+ bottleneck_blocks=12,
68
+ c_hidden=384,
69
+ c_latent=4,
70
+ codebook_size=8192,
71
+ scale_factor=0.43,
72
+ ):
73
+ super().__init__()
74
+ self.c_latent = c_latent
75
+ self.scale_factor = scale_factor
76
+ c_levels = [(c_hidden // 2**i) for i in reversed(range(levels))]
77
+ self.in_block = paddle.nn.Sequential(
78
+ paddle.nn.PixelUnshuffle(downscale_factor=2),
79
+ paddle.nn.Conv2D(in_channels=3 * 4, out_channels=c_levels[0], kernel_size=1),
80
+ )
81
+ down_blocks = []
82
+ for i in range(levels):
83
+ if i > 0:
84
+ down_blocks.append(
85
+ paddle.nn.Conv2D(
86
+ in_channels=c_levels[i - 1],
87
+ out_channels=c_levels[i],
88
+ kernel_size=4,
89
+ stride=2,
90
+ padding=1,
91
+ )
92
+ )
93
+ block = ResBlock(c_levels[i], c_levels[i] * 4)
94
+ down_blocks.append(block)
95
+ down_blocks.append(
96
+ paddle.nn.Sequential(
97
+ paddle.nn.Conv2D(
98
+ in_channels=c_levels[-1],
99
+ out_channels=c_latent,
100
+ kernel_size=1,
101
+ bias_attr=False,
102
+ ),
103
+ paddle.nn.BatchNorm2D(num_features=c_latent),
104
+ )
105
+ )
106
+ self.down_blocks = paddle.nn.Sequential(*down_blocks)
107
+ self.down_blocks[0]  # no-op; indexing the Sequential here has no effect
108
+ self.codebook_size = codebook_size
109
+ self.vquantizer = VectorQuantize(c_latent, k=codebook_size)
110
+ up_blocks = [
111
+ paddle.nn.Sequential(paddle.nn.Conv2D(in_channels=c_latent, out_channels=c_levels[-1], kernel_size=1))
112
+ ]
113
+ for i in range(levels):
114
+ for j in range(bottleneck_blocks if i == 0 else 1):
115
+ block = ResBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4)
116
+ up_blocks.append(block)
117
+ if i < levels - 1:
118
+ up_blocks.append(
119
+ paddle.nn.Conv2DTranspose(
120
+ in_channels=c_levels[levels - 1 - i],
121
+ out_channels=c_levels[levels - 2 - i],
122
+ kernel_size=4,
123
+ stride=2,
124
+ padding=1,
125
+ )
126
+ )
127
+ self.up_blocks = paddle.nn.Sequential(*up_blocks)
128
+ self.out_block = paddle.nn.Sequential(
129
+ paddle.nn.Conv2D(in_channels=c_levels[0], out_channels=3 * 4, kernel_size=1),
130
+ paddle.nn.PixelShuffle(upscale_factor=2),
131
+ )
132
+
133
+ def encode(self, x, quantize=False):
134
+ x = self.in_block(x)
135
+ x = self.down_blocks(x)
136
+ if quantize:
137
+ qe, (vq_loss, commit_loss), indices = self.vquantizer.forward(x, dim=1)
138
+ return (
139
+ qe / self.scale_factor,
140
+ x / self.scale_factor,
141
+ indices,
142
+ vq_loss + commit_loss * 0.25,
143
+ )
144
+ else:
145
+ return x / self.scale_factor, None, None, None
146
+
147
+ def decode(self, x):
148
+ x = x * self.scale_factor
149
+ x = self.up_blocks(x)
150
+ x = self.out_block(x)
151
+ return x
152
+
153
+ def forward(self, x, quantize=False):
154
+ qe, x, _, vq_loss = self.encode(x, quantize)
155
+ x = self.decode(qe)
156
+ return x, vq_loss
157
+
158
+
159
+ class Discriminator(paddle.nn.Layer):
160
+ def __init__(self, c_in=3, c_cond=0, c_hidden=512, depth=6):
161
+ super().__init__()
162
+ d = max(depth - 3, 3)
163
+ layers = [
164
+ paddle.nn.utils.spectral_norm(
165
+ layer=paddle.nn.Conv2D(
166
+ in_channels=c_in,
167
+ out_channels=c_hidden // 2**d,
168
+ kernel_size=3,
169
+ stride=2,
170
+ padding=1,
171
+ )
172
+ ),
173
+ paddle.nn.LeakyReLU(negative_slope=0.2),
174
+ ]
175
+ for i in range(depth - 1):
176
+ c_in = c_hidden // 2 ** max(d - i, 0)
177
+ c_out = c_hidden // 2 ** max(d - 1 - i, 0)
178
+ layers.append(
179
+ paddle.nn.utils.spectral_norm(
180
+ layer=paddle.nn.Conv2D(
181
+ in_channels=c_in,
182
+ out_channels=c_out,
183
+ kernel_size=3,
184
+ stride=2,
185
+ padding=1,
186
+ )
187
+ )
188
+ )
189
+ layers.append(paddle.nn.InstanceNorm2D(num_features=c_out, momentum=1 - 0.1))
190
+ layers.append(paddle.nn.LeakyReLU(negative_slope=0.2))
191
+ self.encoder = paddle.nn.Sequential(*layers)
192
+ self.shuffle = paddle.nn.Conv2D(
193
+ in_channels=c_hidden + c_cond if c_cond > 0 else c_hidden,
194
+ out_channels=1,
195
+ kernel_size=1,
196
+ )
197
+ self.logits = paddle.nn.Sigmoid()
198
+
199
+ def forward(self, x, cond=None):
200
+ x = self.encoder(x)
201
+ if cond is not None:
202
+ cond = cond.reshape([cond.shape[0], cond.shape[1], 1, 1]).expand(shape=[-1, -1, x.shape[-2], x.shape[-1]])
203
+ x = paddle.concat(x=[x, cond], axis=1)
204
+ x = self.shuffle(x)
205
+ x = self.logits(x)
206
+ return x
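
`StageA` is a small VQ autoencoder: `PixelUnshuffle(2)` plus one stride-2 convolution give 4x spatial compression into a `c_latent=4` latent, and `decode` inverts it. A minimal round-trip sketch with `quantize=False`, assuming the import path implied by this file's location and that `torchtools` is installed (the module needs it for `VectorQuantize` at construction time):

```python
import paddle

# Assumed import path, mirroring this file's location in the ppdiffusers package.
from ppdiffusers.models.stable_cascade.modules.stage_a import StageA

vae = StageA()                         # levels=2 -> 4x downsampling, c_latent=4
vae.eval()

img = paddle.randn([1, 3, 256, 256])
with paddle.no_grad():
    latent, _, _, _ = vae.encode(img, quantize=False)   # [1, 4, 64, 64]
    recon = vae.decode(latent)                          # [1, 3, 256, 256]
print(latent.shape, recon.shape)
```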