diff --git a/VLM2Vec/evaluation/__init__.py b/VLM2Vec/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VLM2Vec/evaluation/eval_blip.py b/VLM2Vec/evaluation/eval_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..419051125a33f870c0df122aed7bd8f76962e1c7 --- /dev/null +++ b/VLM2Vec/evaluation/eval_blip.py @@ -0,0 +1,209 @@ +# https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_feature_extraction.ipynb +# https://medium.com/@enrico.randellini/image-and-text-features-extraction-with-blip-and-blip-2-how-to-build-a-multimodal-search-engine-a4ceabf51fbe +from src.arguments import ModelArguments, DataArguments, TrainingArguments +from transformers import HfArgumentParser, AutoProcessor +from src.dataset import EvalDataset +from evaluation.collator import EvalCollator, BLIP2Collator +from torch.utils.data import DataLoader +import torch +from tqdm import tqdm +import numpy as np +import pickle +import os +from datasets import load_dataset +from evaluation.eval_utils import get_pred, save_results, print_results +from lavis.models import load_model_and_preprocess + +t2i_tasks = [ + "EDIS", "MSCOCO_t2i","VisDial","VisualNews_t2i","WebQA", "Wiki-SS-NQ", # retrieval + ] +i2t_tasks = [ + "MSCOCO_i2t","VisualNews_i2t", # retrieval + "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211" # classification + ] + + +def get_pred_blip(qry_t, tgt_t, mode="multimodal2text"): + + if mode == "multimodal2text": + # Compute the dot product between each token in qry_t (shape 32, dim) and tgt_t (shape candidate_num, dim) + # This results in a (32, candidate_num) array of scores + scores = np.dot(qry_t, tgt_t.T) # (32, dim) dot (candidate_num, dim).T -> (32, candidate_num) + + # Find the maximum score for each candidate across the 32 tokens + max_scores = np.max(scores, axis=0) # Max along the 32 tokens for each candidate (shape candidate_num) + + # The prediction is the index of the target with the highest maximum score + pred = np.argmax(max_scores) + + elif mode == "text2multimodal": + # Compute the dot product between qry_t (shape dim) and each of the 32 tokens in the target (candidate_num, 32, dim) + # This results in a (candidate_num, 32) array of scores + scores = np.dot(tgt_t, qry_t) # (candidate_num, 32, dim) dot (dim) -> (candidate_num, 32) + + # Find the maximum score for each candidate across the 32 tokens + max_scores = np.max(scores, axis=1) # Max along the 32 tokens for each candidate (shape candidate_num) + + # The prediction is the index of the target with the highest maximum score + pred = np.argmax(max_scores) + + return max_scores, pred + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + model, vis_processors, txt_processors = load_model_and_preprocess(name=model_args.model_name, model_type=model_args.model_type, is_eval=True, device=training_args.device) + embedding_type = data_args.embedding_type + eval_collator = BLIP2Collator( + data_args=data_args, + vis_processors=vis_processors, + txt_processors=txt_processors + ) + + # ToDo: This part of code is a little bit hacky. Need to refactor later. 
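+    # Evaluation flow for each MMEB subset (descriptive sketch of the loop below):
+    #   1) encode every query and every target once with BLIP-2's extract_features and cache
+    #      the embeddings to the {subset}_qry / {subset}_tgt pickle files;
+    #   2) reload the caches, look up each query and its candidate targets, and score them
+    #      (candidate index 0 is treated as the ground truth when computing accuracy).
+    # For a multimodal query, get_pred_blip above keeps, for each candidate, the maximum dot
+    # product over the 32 Q-Former query tokens; illustrative NumPy sketch only:
+    #   scores = np.dot(qry_t, tgt_t.T).max(axis=0)   # (num_candidates,)
+    #   pred = int(np.argmax(scores))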
+ for idx, subset in enumerate(data_args.subset_name): + print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m") + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path): + continue + + eval_qry_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="qry_text", + img_path_field="qry_img_path", + ) + eval_tgt_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="tgt_text", + img_path_field="tgt_img_path", + ) + + eval_qry_loader = DataLoader( + eval_qry_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + eval_tgt_loader = DataLoader( + eval_tgt_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_qry_loader, desc="Encode query"): + samples, modes = batch + for sample, mode in zip(samples, modes): + image_features, text_features = None, None + if sample["image"] is not None: + sample["image"] = sample["image"].to(training_args.device) + image_features = model.extract_features(sample, mode="image").image_embeds[0,0,:] # (dim,) + if sample["text_input"]: + text_features = model.extract_features(sample, mode="text").text_embeds[0,0,:] # (dim,) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = text_features + if subset in i2t_tasks: + features = image_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + features = image_features + text_features + encoded_tensor.append(features.cpu().detach().float().numpy()) + with open(encode_qry_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_tgt_loader, desc="Encode target"): + samples, modes = batch + for sample, mode in zip(samples, modes): + image_features, text_features = None, None + if sample["image"] is not None: + sample["image"] = sample["image"].to(training_args.device) + image_features = model.extract_features(sample, mode="image").image_embeds[0,0,:] # (dim,) + if sample["text_input"]: + text_features = model.extract_features(sample, mode="text").text_embeds[0,0,:] # (dim,) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = image_features + if subset in i2t_tasks: + features = text_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + features = image_features + text_features + encoded_tensor.append(features.cpu().detach().float().numpy()) + with open(encode_tgt_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f) + + results = {} + for subset in tqdm(data_args.subset_name, desc="calculate score"): + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + with open(encode_qry_path, 'rb') as f: + qry_tensor, qry_index = pickle.load(f) + with 
open(encode_tgt_path, 'rb') as f: + tgt_tensor, tgt_index = pickle.load(f) + qry_dict, tgt_dict = {}, {} + for qry_t, tt in zip(qry_tensor, qry_index): + text, img_path = tt["text"], tt["img_path"] + qry_dict[(text, img_path)] = qry_t + for tgt_t, tt in zip(tgt_tensor, tgt_index): + text, img_path = tt["text"], tt["img_path"] + tgt_dict[(text, img_path)] = tgt_t + + eval_data = load_dataset( + data_args.dataset_name, + subset, + split=data_args.dataset_split, + ) + acc = 0 + all_pred = [] + for row in eval_data: + qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])] # (dim,) + tgt_t, all_candidates = [], [] + if row["tgt_text"] == "": + row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))] + for tt in zip(row["tgt_text"], row["tgt_img_path"]): + tgt_t.append(tgt_dict[tt]) + all_candidates.append(tt) + try: + tgt_t = np.stack(tgt_t, axis=0) # (num_candidate, dim) + except: + import ipdb; ipdb.set_trace() + scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize) + if pred == 0: + acc += 1 + all_pred.append(all_candidates[pred]) + with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f: + for item in all_pred: + f.write(f"{item}\n") + accuracy = acc / len(eval_data) * 100 + results[subset] = accuracy + print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m") + save_results(results, model_args, data_args, training_args) + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/VLM2Vec/evaluation/eval_clip.py b/VLM2Vec/evaluation/eval_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..2251a13754dc2a72b9ed682dee4034e4143f093c --- /dev/null +++ b/VLM2Vec/evaluation/eval_clip.py @@ -0,0 +1,185 @@ +from src.arguments import ModelArguments, DataArguments, TrainingArguments +from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel +from src.dataset import EvalDataset +from src.collator import CLIPCollator +from torch.utils.data import DataLoader +import torch +from tqdm import tqdm +import numpy as np +import pickle +import os +from datasets import load_dataset +from evaluation.eval_utils import get_pred, save_results, print_results + +t2i_tasks = [ + "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i","VisDial","VisualNews_t2i","WebQA", "Wiki-SS-NQ", "OVEN", # retrieval + ] +i2t_tasks = [ + "MSCOCO_i2t","VisualNews_i2t", # retrieval + "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211" # classification + ] + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + model = CLIPModel.from_pretrained(model_args.model_name) + processor = AutoProcessor.from_pretrained(model_args.model_name) + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name) + + embedding_type = data_args.embedding_type + eval_collator = CLIPCollator( + data_args=data_args, + vis_processors=processor, + txt_processors=tokenizer + ) + model.eval() + model = model.to(training_args.device) + + # ToDo: This part of code is a little bit hacky. Need to refactor later. 
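+    # Note (comment only): get_pred is imported from evaluation.eval_utils, which is not shown
+    # here. From the call sites below it takes a (dim,) query embedding and a
+    # (num_candidates, dim) target matrix and returns (scores, pred), where pred is the argmax
+    # of the similarities; with normalization enabled this is presumably cosine similarity,
+    # roughly (an assumption for illustration, not the actual eval_utils implementation):
+    #   q = qry_t / np.linalg.norm(qry_t)
+    #   T = tgt_t / np.linalg.norm(tgt_t, axis=1, keepdims=True)
+    #   scores = T @ q; pred = int(np.argmax(scores))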
+ for idx, subset in enumerate(data_args.subset_name): + print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m") + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path): + continue + + eval_qry_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="qry_text", + img_path_field="qry_img_path", + ) + eval_tgt_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="tgt_text", + img_path_field="tgt_img_path", + ) + + eval_qry_loader = DataLoader( + eval_qry_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + eval_tgt_loader = DataLoader( + eval_tgt_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_qry_loader, desc="Encode query"): + batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list} + image_features, text_features = None, None + if "pixel_values" in batch: + image_features = model.get_image_features(batch["pixel_values"]) + if "input_ids" in batch: + text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"]) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = text_features + if subset in i2t_tasks: + features = image_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + try: + features = image_features + text_features + except: + import ipdb; ipdb.set_trace() + encoded_tensor.append(features.cpu().detach().float().numpy()) + encoded_tensor = np.concatenate(encoded_tensor) + with open(encode_qry_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_tgt_loader, desc="Encode target"): + batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list} + image_features, text_features = None, None + if "pixel_values" in batch: + image_features = model.get_image_features(batch["pixel_values"]) + if "input_ids" in batch: + text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"]) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = image_features + if subset in i2t_tasks: + features = text_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + features = image_features + text_features + encoded_tensor.append(features.cpu().detach().float().numpy()) + encoded_tensor = np.concatenate(encoded_tensor) + with open(encode_tgt_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f) + results = {} + for subset in tqdm(data_args.subset_name, desc="calculate score"): + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + with open(encode_qry_path, 'rb') as f: + qry_tensor, qry_index = 
pickle.load(f) + with open(encode_tgt_path, 'rb') as f: + tgt_tensor, tgt_index = pickle.load(f) + qry_dict, tgt_dict = {}, {} + for qry_t, tt in zip(qry_tensor, qry_index): + text, img_path = tt["text"], tt["img_path"] + qry_dict[(text, img_path)] = qry_t + for tgt_t, tt in zip(tgt_tensor, tgt_index): + text, img_path = tt["text"], tt["img_path"] + tgt_dict[(text, img_path)] = tgt_t + + eval_data = load_dataset( + data_args.dataset_name, + subset, + split=data_args.dataset_split, + ) + acc = 0 + all_pred = [] + for row in eval_data: + qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])] # (dim,) + tgt_t, all_candidates = [], [] + if row["tgt_text"] == "": + row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))] + for tt in zip(row["tgt_text"], row["tgt_img_path"]): + tgt_t.append(tgt_dict[tt]) + all_candidates.append(tt) + try: + tgt_t = np.stack(tgt_t, axis=0) # (num_candidate, dim) + except: + import ipdb; ipdb.set_trace() + scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize) + if pred == 0: + acc += 1 + all_pred.append(all_candidates[pred]) + with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f: + for item in all_pred: + f.write(f"{item}\n") + accuracy = acc / len(eval_data) * 100 + results[subset] = accuracy + print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m") + save_results(results, model_args, data_args, training_args) + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/VLM2Vec/evaluation/eval_openclip.py b/VLM2Vec/evaluation/eval_openclip.py new file mode 100644 index 0000000000000000000000000000000000000000..34ab52d78f84ba4adc1e684146d586f2e82a2597 --- /dev/null +++ b/VLM2Vec/evaluation/eval_openclip.py @@ -0,0 +1,185 @@ +import open_clip +from src.arguments import ModelArguments, DataArguments, TrainingArguments +from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel +from src.dataset import EvalDataset +from src.collator import EvalCollator, BLIP2Collator, CLIPCollator, OpenCLIPCollator +from torch.utils.data import DataLoader +import torch +from tqdm import tqdm +import numpy as np +import pickle +import os +from datasets import load_dataset +from evaluation.eval_utils import get_pred, save_results, print_results + +t2i_tasks = [ + "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i","VisDial","VisualNews_t2i","WebQA", "Wiki-SS-NQ", "OVEN", # retrieval + ] +i2t_tasks = [ + "MSCOCO_i2t","VisualNews_i2t", # retrieval + "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211" # classification + ] + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + model, processor = open_clip.create_model_from_pretrained('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K') + tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K') + + embedding_type = data_args.embedding_type + eval_collator = OpenCLIPCollator( + data_args=data_args, + vis_processors=processor, + txt_processors=tokenizer + ) + model.eval() + model = model.to(training_args.device) + + # ToDo: This part of code is a little bit hacky. Need to refactor later. 
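+    # Note: the OpenCLIP checkpoint above is hard-coded (hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K),
+    # so model_args.model_name is not used to load the model here; open_clip's encode_text also
+    # consumes token ids only, which is why no attention_mask is passed below.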
+ for idx, subset in enumerate(data_args.subset_name): + print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m") + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path): + continue + + eval_qry_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="qry_text", + img_path_field="qry_img_path", + ) + eval_tgt_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="tgt_text", + img_path_field="tgt_img_path", + ) + + eval_qry_loader = DataLoader( + eval_qry_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + eval_tgt_loader = DataLoader( + eval_tgt_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_qry_loader, desc="Encode query"): + batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list} + image_features, text_features = None, None + if "pixel_values" in batch: + image_features = model.encode_image(batch["pixel_values"]) + if "input_ids" in batch: + text_features = model.encode_text(batch["input_ids"]) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = text_features + if subset in i2t_tasks: + features = image_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + try: + features = image_features + text_features + except: + import ipdb; ipdb.set_trace() + encoded_tensor.append(features.cpu().detach().float().numpy()) + encoded_tensor = np.concatenate(encoded_tensor) + with open(encode_qry_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_tgt_loader, desc="Encode target"): + batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list} + image_features, text_features = None, None + if "pixel_values" in batch: + image_features = model.encode_image(batch["pixel_values"]) + if "input_ids" in batch: + text_features = model.encode_text(batch["input_ids"]) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = image_features + if subset in i2t_tasks: + features = text_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + features = image_features + text_features + encoded_tensor.append(features.cpu().detach().float().numpy()) + encoded_tensor = np.concatenate(encoded_tensor) + with open(encode_tgt_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f) + results = {} + for subset in tqdm(data_args.subset_name, desc="calculate score"): + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + with open(encode_qry_path, 'rb') as f: + qry_tensor, qry_index = pickle.load(f) + with open(encode_tgt_path, 'rb') as f: + tgt_tensor, tgt_index 
= pickle.load(f) + qry_dict, tgt_dict = {}, {} + for qry_t, tt in zip(qry_tensor, qry_index): + text, img_path = tt["text"], tt["img_path"] + qry_dict[(text, img_path)] = qry_t + for tgt_t, tt in zip(tgt_tensor, tgt_index): + text, img_path = tt["text"], tt["img_path"] + tgt_dict[(text, img_path)] = tgt_t + + eval_data = load_dataset( + data_args.dataset_name, + subset, + split=data_args.dataset_split, + ) + acc = 0 + all_pred = [] + for row in eval_data: + qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])] # (dim,) + tgt_t, all_candidates = [], [] + if row["tgt_text"] == "": + row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))] + for tt in zip(row["tgt_text"], row["tgt_img_path"]): + tgt_t.append(tgt_dict[tt]) + all_candidates.append(tt) + try: + tgt_t = np.stack(tgt_t, axis=0) # (num_candidate, dim) + except: + import ipdb; ipdb.set_trace() + scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize) + if pred == 0: + acc += 1 + all_pred.append(all_candidates[pred]) + with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f: + for item in all_pred: + f.write(f"{item}\n") + accuracy = acc / len(eval_data) * 100 + results[subset] = accuracy + print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m") + save_results(results, model_args, data_args, training_args) + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/VLM2Vec/evaluation/eval_siglip.py b/VLM2Vec/evaluation/eval_siglip.py new file mode 100644 index 0000000000000000000000000000000000000000..49c1687426e29f790258945ab366d493ae39a1e8 --- /dev/null +++ b/VLM2Vec/evaluation/eval_siglip.py @@ -0,0 +1,186 @@ +from src.arguments import ModelArguments, DataArguments, TrainingArguments +from transformers import HfArgumentParser, AutoProcessor, AutoTokenizer, CLIPModel, AutoModel +from src.dataset import EvalDataset +from src.collator import EvalCollator, BLIP2Collator, CLIPCollator +from torch.utils.data import DataLoader +import torch +from tqdm import tqdm +import numpy as np +import pickle +import os +from datasets import load_dataset +from evaluation.eval_utils import get_pred, save_results, print_results + +t2i_tasks = [ + "CIRR", "NIGHTS", "EDIS", "MSCOCO_t2i","VisDial","VisualNews_t2i","WebQA", "Wiki-SS-NQ", "OVEN", # retrieval + ] +i2t_tasks = [ + "MSCOCO_i2t","VisualNews_i2t", # retrieval + "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211" # classification + ] + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384") + all_processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384") + processor = all_processor.image_processor + tokenizer = all_processor.tokenizer + + embedding_type = data_args.embedding_type + eval_collator = CLIPCollator( + data_args=data_args, + vis_processors=processor, + txt_processors=tokenizer + ) + model.eval() + model = model.to(training_args.device) + + # ToDo: This part of code is a little bit hacky. Need to refactor later. 
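+    # Note: the SigLIP checkpoint above is hard-coded (google/siglip-so400m-patch14-384); the
+    # generic CLIPCollator is reused by passing the processor's image_processor and tokenizer
+    # components separately.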
+ for idx, subset in enumerate(data_args.subset_name): + print(f"\033[91m{idx+1}/{len(data_args.subset_name)}: Processing {subset} now!\033[0m") + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + if os.path.exists(encode_qry_path) and os.path.exists(encode_tgt_path): + continue + + eval_qry_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="qry_text", + img_path_field="qry_img_path", + ) + eval_tgt_dataset = EvalDataset( + data_args=data_args, + subset=subset, + text_field="tgt_text", + img_path_field="tgt_img_path", + ) + + eval_qry_loader = DataLoader( + eval_qry_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + eval_tgt_loader = DataLoader( + eval_tgt_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=eval_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_qry_loader, desc="Encode query"): + batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list} + image_features, text_features = None, None + if "pixel_values" in batch: + image_features = model.get_image_features(batch["pixel_values"]) + if "input_ids" in batch: + text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"]) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = text_features + if subset in i2t_tasks: + features = image_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + try: + features = image_features + text_features + except: + import ipdb; ipdb.set_trace() + encoded_tensor.append(features.cpu().detach().float().numpy()) + encoded_tensor = np.concatenate(encoded_tensor) + with open(encode_qry_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_qry_dataset.paired_data), f) + + encoded_tensor = [] + with torch.no_grad(): + for batch in tqdm(eval_tgt_loader, desc="Encode target"): + batch = {key: value.to(training_args.device) for key, value in batch.items() if type(value) is not list} + image_features, text_features = None, None + if "pixel_values" in batch: + image_features = model.get_image_features(batch["pixel_values"]) + if "input_ids" in batch: + text_features = model.get_text_features(batch["input_ids"], batch["attention_mask"]) + if embedding_type=="unimodal": + if subset in t2i_tasks: + features = image_features + if subset in i2t_tasks: + features = text_features + elif embedding_type=="multimodal": + if image_features is None: + features = text_features + elif text_features is None: + features = image_features + else: + features = image_features + text_features + encoded_tensor.append(features.cpu().detach().float().numpy()) + encoded_tensor = np.concatenate(encoded_tensor) + with open(encode_tgt_path, 'wb') as f: + pickle.dump((encoded_tensor, eval_tgt_dataset.paired_data), f) + results = {} + for subset in tqdm(data_args.subset_name, desc="calculate score"): + encode_qry_path = os.path.join(data_args.encode_output_path, f"{subset}_qry") + encode_tgt_path = os.path.join(data_args.encode_output_path, f"{subset}_tgt") + with open(encode_qry_path, 'rb') as f: + qry_tensor, qry_index = 
pickle.load(f) + with open(encode_tgt_path, 'rb') as f: + tgt_tensor, tgt_index = pickle.load(f) + qry_dict, tgt_dict = {}, {} + for qry_t, tt in zip(qry_tensor, qry_index): + text, img_path = tt["text"], tt["img_path"] + qry_dict[(text, img_path)] = qry_t + for tgt_t, tt in zip(tgt_tensor, tgt_index): + text, img_path = tt["text"], tt["img_path"] + tgt_dict[(text, img_path)] = tgt_t + + eval_data = load_dataset( + data_args.dataset_name, + subset, + split=data_args.dataset_split, + ) + acc = 0 + all_pred = [] + for row in eval_data: + qry_t = qry_dict[(row["qry_text"], row["qry_img_path"])] # (dim,) + tgt_t, all_candidates = [], [] + if row["tgt_text"] == "": + row["tgt_text"] = ["" for _ in range(len(row["tgt_img_path"]))] + for tt in zip(row["tgt_text"], row["tgt_img_path"]): + tgt_t.append(tgt_dict[tt]) + all_candidates.append(tt) + try: + tgt_t = np.stack(tgt_t, axis=0) # (num_candidate, dim) + except: + import ipdb; ipdb.set_trace() + scores, pred = get_pred(qry_t, tgt_t, normalization=model_args.normalize) + if pred == 0: + acc += 1 + all_pred.append(all_candidates[pred]) + with open(os.path.join(data_args.encode_output_path, f"{subset}_pred.txt"), "w") as f: + for item in all_pred: + f.write(f"{item}\n") + accuracy = acc / len(eval_data) * 100 + results[subset] = accuracy + print(f"\033[91m{subset} accuracy: {acc/len(eval_data)}\033[0m") + save_results(results, model_args, data_args, training_args) + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/VLM2Vec/src/dist_utils.py b/VLM2Vec/src/dist_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..be6655edff2680523f91bc581fa67ce334ece96d --- /dev/null +++ b/VLM2Vec/src/dist_utils.py @@ -0,0 +1,92 @@ +# Code adapted from SimCSE (https://github.com/princeton-nlp/SimCSE) governed by MIT license. + +# Copyright (c) 2023, Salesforce, Inc. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +import torch +import torch.distributed as dist + +class GatherLayer(torch.autograd.Function): + """ + Gather tensors from all process, supporting backward propagation. 
+ https://github.com/Spijkervet/SimCLR/blob/master/simclr/modules/gather.py + """ + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + output = [torch.zeros_like(input) for _ in range(dist.get_world_size())] + dist.all_gather(output, input) + return tuple(output) + + @staticmethod + def backward(ctx, *grads): + (input,) = ctx.saved_tensors + grad_out = torch.zeros_like(input) + grad_out[:] = grads[dist.get_rank()] + return grad_out + + +def dist_gather(x: torch.tensor): + if not dist.is_initialized(): return x + if len(x.shape) == 0: + x = x.reshape(1) + x_gather = GatherLayer.apply(x) + x_gather = torch.cat(x_gather, dim=0) + return x_gather + + +@torch.no_grad() +def dist_gather_nograd(x: torch.tensor): + if not dist.is_initialized(): return x + x_gather = [torch.ones_like(x) for _ in range(get_world_size())] + dist.all_gather(x_gather, x, async_op=False) + x_gather = torch.cat(x_gather, dim=0) + return x_gather + + +def get_rank(): + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def is_main(): + return get_rank() == 0 + + +def get_world_size(): + if not dist.is_initialized(): + return 1 + else: + return dist.get_world_size() + +def barrier(): + if dist.is_initialized(): + dist.barrier() + + +@torch.no_grad() +def varsize_gather_nograd(x: torch.Tensor): + """gather tensors of different sizes along the first dimension""" + if not dist.is_initialized(): + return x + + # determine max size + size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int) + allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())] + dist.all_gather(allsizes, size) + max_size = max([size.cpu().max() for size in allsizes]) + + padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device) + padded[: x.shape[0]] = x + output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())] + dist.all_gather(output, padded) + + output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)] + output = torch.cat(output, dim=0) + + return output diff --git a/VLMEvalKit_old/PaddleMIX/deploy/README.md b/VLMEvalKit_old/PaddleMIX/deploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ba0a3ce4475827b5475e4384fb9d1e1364575d72 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/deploy/README.md @@ -0,0 +1,110 @@ +# PaddleMIX推理部署 + +[[English](README_en.md)] + +PaddleMIX基于Paddle Inference,提供了python的部署方案。部署方式分为两种: +- 通过 **APPflow** ,设置static_mode = True 变量开启静态图推理,同时可配合trt加速推理;该方式部分模型不支持静态图以及trt,具体模型可参考[跨模态多场景应用](../applications/README.md/#跨模态多场景应用); + +- 单模型部署 + + +## 1.APPflow部署 + +在使用 PaddleMIX 一键预测 **APPflow** 时,可通过设置 static_mode = True 变量开启静态图推理,同时可配合trt加速推理。 + +### 1.1 示例 + +```python +>>> from paddlemix.appflow import Appflow +>>> from PIL import Image + +>>> task = Appflow(app="openset_det_sam", + models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"], + static_mode=True, + precision="fp32") +>>> image_pil = Image.open("beauty.png").convert("RGB") +>>> result = task(image=image_pil,prompt="women") +``` + +### 1.2 参数说明 +| 参数 | 是否必须| 含义 | +|-------|-------|---------------------------------------------------------------------------------------------| +| --app | Yes| 应用名称 | +| --models | Yes | 需要使用的模型,可以是单个模型,也可以多个组合 | +| --static_mode | Option | 是否静态图推理,默认False | +| --precision | Option | 当 static_mode == True 时使用,默认fp32,可选择trt_fp32、trt_fp16 | + +说明: +- 部分模型不支持静态图以及trt,具体可参考[跨模态多场景应用](../applications/README.md) +- 生成的静态图将在模型名字对应的文件夹下 
如:GroundingDino/groundingdino-swint-ogc/ + + +## 2. 单模型预测部署 + +Python端预测部署主要包含两个步骤: +- 导出预测模型 +- 基于Python进行预测 + +当前支持模型: +- [blip2](./blip2/README.md) +- [groundingdino](./groundingdino/README.md) +- [sam](./sam/README.md) +- [qwen_vl](./qwen_vl/README.md) + +以 groundingdino 为例子。 + +### 2.1 导出预测模型 + +```bash +cd deploy/groundingdino +# 导出groundingdino模型 +python export.py \ +--dino_type GroundingDino/groundingdino-swint-ogc +``` +导出后目录下,包括 `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`等文件。 + +### 2.2 基于python的预测 + +```bash + python predict.py \ + --text_encoder_type GroundingDino/groundingdino-swint-ogc \ + --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \ + --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \ + --output_dir ./groundingdino_predict_output \ + --prompt "bus" + +``` + +## 3. 推理 BenchMark + +> Note: +> 测试环境为: +Paddle 3.0, +PaddleMIX release/2.0 +PaddleNLP2.7.2 +A100 80G单卡。 + +### 3.1 benchmark命令 + +在 `deploy` 对应模型目录下的运行后加 --benchmark, +如 GroundingDino 的benchmark命令为: + +```bash + cd deploy/groundingdino + python predict.py \ + --text_encoder_type GroundingDino/groundingdino-swint-ogc \ + --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \ + --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \ + --output_dir ./groundingdino_predict_output \ + --prompt "bus" \ + --benchmark True +``` + +# A100性能数据 +|模型|图片分辨率|数据类型 |Paddle Deploy | +|-|-|-|-| +|qwen-vl-7b|448*448|fp16|669.8 ms| +|llava-1.5-7b|336*336|fp16|981.2 ms| +|llava-1.6-7b|336*336|fp16|778.7 ms| +|groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms| +|Sam/SamVitH-1024|1024*1024|fp32|121 ms| diff --git a/VLMEvalKit_old/PaddleMIX/deploy/README_en.md b/VLMEvalKit_old/PaddleMIX/deploy/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..9614e9d1fde3d08969de93bbd7773b2f933ead80 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/deploy/README_en.md @@ -0,0 +1,108 @@ +# PaddleMIX Inference Deployment + +[[中文文档](README.md)] + +PaddleMIX utilizes Paddle Inference and provides a Python-based deployment solution. There are two deployment methods: + +1. **APPflow Deployment**: + - By setting the `static_mode = True` variable in APPflow, you can enable static graph inference. Additionally, you can accelerate inference using TensorRT. Note that not all models support static graph or TensorRT. Please refer to the [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario) section for specific model support. + +2. **Single Model Deployment**: + +For APPflow usage, you can set the `static_mode = True` variable to enable static graph inference and optionally accelerate inference using TensorRT. + +### 1.1 Exmaples + +```python +>>> from paddlemix.appflow import Appflow +>>> from PIL import Image + +>>> task = Appflow(app="openset_det_sam", + models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"], + static_mode=True, + precision="fp32") +>>> image_pil = Image.open("beauty.png").convert("RGB") +>>> result = task(image=image_pil,prompt="women") +``` + +### 1.2 Parameter Explanation +| Parameter | Required? | Meaning | +|-------|-------|---------------------------------------------------------------------------------------------| +| --app | Yes| Application name | +| --models | Yes | Model(s) used. 
Can be one model, or multiple models | +| --static_mode | Optional | Whether to use static graph inference, default to False | +| --precision | Optional | When `static_mode == True`, it defaults to using FP32. You can optionally select `trt_fp32` or `trt_fp16`. | + +Instructions: +- Some models do not support static graph or TensorRT. For specific information, please refer to [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario). + +- The generated static graph will be located in the folder corresponding to the model name, for example: `GroundingDino/groundingdino-swint-ogc/`. + +## 2. Single Model Prediction Deployment + +Python-based prediction deployment mainly involves two steps: +- Exporting the predictive model +- Performing prediction using Python + +Currently supported models: +- [blip2](./blip2/README.md) +- [groundingdino](./groundingdino/README.md) +- [sam](./sam/README.md) +- [qwen_vl](./qwen_vl/README.md) + +Using groundingdino as an exmaple. + +### 2.1 Exporting Predictive Model + +```bash +cd deploy/groundingdino +# 导出groundingdino模型 +python export.py \ +--dino_type GroundingDino/groundingdino-swint-ogc +``` +Will be exported to the following directory, including `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`and other files. + +### 2.2 Python-based Inference + +```bash + python predict.py \ + --text_encoder_type GroundingDino/groundingdino-swint-ogc \ + --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \ + --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \ + --output_dir ./groundingdino_predict_output \ + --prompt "bus" + +``` + +## 3. BenchMark + +> Note: +> environment +Paddle 3.0 +PaddleMIX release/2.0 +PaddleNLP 2.7.2 +A100 80G。 + +### 3.1 benchmark cmd + +Add -- benchmark after running in the 'deploy' corresponding model directory to obtain the running time of the model. +example: GroundingDino benchmark: + +```bash + cd deploy/groundingdino + python predict.py \ + --text_encoder_type GroundingDino/groundingdino-swint-ogc \ + --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \ + --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \ + --output_dir ./groundingdino_predict_output \ + --prompt "bus" \ + --benchmark True +``` + +|Model|image size|dtype |Paddle Deploy | +|-|-|-|-| +|qwen-vl-7b|448*448|fp16|669.8 ms| +|llava-1.5-7b|336*336|fp16|981.2 ms| +|llava-1.6-7b|336*336|fp16|778.7 ms| +|groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms| +|Sam/SamVitH-1024|1024*1024|fp32|121 ms| \ No newline at end of file diff --git a/VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md b/VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..f1ead4f987a5f327353398eb627c4d199af52054 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/docs/CHANGELOG.md @@ -0,0 +1,44 @@ +# 版本更新信息 + +## 最新版本信息 + +### 2.0(07/26/2024) + +#### 多模态理解 + +1. 新增模型:LLaVA: v1.5-7b, v1.5-13b, v1,6-7b,CogAgent, CogVLM, Qwen-VL, InternLM-XComposer2 +2. 数据集增强:新增chatml_dataset图文对话数据读取方案,可自定义chat_template文件适配,支持混合数据集 +3. 工具链升级:新增Auto模块,统一SFT训练流程,兼容全参数、lora训练。新增mixtoken训练策略,SFT吞吐量提升5.6倍。支持Qwen-VL,LLaVA推理部署,较torch推理性能提升2.38倍 + +#### 多模态生成 + +1. 视频生成能力:支持Sora相关技术,支持DiT、SiT、UViT训练推理,新增NaViT、MAGVIT-v2模型; 新增视频生成模型SVD、Open Sora,支持模型微调和推理; 新增姿态可控视频生成模型AnimateAnyone、即插即用视频生成模型AnimateDiff、GIF视频生成模型Hotshot-XL; +2. 文生图模型库:新增高速推理文图生成模型LCM,适配SD/SDXL训练和推理; +3. 
工具链升级:发布ppdiffusers 0.24.1版本,新增peft,accelerate后端; 权重加载/保存全面升级,支持分布式、模型切片、safetensors等场景。 +4. 生态兼容:提供基于ppdiffusers开发的ComfyUI插件,支持了常见的模型加载转换、文生图、图生图、图像局部修改等任务。新增Stable Diffusion 1.5系列节点;新增Stable Diffusion XL系列节点。新增4个图像生成的workflow案例。 + +#### DataCopilot(多模态数据处理工具箱) + +1. 多模态数据集类型MMDataset,支持加载和导出Json、H5、Jsonl等多种数据存储格式,内置并发(map, filter)数据处理接口等 +2. 多模态数据格式工具,支持自定义数据结构,数据转换,离线格式检查 +3. 多模态数据分析工具,支持基本的统计信息,数据可视化功能,以及注册自定义功能 + +### 1.0(11/15/2023) + +#### 核心能力 + +1. 大规模预训练: BLIP-2支持数据并行、sharding、模型并行,流水线并行训练;支持千亿参数规模训练; EVA-CLIP支持数据并行、sharding、模型并行训练; Stable Diffusion支持数据并行、sharding、BF16 O2训练; CLIP,Coca支持数据并行训练 +2. 有监督精调: Stable Diffusion,SDXL 支持LoRA精调 +3. 推理部署: 支持BLIP-2,miniGPT-4,Grounding DINO, SAM,Stable Diffusion动转静导出部署 + +#### 前沿模型 +1. 新增CLIP系列跨模态大模型:CLIP,EVA-CLIP,Coca +2. 新增图生文跨模态大模型:BLIP-2,miniGPT-4,VisualGLM +3. 新增跨模态视觉模型:Grounding DINO, SAM +4. 新增融合更多模态大模型:ImageBind +5. 新增文生图模型:SDXL,支持Text2Image、Img2Img、Inpainting、InstructPix2Pix等任务,支持DreamBooth Lora训练; 新增UniDiffuser,通过统一的多模态扩散过程支持文生图、图生文等任务; 新增文本条件视频生成模型LVDM,支持训练与推理; 新增文图生成模型Kandinsky 2.2,Consistency models; Controlnet升级,支持ControlNetImg2Img、ControlNetInpaint、 StableDiffusionXLControlNet等。 + +#### 特色应用 +1. 新增跨模态大模型应用流水线AppFlow +2. 新增基于chat的图像编辑应用 +3. 新增自动标注应用 diff --git a/VLMEvalKit_old/PaddleMIX/docs/FAQ.md b/VLMEvalKit_old/PaddleMIX/docs/FAQ.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md b/VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..baac7761a64a29334f02b255f971c23710fd3782 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/docs/train_tutorial.md @@ -0,0 +1,10 @@ +# Train Tutorial + + +## 训练微调示例 +- [Blip2](../paddlemix/examples/blip2/README.md) +- [clip](../paddlemix/examples/clip/README.md) +- [coca](../paddlemix/examples/coca/README.md) +- [eva02](../paddlemix/examples/eva02/README.md) +- [evaclip](../paddlemix/examples/evaclip/README.md) +- [Stable Diffusion](../ppdiffusers/examples/text_to_image/README.md) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE b/VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..962fee016f4e1b4fcaa5565ad3373a49cad04141 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/LICENSE @@ -0,0 +1,203 @@ + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile b/VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..160de104e005b97faba8f766a87ed20e013578a9 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/Makefile @@ -0,0 +1,30 @@ + +.DEFAULT_GOAL := all + +.PHONY: all +all: deploy-version build deploy + +.PHONY: build +build: + python3 setup.py sdist bdist_wheel + +.PHONY: deploy +deploy: + make deploy-version + twine upload --skip-existing dist/* + +.PHONY: deploy-version +deploy-version: + echo "VERSION = '$$(cat VERSION)'" > ppdiffusers/version.py + +.PHONY: install +install: + pip install -r requirements.txt + +.PHONY: version +version: + @newVersion=$$(awk -F. '{print $$1"."$$2"."$$3+1}' < VERSION) \ + && echo $${newVersion} > VERSION \ + && git add VERSION \ + && git commit -m "🔥 update version to $${newVersion}" > /dev/null \ + && echo "Bumped version to $${newVersion}" \ No newline at end of file diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md b/VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md new file mode 100644 index 0000000000000000000000000000000000000000..609b2a390800b41558edcf4b14d7a1692d6cc118 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/README.md @@ -0,0 +1,1278 @@ +
+<p align="center">特性 | 安装 | 快速开始 | 模型部署</p>
+
+ +# PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle + +**PPDiffusers**是一款支持多种模态(如文本图像跨模态、图像、语音)扩散模型(Diffusion Model)训练和推理的国产化工具箱,依托于[**PaddlePaddle**](https://www.paddlepaddle.org.cn/)框架和[**PaddleNLP**](https://github.com/PaddlePaddle/PaddleNLP)自然语言处理开发库。 + +## News 📢 +* 🔥 **2024.10.18 发布 0.29.0 版本,新增图像生成模型[Stable Diffusion 3 (SD3)](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/text_to_image/README_sd3.md),支持DreamBooth训练及高性能推理;SD3、SDXL适配昇腾910B,提供国产计算芯片上的训推能力;DIT支持[高性能推理](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/class_conditional_image_generation/DiT/README.md#23-paddle-inference-%E9%AB%98%E6%80%A7%E8%83%BD%E6%8E%A8%E7%90%86);支持PaddleNLP 3.0 beta版本。** + +* 🔥 **2024.07.15 发布 0.24.1 版本,新增[Open-Sora](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/Open-Sora),支持模型训练和推理;全面支持Paddle 3.0。** + +* 🔥 **2024.04.17 发布 0.24.0 版本,支持[Sora相关技术](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/sora),支持[DiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT)、[SiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT#exploring-flow-and-diffusion-based-generative-models-with-scalable-interpolant-transformers-sit)、[UViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_mscoco_uvit)训练推理,新增[NaViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/navit)、[MAGVIT-v2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/video_tokenizer/magvit2)模型; +视频生成能力全面升级; +新增视频生成模型[SVD](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_video_diffusion),支持模型微调和推理; +新增姿态可控视频生成模型[AnimateAnyone](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/AnimateAnyone)、即插即用视频生成模型[AnimateDiff](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/inference/text_to_video_generation_animediff.py)、GIF视频生成模型[Hotshot-XL](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/community/Hotshot-XL); +新增高速推理文图生成模型[LCM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/consistency_distillation),支持SD/SDXL训练和推理; +[模型推理部署](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)全面升级;新增peft,accelerate后端; +权重加载/保存全面升级,支持分布式、模型切片、safetensors等场景,相关能力已集成DiT、 [IP-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/ip_adapter)、[PhotoMaker](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/PhotoMaker)、[InstantID](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/InstantID)等。** +* 🔥 **2023.12.12 发布 0.19.4 版本,修复已知的部分 BUG,修复 0D Tensor 的 Warning,新增 SDXL 的 FastdeployPipeline。** +* 🔥 **2023.09.27 发布 0.19.3 版本,新增[SDXL](#文本图像多模),支持Text2Image、Img2Img、Inpainting、InstructPix2Pix等任务,支持DreamBooth Lora训练; +新增[UniDiffuser](#文本图像多模),通过统一的多模态扩散过程支持文生图、图生文等任务; +新增文本条件视频生成模型[LVDM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_video_lvdm),支持训练与推理; +新增文图生成模型[Kandinsky 2.2](#文本图像多模),[Consistency models](#文本图像多模); +Stable Diffusion支持[BF16 O2训练](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_diffusion),效果对齐FP32; +[LoRA加载升级](#加载HF-LoRA权重),支持加载SDXL的LoRA权重; 
+[Controlnet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/ppdiffusers/pipelines/controlnet)升级,支持ControlNetImg2Img、ControlNetInpaint、StableDiffusionXLControlNet等。** + + + + +## 特性 +#### 📦 SOTA扩散模型Pipelines集合 +我们提供**SOTA(State-of-the-Art)** 的扩散模型Pipelines集合。 +目前**PPDiffusers**已经集成了**100+Pipelines**,支持文图生成(Text-to-Image Generation)、文本引导的图像编辑(Text-Guided Image Inpainting)、文本引导的图像变换(Image-to-Image Text-Guided Generation)、文本条件的视频生成(Text-to-Video Generation)、超分(Super Superresolution)、文本条件的音频生成(Text-to-Audio Generation)在内的**10余项**任务,覆盖**文本、图像、视频、音频**等多种模态。 +如果想要了解当前支持的所有**Pipelines**以及对应的来源信息,可以阅读[🔥 PPDiffusers Pipelines](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/README.md)文档。 + + +#### 🔊 提供丰富的Noise Scheduler +我们提供了丰富的**噪声调度器(Noise Scheduler)**,可以对**速度**与**质量**进行权衡,用户可在推理时根据需求快速切换使用。 +当前**PPDiffusers**已经集成了**14+Scheduler**,不仅支持 [DDPM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py)、[DDIM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py) 和 [PNDM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py),还支持最新的 [🔥 DPMSolver](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py)! + +#### 🎛️ 提供多种扩散模型组件 +我们提供了**多种扩散模型**组件,如[UNet1DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_1d.py)、[UNet2DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d.py)、[UNet2DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d_condition.py)、[UNet3DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_3d_condition.py)、[VQModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)、[AutoencoderKL](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)等。 + + +#### 📖 提供丰富的训练和推理教程 +我们提供了丰富的训练教程,不仅支持扩散模型的二次开发微调,如基于[Textual Inversion](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/textual_inversion)和[DreamBooth](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/dreambooth)使用3-5张图定制化训练生成图像的风格或物体,还支持[🔥 Latent Diffusion Model](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_laion400m)、[🔥 ControlNet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/controlnet)、[🔥 T2I-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/t2i-adapter) 等扩散模型的训练! 
+此外,我们还提供了丰富的[🔥 Pipelines推理样例](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/inference)。 + +#### 🚀 支持FastDeploy高性能部署 +我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)的[🔥 高性能Stable Diffusion Pipeline](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py),更多有关FastDeploy进行多推理引擎后端高性能部署的信息请参考[🔥 高性能FastDeploy推理教程](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)。 + +## 安装 + +### 环境依赖 +``` +pip install -r requirements.txt +``` +关于PaddlePaddle安装的详细教程请查看[Installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)。 + +### pip安装 + +```shell +pip install --upgrade ppdiffusers +``` + +### 手动安装 +```shell +git clone https://github.com/PaddlePaddle/PaddleMIX +cd PaddleMIX/ppdiffusers +python setup.py install +``` +### 设置代理 +```shell +export HF_HUB_ENABLE_HF_TRANSFER=1 +export HF_ENDPOINT=https://hf-mirror.com +``` + +## 快速开始 +我们将以扩散模型的典型代表**Stable Diffusion**为例,带你快速了解PPDiffusers。 + +**Stable Diffusion**基于**潜在扩散模型(Latent Diffusion Models)**,专门用于**文图生成(Text-to-Image Generation)任务**。该模型是由来自 [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/)以及[RunwayML](https://runwayml.com/)的工程师共同开发完成,目前发布了v1和v2两个版本。v1版本采用了LAION-5B数据集子集(分辨率为 512x512)进行训练,并具有以下架构设置:自动编码器下采样因子为8,UNet大小为860M,文本编码器为CLIP ViT-L/14。v2版本相较于v1版本在生成图像的质量和分辨率等进行了改善。 + +### Stable Diffusion重点模型权重 + +
  Stable Diffusion 模型支持的权重(英文) + +**我们只需要将下面的"xxxx",替换成所需的权重名,即可快速使用!** +```python +from ppdiffusers import * + +pipe_text2img = StableDiffusionPipeline.from_pretrained("xxxx") +pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained("xxxx") +pipe_inpaint_legacy = StableDiffusionInpaintPipelineLegacy.from_pretrained("xxxx") +pipe_mega = StableDiffusionMegaPipeline.from_pretrained("xxxx") + +# pipe_mega.text2img() 等于 pipe_text2img() +# pipe_mega.img2img() 等于 pipe_img2img() +# pipe_mega.inpaint_legacy() 等于 pipe_inpaint_legacy() +``` + +| PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 | +| :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: | +| CompVis/stable-diffusion-v1-4 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-4 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **225k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/CompVis/stable-diffusion-v1-4) | +| CompVis/ldm-text2im-large-256 | LDMTextToImagePipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-KL-8-G* 权重。| [地址](https://huggingface.co/CompVis/ldm-text2im-large-256) | +| CompVis/ldm-super-resolution-4x-openimages | LDMSuperResolutionPipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-VQ-4 权重,[原始权重链接](https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip)。| [地址](https://huggingface.co/CompVis/ldm-super-resolution-4x-openimages) | +| runwayml/stable-diffusion-v1-5 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-5 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **595k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型同样也使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/runwayml/stable-diffusion-v1-5) | +| runwayml/stable-diffusion-inpainting | StableDiffusionInpaintPipeline | Stable-Diffusion-Inpainting 使用 Stable-Diffusion-v1-2 的权重进行初始化。首先进行了 **595k** 步的常规训练(实际也就是 Stable-Diffusion-v1-5 的权重),然后进行了 **440k** 步的 inpainting 修复训练。对于 inpainting 修复训练,给 UNet 额外增加了 **5** 输入通道(其中 **4** 个用于被 Mask 遮盖住的图片,**1** 个用于 Mask 本身)。在训练期间,会随机生成 Mask,并有 **25%** 概率会将原始图片全部 Mask 掉。| [地址](https://huggingface.co/runwayml/stable-diffusion-inpainting) | +| stabilityai/stable-diffusion-2-base | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型首先在 [LAION-5B 256x256 子集上](https://laion.ai/blog/laion-5b/) (过滤条件:[punsafe = 0.1 的 LAION-NSFW 分类器](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) 和 审美分数大于等于 4.5 )从头开始训练 **550k** 步,然后又在分辨率 **>= 512x512** 的同一数据集上进一步训练 **850k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-base) | +| stabilityai/stable-diffusion-2 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | stable-diffusion-2 使用 
stable-diffusion-2-base 权重进行初始化,首先在同一数据集上(**512x512** 分辨率)使用 [v-objective](https://arxiv.org/abs/2202.00512) 训练了 **150k** 步。然后又在 **768x768** 分辨率上使用 [v-objective](https://arxiv.org/abs/2202.00512) 继续训练了 **140k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2) | +| stabilityai/stable-diffusion-2-inpainting | StableDiffusionInpaintPipeline |stable-diffusion-2-inpainting 使用 stable-diffusion-2-base 权重初始化,并且额外训练了 **200k** 步。训练过程使用了 [LAMA](https://github.com/saic-mdal/lama) 中提出的 Mask 生成策略,并且使用 Mask 图片的 Latent 表示(经过 VAE 编码)作为附加条件。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) | +| stabilityai/stable-diffusion-x4-upscaler | StableDiffusionUpscalePipeline | 该模型在**LAION 10M** 子集上(>2048x2048)训练了 1.25M 步。该模型还在分辨率为 **512x512** 的图像上使用 [Text-guided Latent Upscaling Diffusion Model](https://arxiv.org/abs/2112.10752) 进行了训练。除了**文本输入**之外,它还接收 **noise_level** 作为输入参数,因此我们可以使用 [预定义的 Scheduler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/low_res_scheduler/scheduler_config.json) 向低分辨率的输入图片添加噪声。| [地址](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) | +| hakurei/waifu-diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-2 使用 stable-diffusion-v1-4 权重初始化,并且在**高质量动漫**图像数据集上进行微调后得到的模型。用于微调的数据是 **680k** 文本图像样本,这些样本是通过 **booru 网站** 下载的。| [地址](https://huggingface.co/hakurei/waifu-diffusion) | +| hakurei/waifu-diffusion-v1-3 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-3 是 waifu-diffusion-v1-2 基础上进一步训练得到的。他们对数据集进行了额外操作:(1)删除下划线;(2)删除括号;(3)用逗号分隔每个booru 标签;(4)随机化标签顺序。| [地址](https://huggingface.co/hakurei/waifu-diffusion) | +| naclbit/trinart_stable_diffusion_v2_60k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | trinart_stable_diffusion 使用 stable-diffusion-v1-4 权重初始化,在 40k **高分辨率漫画/动漫风格**的图片数据集上微调了 8 个 epoch。V2 版模型使用 **dropouts**、**10k+ 图像**和**新的标记策略**训练了**更长时间**。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) | +| naclbit/trinart_stable_diffusion_v2_95k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **95k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) | +| naclbit/trinart_stable_diffusion_v2_115k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **115k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) | +| Deltaadams/Hentai-Diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | None| [地址](https://huggingface.co/Deltaadams/Hentai-Diffusion) | +| ringhyacinth/nail-set-diffuser | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 美甲领域的扩散模型,训练数据使用了 [Weekend](https://weibo.com/u/5982308498)| 
[地址](https://huggingface.co/ringhyacinth/nail-set-diffuser) | +| Linaqruf/anything-v3.0 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型可通过输入几个文本提示词就能生成**高质量、高度详细的动漫风格图片**,该模型支持使用 **danbooru 标签文本** 生成图像。| [地址](https://huggingface.co/Linaqruf/anything-v3.0) | + +
+
  Stable Diffusion 模型支持的权重(中文和多语言) + + +| PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 | +| :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: | +| BAAI/AltDiffusion | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline | 该模型使用 [AltCLIP](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,在 Stable Diffusion 基础上训练了**双语Diffusion模型**,其中训练数据来自 [WuDao数据集](https://data.baai.ac.cn/details/WuDaoCorporaText) 和 [LAION](https://huggingface.co/datasets/ChristophSchuhmann/improved_aesthetics_6plus) 。| [地址](https://huggingface.co/BAAI/AltDiffusion) | +| BAAI/AltDiffusion-m9 | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline |该模型使用9种语言的 [AltCLIP-m9](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,其他同上。| [地址](https://huggingface.co/BAAI/AltDiffusion-m9) | +| IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 作为初始化的text encoder,冻住 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型的其他部分,只训练 text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。该模型目前在0.2亿图文对上训练了一个 epoch。 在 32 x A100 上训练了大约100小时,该版本只是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1) | +| IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型进行继续训练,其中训练分为**两个stage**。**第一个stage** 中冻住模型的其他部分,只训练 text encoder ,以便保留原始模型的生成能力且实现中文概念的对齐。**第二个stage** 中将全部模型解冻,一起训练 text encoder 和 diffusion model ,以便 diffusion model 更好的适配中文引导。第一个 stage 他们训练了 80 小时,第二个 stage 训练了 100 小时,两个stage都是用了8 x A100,该版本是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1) | +
+ + +### 加载HF Diffusers权重 +```python +from ppdiffusers import StableDiffusionPipeline +# 设置from_hf_hub为True,表示从huggingface hub下载,from_diffusers为True表示加载的是diffusers版Pytorch权重 +pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", from_hf_hub=True, from_diffusers=True) +``` + +### 加载原库的Lightning权重 +```python +from ppdiffusers import StableDiffusionPipeline +# 可输入网址 或 本地ckpt、safetensors文件 +pipe = StableDiffusionPipeline.from_single_file("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/ppdiffusers/chilloutmix_NiPrunedFp32Fix.safetensors") +``` + +### 加载HF LoRA权重 +```python +from ppdiffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16) + +pipe.load_lora_weights("stabilityai/stable-diffusion-xl-base-1.0", + weight_name="sd_xl_offset_example-lora_1.0.safetensors", + from_diffusers=True) +``` + +### 加载Civitai社区的LoRA权重 +```python +from ppdiffusers import StableDiffusionPipeline +pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix") +# 加载lora权重 +pipe.load_lora_weights("./", + weight_name="Moxin_10.safetensors", + from_diffusers=True) +pipe.fuse_lora() +``` + +### XFormers加速 +为了使用**XFormers加速**,我们需要安装`develop`版本的`paddle`,Linux系统的安装命令如下: +```sh +python -m pip install paddlepaddle-gpu==0.0.0.post117 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html +``` + +```python +import paddle +from ppdiffusers import StableDiffusionPipeline +pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix", paddle_dtype=paddle.float16) +# 开启xformers加速 默认选择"cutlass"加速 +pipe.enable_xformers_memory_efficient_attention() +# flash 需要使用 A100、A10、3060、3070、3080、3090 等以上显卡。 +# pipe.enable_xformers_memory_efficient_attention("flash") +``` + +### ToME + ControlNet +```python +# 安装develop的ppdiffusers +# pip install "ppdiffusers>=0.24.0" +import paddle +from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline +from ppdiffusers.utils import load_image + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") +pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, paddle_dtype=paddle.float16 +) + +# Apply ToMe with a 50% merging ratio +pipe.apply_tome(ratio=0.5) # Can also use pipe.unet in place of pipe here + +# 我们可以开启 xformers +# pipe.enable_xformers_memory_efficient_attention() +generator = paddle.Generator().manual_seed(0) +prompt = "bird" +image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" +) + +image = pipe(prompt, image, generator=generator).images[0] + +image.save("bird.png") +``` + +### 文图生成 (Text-to-Image Generation) + +```python +import paddle +from ppdiffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") + +# 设置随机种子,我们可以复现下面的结果! +paddle.seed(5232132133) +prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing " +image = pipe(prompt, guidance_scale=7.5, height=768, width=768).images[0] + +image.save("shiba_dog_with_a_red_cap.png") +``` +
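+
+前文「特性」部分提到,PPDiffusers 内置了多种噪声调度器(Noise Scheduler),推理时可以按需切换。下面给出一个简单示意(以 `DPMSolverMultistepScheduler` 为例,沿用上面的文生图模型,推理步数等参数仅供参考,并非唯一推荐配置):
+
+```python
+import paddle
+from ppdiffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
+# 用 DPMSolver 多步调度器替换默认调度器,通常可以用更少的推理步数得到相近的效果
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+paddle.seed(5232132133)
+prompt = "a portrait of shiba inu with a red cap growing on its head"
+image = pipe(prompt, guidance_scale=7.5, num_inference_steps=25, height=768, width=768).images[0]
+image.save("shiba_dog_with_a_red_cap_dpmsolver.png")
+```
+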
+ +### 文本引导的图像变换(Image-to-Image Text-Guided Generation) + +
 Image-to-Image Text-Guided Generation Demo + +```python +import paddle +from ppdiffusers import StableDiffusionImg2ImgPipeline +from ppdiffusers.utils import load_image + +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("Linaqruf/anything-v3.0", safety_checker=None) + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png" +image = load_image(url).resize((512, 768)) + +# 设置随机种子,我们可以复现下面的结果! +paddle.seed(42) +prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress" +negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry" + +image = pipe(prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5).images[0] +image.save("image_Kurisu_img2img.png") +``` +
+
+ +### 文本引导的图像编辑(Text-Guided Image Inpainting) + +注意!当前有两种版本的图像编辑代码,一个是Legacy版本,一个是正式版本,下面将分别介绍两种代码如何使用! + +
 Legacy版本代码 + +```python +import paddle +from ppdiffusers import StableDiffusionInpaintPipelineLegacy +from ppdiffusers.utils import load_image + +# 可选模型权重 +# CompVis/stable-diffusion-v1-4 +# runwayml/stable-diffusion-v1-5 +# stabilityai/stable-diffusion-2-base (原始策略 512x512) +# stabilityai/stable-diffusion-2 (v-objective 768x768) +# Linaqruf/anything-v3.0 +# ...... +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + +image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) + +pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("stabilityai/stable-diffusion-2-base", safety_checker=None) + +# 设置随机种子,我们可以复现下面的结果! +paddle.seed(10245) +prompt = "a red cat sitting on a bench" +image = pipe(prompt=prompt, image=image, mask_image=mask_image, strength=0.75).images[0] + +image.save("a_red_cat_legacy.png") +``` +
+ +
+ +
 正式版本代码 + +Tips: 下面的使用方法是新版本的代码,也是官方推荐的代码,注意必须配合 **runwayml/stable-diffusion-inpainting** 和 **stabilityai/stable-diffusion-2-inpainting** 才可正常使用。 +```python +import paddle +from ppdiffusers import StableDiffusionInpaintPipeline +from ppdiffusers.utils import load_image + +# 可选模型权重 +# runwayml/stable-diffusion-inpainting +# stabilityai/stable-diffusion-2-inpainting +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + +image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) + +pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting") + +# 设置随机种子,我们可以复现下面的结果! +paddle.seed(1024) +prompt = "Face of a yellow cat, high resolution, sitting on a park bench" +image = pipe(prompt=prompt, image=image, mask_image=mask_image).images[0] + +image.save("a_yellow_cat.png") +``` +
+
+ +### 文本引导的图像放大 & 超分(Text-Guided Image Upscaling & Super-Resolution) + +
 Text-Guided Image Upscaling Demo + +```python +import paddle +from ppdiffusers import StableDiffusionUpscalePipeline +from ppdiffusers.utils import load_image + +pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler") + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png" +# 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍! +low_res_img = load_image(url).resize((128, 128)) + +prompt = "a white cat" +image = pipe(prompt=prompt, image=low_res_img).images[0] + +image.save("upscaled_white_cat.png") +``` +
+
+ +
 Super-Resolution Demo + +```python +import paddle +from ppdiffusers import LDMSuperResolutionPipeline +from ppdiffusers.utils import load_image + +pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages") + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + +# 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍! +low_res_img = load_image(url).resize((128, 128)) + +image = pipe(image=low_res_img, num_inference_steps=100).images[0] + +image.save("ldm-super-resolution-image.png") +``` +
+ +
+
+## 模型推理部署
+除了**Paddle动态图**运行之外,很多模型还支持将模型导出并使用推理引擎运行。我们提供了基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)的**StableDiffusion**模型部署示例,涵盖文生图、图生图、图像编辑等任务。用户可以按照我们提供的[StableDiffusion模型导出教程](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/deploy/export.md)将模型导出,然后使用`FastDeployStableDiffusionMegaPipeline`进行高性能推理部署!
+
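+
+导出得到静态图模型后,一个最小化的加载与推理示意如下(其中 `static_model/stable-diffusion-v1-5` 为假设的导出目录,且省略了 `runtime_options` 等后端配置;完整可运行示例请参考下方的 FastDeploy Demo):
+
+```python
+from ppdiffusers import FastDeployStableDiffusionMegaPipeline
+
+# 假设该目录由上面的导出教程生成;不显式传入 runtime_options 时使用默认推理配置
+fd_pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained("static_model/stable-diffusion-v1-5")
+image = fd_pipe.text2img(prompt="a photo of an astronaut riding a horse on mars", num_inference_steps=50).images[0]
+image.save("fd_text2img.png")
+```
+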
  已预先导出的FastDeploy版Stable Diffusion权重 + +**注意:当前导出的vae encoder带有随机因素!** + +- CompVis/stable-diffusion-v1-4@fastdeploy +- runwayml/stable-diffusion-v1-5@fastdeploy +- runwayml/stable-diffusion-inpainting@fastdeploy +- stabilityai/stable-diffusion-2-base@fastdeploy +- stabilityai/stable-diffusion-2@fastdeploy +- stabilityai/stable-diffusion-2-inpainting@fastdeploy +- Linaqruf/anything-v3.0@fastdeploy +- hakurei/waifu-diffusion-v1-3@fastdeploy + +
+ +
  FastDeploy Demo + +```python +import paddle +import fastdeploy as fd +from ppdiffusers import FastDeployStableDiffusionMegaPipeline +from ppdiffusers.utils import load_image + +def create_runtime_option(device_id=0, backend="paddle", use_cuda_stream=True): + option = fd.RuntimeOption() + if backend == "paddle": + option.use_paddle_backend() + else: + option.use_ort_backend() + if device_id == -1: + option.use_cpu() + else: + option.use_gpu(device_id) + if use_cuda_stream: + paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream + option.set_external_raw_stream(paddle_stream) + return option + +runtime_options = { + "text_encoder": create_runtime_option(0, "paddle"), # use gpu:0 + "vae_encoder": create_runtime_option(0, "paddle"), # use gpu:0 + "vae_decoder": create_runtime_option(0, "paddle"), # use gpu:0 + "unet": create_runtime_option(0, "paddle"), # use gpu:0 +} + +fd_pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( + "Linaqruf/anything-v3.0@fastdeploy", runtime_options=runtime_options +) + +# text2img +prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing " +image_text2img = fd_pipe.text2img(prompt=prompt, num_inference_steps=50).images[0] +image_text2img.save("image_text2img.png") + +# img2img +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png" +image = load_image(url).resize((512, 512)) +prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress" +negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry" + +image_img2img = fd_pipe.img2img( + prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5 +).images[0] +image_img2img.save("image_img2img.png") + +# inpaint_legacy +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" +image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) +prompt = "a red cat sitting on a bench" + +image_inpaint_legacy = fd_pipe.inpaint_legacy( + prompt=prompt, image=image, mask_image=mask_image, strength=0.75, num_inference_steps=50 +).images[0] +image_inpaint_legacy.save("image_inpaint_legacy.png") +``` +
+
+ + +## 更多任务分类展示 +### 文本图像多模 + +
+ 文图生成(Text-to-Image Generation) + +#### text_to_image_generation-stable_diffusion + +```python +from ppdiffusers import StableDiffusionPipeline + +# 加载模型和scheduler +pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + +# 执行pipeline进行推理 +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] + +# 保存图片 +image.save("astronaut_rides_horse_sd.png") +``` +
+ +#### text_to_image_generation-stable_diffusion_xl + +```python +import paddle +from ppdiffusers import StableDiffusionXLPipeline + +pipe = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + paddle_dtype=paddle.float16, + variant="fp16" +) +prompt = "a photo of an astronaut riding a horse on mars" +generator = paddle.Generator().manual_seed(42) +image = pipe(prompt=prompt, generator=generator, num_inference_steps=50).images[0] +image.save('sdxl_text2image.png') +``` +
+
+#### text_to_image_generation-sdxl_base_with_refiner
+
+```python
+from ppdiffusers import DiffusionPipeline
+import paddle
+
+# load both base & refiner
+base = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    paddle_dtype=paddle.float16,
+)
+refiner = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0",
+    text_encoder_2=base.text_encoder_2,
+    vae=base.vae,
+    paddle_dtype=paddle.float16,
+    variant="fp16",
+)
+
+# Define how many steps and what % of steps to be run on each expert (80/20) here
+n_steps = 40
+high_noise_frac = 0.8
+
+# prompt = "A majestic lion jumping from a big stone at night"
+prompt = "a photo of an astronaut riding a horse on mars"
+generator = paddle.Generator().manual_seed(42)
+
+# run both experts: base 负责前 80% 的去噪步数并以 latent 形式输出,refiner 从 80% 处接力完成剩余步数
+image = base(
+    prompt=prompt,
+    num_inference_steps=n_steps,
+    denoising_end=high_noise_frac,
+    output_type="latent",
+    generator=generator,
+).images
+
+image = refiner(
+    prompt=prompt,
+    num_inference_steps=n_steps,
+    denoising_start=high_noise_frac,
+    image=image,
+    generator=generator,
+).images[0]
+image.save('text_to_image_generation-sdxl-base-with-refiner-result.png')
+```
+
+ +#### text_to_image_generation-kandinsky2_2 +```python +from ppdiffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline + +pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") +prompt = "red cat, 4k photo" +out = pipe_prior(prompt) +image_emb = out.image_embeds +zero_image_emb = out.negative_image_embeds +pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") +image = pipe( + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + height=768, + width=768, + num_inference_steps=50, +).images +image[0].save("text_to_image_generation-kandinsky2_2-result-cat.png") +``` +
+ +#### text_to_image_generation-unidiffuser +```python +import paddle +from paddlenlp.trainer import set_seed + +from ppdiffusers import UniDiffuserPipeline + +model_id_or_path = "thu-ml/unidiffuser-v1" +pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, paddle_dtype=paddle.float16) +set_seed(42) + +# Text variation can be performed with a text-to-image generation followed by a image-to-text generation: +# 1. Text-to-image generation +prompt = "an elephant under the sea" +sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0) +t2i_image = sample.images[0] +t2i_image.save("t2i_image.png") +```` +
+ +#### text_to_image_generation-deepfloyd_if + +```python +import paddle + +from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline +from ppdiffusers.utils import pd_to_pil + +# Stage 1: generate images +pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) +pipe.enable_xformers_memory_efficient_attention() +prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' +prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) +image = pipe( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + output_type="pd", +).images + +# save intermediate image +pil_image = pd_to_pil(image) +pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_I.png") +# save gpu memory +pipe.to(paddle_device="cpu") + +# Stage 2: super resolution stage1 +super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 +) +super_res_1_pipe.enable_xformers_memory_efficient_attention() + +image = super_res_1_pipe( + image=image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + output_type="pd", +).images +# save intermediate image +pil_image = pd_to_pil(image) +pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png") +# save gpu memory +super_res_1_pipe.to(paddle_device="cpu") +``` +
+(示例图略:if_stage_I 与 if_stage_II 两阶段生成结果)
+ + +
 文本引导的图像放大(Text-Guided Image Upscaling) + +#### text_guided_image_upscaling-stable_diffusion_2 + +```python +from ppdiffusers import StableDiffusionUpscalePipeline +from ppdiffusers.utils import load_image + +pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler") + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png" +low_res_img = load_image(url).resize((128, 128)) + +prompt = "a white cat" +upscaled_image = pipe(prompt=prompt, image=low_res_img).images[0] +upscaled_image.save("upsampled_cat_sd2.png") +``` +
+(图略:原图像 → 生成图像)
+
+
+ +
 文本引导的图像编辑(Text-Guided Image Inpainting) + +#### text_guided_image_inpainting-stable_diffusion_2 + +```python +import paddle + +from ppdiffusers import PaintByExamplePipeline +from ppdiffusers.utils import load_image + +img_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/image_example_1.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/mask_example_1.png" +example_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/reference_example_1.jpeg" + +init_image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) +example_image = load_image(example_url).resize((512, 512)) + +pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") + +# 使用fp16加快生成速度 +with paddle.amp.auto_cast(True): + image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0] +image.save("image_guided_image_inpainting-paint_by_example-result.png") +``` +
+(图略:原图像、掩码图像、参考图像 → 生成图像)
+
+
+ + +
 文本引导的图像变换(Image-to-Image Text-Guided Generation) + +#### text_guided_image_inpainting-kandinsky2_2 +```python +import numpy as np +import paddle + +from ppdiffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline +from ppdiffusers.utils import load_image + +pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16 +) +prompt = "a hat" +image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) +pipe = KandinskyV22InpaintPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", paddle_dtype=paddle.float16 +) +init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png" +) +mask = np.zeros((768, 768), dtype=np.float32) +mask[:250, 250:-250] = 1 +out = pipe( + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + height=768, + width=768, + num_inference_steps=50, +) +image = out.images[0] +image.save("text_guided_image_inpainting-kandinsky2_2-result-cat_with_hat.png") +``` +
+(图略:原图像 → 生成图像)
+
+ +#### image_to_image_text_guided_generation-stable_diffusion +```python +import paddle + +from ppdiffusers import StableDiffusionImg2ImgPipeline +from ppdiffusers.utils import load_image + +# 加载pipeline +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + +# 下载初始图片 +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + +init_image = load_image(url).resize((768, 512)) + +prompt = "A fantasy landscape, trending on artstation" +# 使用fp16加快生成速度 +with paddle.amp.auto_cast(True): + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] + +image.save("fantasy_landscape.png") +``` +
+(图略:原图像 → 生成图像)
+
+ +#### image_to_image_text_guided_generation-stable_diffusion_xl +```python +import paddle +from ppdiffusers import StableDiffusionXLImg2ImgPipeline +from ppdiffusers.utils import load_image + +pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-1.0", + paddle_dtype=paddle.float16, + # from_hf_hub=True, + # from_diffusers=True, + variant="fp16" +) +url = "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop-0-19-3/000000009.png" +init_image = load_image(url).convert("RGB") +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt, image=init_image).images[0] +image.save('sdxl_image2image.png') +``` +
+(图略:原图像 → 生成图像)
+
+ +#### image_to_image_text_guided_generation-kandinsky2_2 +```python +import paddle + +from ppdiffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline +from ppdiffusers.utils import load_image + +pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16 +) +prompt = "A red cartoon frog, 4k" +image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) +pipe = KandinskyV22Img2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", paddle_dtype=paddle.float16 +) + +init_image = load_image( + "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png" +) +image = pipe( + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + height=768, + width=768, + num_inference_steps=100, + strength=0.2, +).images +image[0].save("image_to_image_text_guided_generation-kandinsky2_2-result-red_frog.png") +``` +
+(图略:原图像 → 生成图像)
+
+ +
+ + +
 文本图像双引导图像生成(Dual Text and Image Guided Generation) + +#### dual_text_and_image_guided_generation-versatile_diffusion +```python +from ppdiffusers import VersatileDiffusionDualGuidedPipeline +from ppdiffusers.utils import load_image + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" +image = load_image(url) +text = "a red car in the sun" + +pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") +pipe.remove_unused_weights() + +text_to_image_strength = 0.75 +image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0] +image.save("versatile-diffusion-red_car.png") +``` +
+(图略:原图像 → 生成图像)
+
+
+ +### 文本视频多模 + +
+ 文本条件的视频生成(Text-to-Video Generation) + +#### text_to_video_generation-lvdm + +```python +import paddle + +from ppdiffusers import LVDMTextToVideoPipeline + +# 加载模型和scheduler +pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m") + +# 执行pipeline进行推理 +seed = 2013 +generator = paddle.Generator().manual_seed(seed) +samples = pipe( + prompt="cutting in kitchen", + num_frames=16, + height=256, + width=256, + num_inference_steps=50, + generator=generator, + guidance_scale=15, + eta=1, + save_dir=".", + save_name="text_to_video_generation-lvdm-result-ddim_lvdm_text_to_video_ucf", + encoder_type="2d", + scale_factor=0.18215, + shift_factor=0, +) +``` +
+ +#### text_to_video_generation-synth + +```python +import imageio + +from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline + +pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") +pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + +prompt = "An astronaut riding a horse." +video_frames = pipe(prompt, num_inference_steps=25).frames +imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8) +``` +
+ + +#### text_to_video_generation-synth with zeroscope_v2_XL + +```python +import imageio + +from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline + +# from ppdiffusers.utils import export_to_video + +pipe = TextToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL") +pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + +prompt = "An astronaut riding a horse." +video_frames = pipe(prompt, num_inference_steps=50, height=320, width=576, num_frames=24).frames +imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8) +``` +
+ +#### text_to_video_generation-zero + +```python +import imageio + +# pip install imageio[ffmpeg] +import paddle + +from ppdiffusers import TextToVideoZeroPipeline + +model_id = "runwayml/stable-diffusion-v1-5" +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) + +prompt = "A panda is playing guitar on times square" +result = pipe(prompt=prompt).images +result = [(r * 255).astype("uint8") for r in result] +imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4) +``` +
+ +
+ +### 文本音频多模 +
+ 文本条件的音频生成(Text-to-Audio Generation) + +#### text_to_audio_generation-audio_ldm + +```python +import paddle +import scipy + +from ppdiffusers import AudioLDM2Pipeline + +pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", paddle_dtype=paddle.float16) + +prompt = "Musical constellations twinkling in the night sky, forming a cosmic melody." +negative_prompt = "Low quality." +audio = pipe(prompt, negative_prompt=negative_prompt, num_inference_steps=200, audio_length_in_s=10).audios[0] + +output_path = f"{prompt}.wav" +# save the audio sample as a .wav file +scipy.io.wavfile.write(output_path, rate=16000, data=audio) +``` +
+
+ +可以使用以下代码转换[huggingface](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2)的模型,一键在paddle中使用 +```python +pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", from_hf_hub=True, from_diffusers=True).save_pretrained("cvssp/audioldm2-music") +``` +### 图像 + +
 无条件图像生成(Unconditional Image Generation) + +#### unconditional_image_generation-latent_diffusion_uncond + +```python +from ppdiffusers import LDMPipeline + +# 加载模型和scheduler +pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") + +# 执行pipeline进行推理 +image = pipe(num_inference_steps=200).images[0] + +# 保存图片 +image.save("ldm_generated_image.png") +``` +
+
+ +
超分(Super-Resolution)
+
+#### super_resolution-latent_diffusion
+```python
+import paddle
+
+from ppdiffusers import LDMSuperResolutionPipeline
+from ppdiffusers.utils import load_image
+
+# 加载pipeline
+pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
+
+# 下载初始图片
+url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+
+init_image = load_image(url).resize((128, 128))
+init_image.save("original-image.png")
+
+# 使用fp16加快生成速度
+with paddle.amp.auto_cast(True):
+    image = pipe(init_image, num_inference_steps=100, eta=1).images[0]
+
+image.save("super-resolution-image.png")
+```
+
+(图略:原图像 → 生成图像)
+
+
+ + +
 图像编辑(Image Inpainting) + +#### image_inpainting-repaint +```python +from ppdiffusers import RePaintPipeline, RePaintScheduler +from ppdiffusers.utils import load_image + +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" +mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png" + +# Load the original image and the mask as PIL images +original_image = load_image(img_url).resize((256, 256)) +mask_image = load_image(mask_url).resize((256, 256)) + +scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler") +pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler) + +output = pipe( + original_image=original_image, + mask_image=mask_image, + num_inference_steps=250, + eta=0.0, + jump_length=10, + jump_n_sample=10, +) +inpainted_image = output.images[0] + +inpainted_image.save("repaint-image.png") +``` +
+(图略:原图像、mask图像 → 生成图像)
+
+
+ + + +
 图像变化(Image Variation) + +#### image_variation-versatile_diffusion +```python +from ppdiffusers import VersatileDiffusionImageVariationPipeline +from ppdiffusers.utils import load_image + +url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" +image = load_image(url) + +pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") + +image = pipe(image).images[0] +image.save("versatile-diffusion-car_variation.png") +``` +
+(图略:原图像 → 生成图像)
+
+
+ + + + + +### 音频 +
+ 无条件音频生成(Unconditional Audio Generation) + +#### unconditional_audio_generation-audio_diffusion + +```python +from scipy.io.wavfile import write +from ppdiffusers import AudioDiffusionPipeline +import paddle + +# 加载模型和scheduler +pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") +pipe.set_progress_bar_config(disable=None) +generator = paddle.Generator().manual_seed(42) + +output = pipe(generator=generator) +audio = output.audios[0] +image = output.images[0] + +# 保存音频到本地 +for i, audio in enumerate(audio): + write(f"audio_diffusion_test{i}.wav", pipe.mel.config.sample_rate, audio.transpose()) + +# 保存图片 +image.save("audio_diffusion_test.png") +``` +
+ + +#### unconditional_audio_generation-spectrogram_diffusion + +```python +import paddle +import scipy + +from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline +from ppdiffusers.utils.download_utils import ppdiffusers_url_download + +# Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid +mid_file_path = ppdiffusers_url_download( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="." +) +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16) +processor = MidiProcessor() +output = pipe(processor(mid_file_path)) +audio = output.audios[0] + +output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav" +# save the audio sample as a .wav file +scipy.io.wavfile.write(output_path, rate=16000, data=audio) +``` +
+
+ + + +## License +PPDiffusers 遵循 [Apache-2.0开源协议](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE)。 + +Stable Diffusion 遵循 [The CreativeML OpenRAIL M 开源协议](https://huggingface.co/spaces/CompVis/stable-diffusion-license)。 +> The CreativeML OpenRAIL M is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which this license is based. + +Stable Diffusion 3遵循 [Stability Community 开源协议](https://stability.ai/license)。 +> Community License: Free for research, non-commercial, and commercial use for organisations or individuals with less than $1M annual revenue. You only need a paid Enterprise license if your yearly revenues exceed USD$1M and you use Stability AI models in commercial products or services. Read more: https://stability.ai/license + +## Acknowledge +我们借鉴了🤗 Hugging Face的[Diffusers](https://github.com/huggingface/diffusers)关于预训练扩散模型使用的优秀设计,在此对Hugging Face作者及其开源社区表示感谢。 + +## Citation + +```bibtex +@misc{ppdiffusers, + author = {PaddlePaddle Authors}, + title = {PPDiffusers: State-of-the-art diffusion model toolkit based on PaddlePaddle}, + year = {2022}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers}} +} +``` diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..05a4c0e5335b5714dc1fc6f658431d0c36ec5a34 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py @@ -0,0 +1,263 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
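+
+# 说明:本脚本将 ppdiffusers 的 StableDiffusion + ControlNet 动态图 pipeline 导出为 PaddleInfer 静态图模型,
+# 依次导出 text_encoder、unet(与 controlnet 打包为一个整体)、vae_encoder、vae_decoder,
+# 并保存为 PaddleInferStableDiffusionControlNetPipeline 可直接加载的目录结构。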
+import argparse +import os + +# set USE_PPXFORMERS=False to avoid using ppxformers +os.environ["USE_PPXFORMERS"] = "False" +from pathlib import Path +from types import MethodType + +import paddle + +from ppdiffusers import ( + ControlNetModel, + PaddleInferRuntimeModel, + PaddleInferStableDiffusionControlNetPipeline, + StableDiffusionControlNetPipeline, + UNet2DConditionModel, +) + + +class ControlNetWithUnetModel(paddle.nn.Layer): + def __init__( + self, + unet, + controlnet, + ): + super().__init__() + self.unet = unet + self.controlnet = controlnet + + def forward( + self, + sample, + timestep, + encoder_hidden_states, + controlnet_cond, + controlnet_conditioning_scale, + return_dict=True, + ): + down_block_res_samples, mid_block_res_sample = self.controlnet( + sample, + timestep, + encoder_hidden_states=encoder_hidden_states, + controlnet_cond=controlnet_cond, + conditioning_scale=controlnet_conditioning_scale, + return_dict=False, + ) + + noise_pred = self.unet( + sample, + timestep, + encoder_hidden_states=encoder_hidden_states, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + return_dict=return_dict, + ) + return noise_pred + + +def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline( + model_path: str, + controlnet_model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): + unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=False, subfolder="unet") + controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=False) + + pipeline = StableDiffusionControlNetPipeline.from_pretrained( + model_path, + unet=unet_tmp, + controlnet=controlnet_tmp, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + output_path = Path(output_path) + # calculate latent's H and W + latent_height = height // 8 if height is not None else None + latent_width = width // 8 if width is not None else None + # get arguments + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 + unet_channels = pipeline.unet.config.in_channels # 4 + vae_in_channels = pipeline.vae.config.in_channels # 3 + vae_latent_channels = pipeline.vae.config.latent_channels # 4 + print( + f"cross_attention_dim: {cross_attention_dim}\n", + f"unet_in_channels: {unet_channels}\n", + f"vae_encoder_in_channels: {vae_in_channels}\n", + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) + # 1. Convert text_encoder + text_encoder = paddle.jit.to_static( + pipeline.text_encoder, + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids + ) + save_path = os.path.join(args.output_path, "text_encoder", "inference") + paddle.jit.save(text_encoder, save_path) + print(f"Save text_encoder model in {save_path} successfully.") + del pipeline.text_encoder + + # wrap unet + controlnet + new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet) + + # 2. 
Convert unet + unet = paddle.jit.to_static( + new_unet, + input_spec=[ + paddle.static.InputSpec( + shape=[None, unet_channels, latent_height, latent_width], + dtype="float32", + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep + paddle.static.InputSpec( + shape=[None, None, cross_attention_dim], + dtype="float32", + name="encoder_hidden_states", + ), # encoder_hidden_states + paddle.static.InputSpec( + shape=[None, vae_in_channels, height, width], + dtype="float32", + name="controlnet_cond", + ), # controlnet_cond + paddle.static.InputSpec( + shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1], + dtype="float32", + name="controlnet_conditioning_scale", + ), # controlnet_conditioning_scale + ], + ) + + save_path = os.path.join(args.output_path, "unet", "inference") + paddle.jit.save(unet, save_path) + print(f"Save unet model in {save_path} successfully.") + del pipeline.unet + del new_unet + + def forward_vae_encoder_mode(self, z): + return self.encode(z, True).latent_dist.mode() + + def forward_vae_encoder_sample(self, z): + return self.encode(z, True).latent_dist.sample() + + # 3. Convert vae encoder + vae_encoder = pipeline.vae + if sample: + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) + else: + vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) + + vae_encoder = paddle.jit.to_static( + vae_encoder, + input_spec=[ + paddle.static.InputSpec( + shape=[None, vae_in_channels, height, width], + dtype="float32", + name="sample", # N, C, H, W + ), # latent + ], + ) + # Save vae_encoder in static graph model. + save_path = os.path.join(args.output_path, "vae_encoder", "inference") + paddle.jit.save(vae_encoder, save_path) + print(f"Save vae_encoder model in {save_path} successfully.") + + # 4. Convert vae encoder + vae_decoder = pipeline.vae + + def forward_vae_decoder(self, z): + return self.decode(z, True).sample + + vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) + vae_decoder = paddle.jit.to_static( + vae_decoder, + input_spec=[ + paddle.static.InputSpec( + shape=[None, vae_latent_channels, latent_height, latent_width], + dtype="float32", + name="latent_sample", + ), # latent_sample + ], + ) + # Save vae_decoder in static graph model. 
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference") + paddle.jit.save(vae_decoder, save_path) + print(f"Save vae_decoder model in {save_path} successfully.") + del pipeline.vae + + paddleinfer_pipeline = PaddleInferStableDiffusionControlNetPipeline( + vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"), + unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"), + tokenizer=pipeline.tokenizer, + scheduler=pipeline.scheduler, + safety_checker=None, + feature_extractor=None, + image_encoder=None, + requires_safety_checker=False, + ) + paddleinfer_pipeline.save_pretrained(str(output_path)) + print("PaddleInfer pipeline saved to", output_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="runwayml/stable-diffusion-v1-5", + help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + "--controlnet_pretrained_model_name_or_path", + type=str, + default="lllyasviel/sd-controlnet-canny", + help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") + parser.add_argument( + "--sample", + action="store_true", + default=False, + help="Export the vae encoder in mode or sample", + ) + parser.add_argument( + "--height", + type=int, + default=None, + help="The height of output images. Default: None", + ) + parser.add_argument( + "--width", + type=int, + default=None, + help="The width of output images. Default: None", + ) + args = parser.parse_args() + + convert_ppdiffusers_pipeline_to_paddleinfer_pipeline( + args.pretrained_model_name_or_path, + args.controlnet_pretrained_model_name_or_path, + args.output_path, + args.sample, + args.height, + args.width, + ) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh new file mode 100644 index 0000000000000000000000000000000000000000..babde7cd92a54bcb31ab4e4c89e1c7c2017e33f4 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
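+
+# 说明:以下命令分别在 raw / cutlass / flash 三种 attention 实现与 FP16 / FP32 两种精度组合下,
+# 对 ControlNet 动态图推理进行基准测试(50 步推理、512x512 分辨率、benchmark_steps=10)。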
+ +# attention raw fp16 +python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + +# attention cutlass fp16 +python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + +# attention flash fp16 +python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + + +# attention raw fp32 +python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + +# attention cutlass fp32 +python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + +# attention flash fp32 +python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh new file mode 100644 index 0000000000000000000000000000000000000000..40eb9bc45707a567eb68415727060bdf1344c5cc --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# attention raw +python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + +# attention sdp +python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + + +# attention raw fp32 +python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 + +# attention sdp fp32 +python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10 \ No newline at end of file diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..24541c8f5297b87a28c1c343f1addd9608a558e8 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ==============================================================================
+# Running the static model with the paddle backend is not fast; this script is
+# mainly used to verify that inference with the exported model is correct.
+# ==============================================================================
+# text2img
+python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name text2img
+
+# img2img
+python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name img2img
+
+# inpaint
+python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name inpaint_legacy
\ No newline at end of file
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..796b2c99ac368056563c192e6e92cf18c46ccb3e
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+# set USE_PPXFORMERS=False to avoid using ppxformers
+os.environ["USE_PPXFORMERS"] = "False"
+from pathlib import Path
+from types import MethodType
+
+import paddle
+from unet_2d_condition_housing import UNet2DConditionModelSDHousing
+
+from ppdiffusers import (
+    PaddleInferRuntimeModel,
+    PaddleInferStableDiffusionInpaintPipeline,
+    PaddleInferStableDiffusionMegaPipeline,
+    StableDiffusionPipeline,
+)
+
+
+def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
+    model_path: str,
+    output_path: str,
+    sample: bool = False,
+    height: int = None,
+    width: int = None,
+):
+    # specify unet model with unet pre_temb_act opt enabled.
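+    # The export below proceeds in four steps: trace text_encoder, unet, the vae
+    # encoder and the vae decoder to static graphs with paddle.jit.to_static,
+    # save each one with paddle.jit.save, and finally assemble a PaddleInfer
+    # pipeline from the exported parts via PaddleInferRuntimeModel.from_pretrained.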
+ unet_model = UNet2DConditionModelSDHousing.from_pretrained( + model_path, resnet_pre_temb_non_linearity=False, subfolder="unet" + ) + pipeline = StableDiffusionPipeline.from_pretrained( + model_path, + unet=unet_model, + safety_checker=None, + ) + output_path = Path(output_path) + # calculate latent's H and W + latent_height = height // 8 if height is not None else None + latent_width = width // 8 if width is not None else None + # get arguments + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 + unet_channels = pipeline.unet.config.in_channels # 4 or 9 + vae_in_channels = pipeline.vae.config.in_channels # 3 + vae_latent_channels = pipeline.vae.config.latent_channels # 4 + print( + f"cross_attention_dim: {cross_attention_dim}\n", + f"unet_in_channels: {unet_channels}\n", + f"vae_encoder_in_channels: {vae_in_channels}\n", + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) + # 1. Convert text_encoder + text_encoder = paddle.jit.to_static( + pipeline.text_encoder, + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids + ) + save_path = os.path.join(args.output_path, "text_encoder", "inference") + paddle.jit.save(text_encoder, save_path) + print(f"Save text_encoder model in {save_path} successfully.") + del pipeline.text_encoder + + # 2. Convert unet + unet = paddle.jit.to_static( + pipeline.unet, + input_spec=[ + paddle.static.InputSpec( + shape=[None, unet_channels, latent_height, latent_width], + dtype="float32", + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep + paddle.static.InputSpec( + shape=[None, None, cross_attention_dim], + dtype="float32", + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) + save_path = os.path.join(args.output_path, "unet", "inference") + paddle.jit.save(unet, save_path) + print(f"Save unet model in {save_path} successfully.") + del pipeline.unet + + def forward_vae_encoder_mode(self, z): + return self.encode(z, True).latent_dist.mode() + + def forward_vae_encoder_sample(self, z): + return self.encode(z, True).latent_dist.sample() + + # 3. Convert vae encoder + vae_encoder = pipeline.vae + if sample: + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) + else: + vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) + + vae_encoder = paddle.jit.to_static( + vae_encoder, + input_spec=[ + paddle.static.InputSpec( + shape=[None, vae_in_channels, height, width], + dtype="float32", + name="sample", # N, C, H, W + ), # latent + ], + ) + # Save vae_encoder in static graph model. + save_path = os.path.join(args.output_path, "vae_encoder", "inference") + paddle.jit.save(vae_encoder, save_path) + print(f"Save vae_encoder model in {save_path} successfully.") + + # 4. Convert vae encoder + vae_decoder = pipeline.vae + + def forward_vae_decoder(self, z): + return self.decode(z, True).sample + + vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) + vae_decoder = paddle.jit.to_static( + vae_decoder, + input_spec=[ + paddle.static.InputSpec( + shape=[None, vae_latent_channels, latent_height, latent_width], + dtype="float32", + name="latent_sample", + ), # latent_sample + ], + ) + # Save vae_decoder in static graph model. 
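+    # Note: a None entry in an InputSpec shape marks that dimension as dynamic,
+    # so exporting without --height/--width keeps the traced graphs usable for
+    # arbitrary input resolutions.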
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference") + paddle.jit.save(vae_decoder, save_path) + print(f"Save vae_decoder model in {save_path} successfully.") + del pipeline.vae + + if "inpainting" in model_path: + fd_pipe_cls = PaddleInferStableDiffusionInpaintPipeline + else: + fd_pipe_cls = PaddleInferStableDiffusionMegaPipeline + + paddleinfer_pipeline = fd_pipe_cls( + vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"), + unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"), + tokenizer=pipeline.tokenizer, + scheduler=pipeline.scheduler, + feature_extractor=pipeline.feature_extractor, + image_encoder=None, + safety_checker=None, + requires_safety_checker=False, + ) + paddleinfer_pipeline.save_pretrained(str(output_path)) + print("PaddleInfer pipeline saved to", output_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + required=True, + help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") + parser.add_argument( + "--sample", + action="store_true", + default=False, + help="Export the vae encoder in mode or sample", + ) + parser.add_argument( + "--height", + type=int, + default=None, + help="The height of output images. Default: None", + ) + parser.add_argument( + "--width", + type=int, + default=None, + help="The width of output images. Default: None", + ) + args = parser.parse_args() + + convert_ppdiffusers_pipeline_to_paddleinfer_pipeline( + args.pretrained_model_name_or_path, + args.output_path, + args.sample, + args.height, + args.width, + ) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..0ad63f98f50ed7cdd9f4f9c23476db3346fff131 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py @@ -0,0 +1,408 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
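+
+# Benchmarks the exported static-graph Stable Diffusion 1.5 pipeline with Paddle
+# Inference (optionally through TensorRT): depending on --task_name it runs
+# text2img, img2img and/or inpaint_legacy, reports mean and percentile latency,
+# and writes one sample image per task into results-<backend>/.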
+ +import argparse +import os +import time + +# isort: split +import paddle +import paddle.inference as paddle_infer + +# isort: split +import numpy as np +from paddlenlp.trainer.argparser import strtobool +from tqdm.auto import trange + +from ppdiffusers import ( # noqa + DiffusionPipeline, + PaddleInferStableDiffusionMegaPipeline, +) +from ppdiffusers.utils import load_image + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_dir", + default="runwayml/stable-diffusion-v1-5@paddleinfer", + help="The model directory of diffusion_model.", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--backend", + type=str, + default="paddle_tensorrt", + choices=["paddle", "paddle_tensorrt"], + help="The inference runtime backend of unet model and text encoder model.", + ) + parser.add_argument( + "--device", + type=str, + default="gpu", + choices=[ + "cpu", + "gpu", + "huawei_ascend_npu", + "kunlunxin_xpu", + ], + help="The inference runtime device of models.", + ) + parser.add_argument( + "--task_name", + type=str, + default="text2img", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="lpw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="preconfig-euler-ancestral", + choices=[ + "pndm", + "lms", + "euler", + "euler-ancestral", + "preconfig-euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + parser.add_argument( + "--tune", + type=strtobool, + default=False, + help="Whether to tune the shape of tensorrt engine.", + ) + + return parser.parse_args() + + +def create_paddle_inference_runtime( + model_dir="", + model_name="", + use_trt=False, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + workspace=24 * 1024 * 1024 * 1024, + tune=False, +): + config = paddle_infer.Config() + config.enable_memory_optim() + shape_file = f"{model_dir}/{model_name}/shape_range_info.pbtxt" + if tune: + config.collect_shape_range_info(shape_file) + config.switch_ir_optim(False) + else: + config.enable_new_executor() + if str(os.environ.get("FLAGS_enable_pir_in_executor")).lower() in ("true", "1"): + config.enable_new_ir() + if str(os.environ.get("FLAGS_use_cinn")).lower() in ("true", "1"): + config.enable_cinn() + + if device_id != -1: + config.use_gpu() + config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=device_id, precision_mode=precision_mode) + for pass_name in disable_paddle_pass: + config.delete_pass(pass_name) + if use_trt: + config.enable_tensorrt_engine( + workspace_size=workspace, + precision_mode=precision_mode, + max_batch_size=1, + min_subgraph_size=3, + use_static=True, + ) + config.enable_tensorrt_memory_optim() + config.enable_tuned_tensorrt_dynamic_shape(shape_file, True) + cache_file = os.path.join(model_dir, model_name, "_opt_cache/") + config.set_optim_cache_dir(cache_file) + if precision_mode != paddle_infer.PrecisionType.Half: + only_fp16_passes = [ + "trt_cross_multihead_matmul_fuse_pass", + "trt_flash_multihead_matmul_fuse_pass", + "preln_elementwise_groupnorm_act_pass", + "elementwise_groupnorm_act_pass", + ] + for curr_pass in only_fp16_passes: + config.delete_pass(curr_pass) + return config + + +def main(args): + if args.device_id == -1: + paddle.set_device("cpu") + else: + paddle.set_device(f"gpu:{args.device_id}") + + seed = 1024 + min_image_size = 512 + max_image_size = 768 + max_image_size = max(min_image_size, max_image_size) + + # 4. 
Init runtime + only_fp16_passes = [ + "trt_cross_multihead_matmul_fuse_pass", + "trt_flash_multihead_matmul_fuse_pass", + "preln_elementwise_groupnorm_act_pass", + "elementwise_groupnorm_act_pass", + ] + no_need_passes = [ + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass", + "add_support_int8_pass", + "elementwise_groupnorm_act_pass", + "groupnorm_act_pass", + "preln_elementwise_groupnorm_act_pass", + ] + paddle_delete_passes = dict( + text_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + text_encoder_2=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + vae_encoder=only_fp16_passes + [] if args.use_fp16 else [], + vae_decoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + unet=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + image_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + ) + args.use_trt = args.backend == "paddle_tensorrt" + precision_mode = paddle_infer.PrecisionType.Half if args.use_fp16 else paddle_infer.PrecisionType.Float32 + infer_configs = dict( + text_encoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + use_trt=False, + model_name="text_encoder", + precision_mode=paddle_infer.PrecisionType.Half, + device_id=args.device_id, + disable_paddle_trt_ops=["range", "lookup_table_v2"], + disable_paddle_pass=paddle_delete_passes.get("text_encoder", []), + tune=False, + ), + vae_encoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + model_name="vae_encoder", + use_trt=False, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=args.device_id, + disable_paddle_pass=paddle_delete_passes.get("vae_encoder", []), + tune=False, + ), + vae_decoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + model_name="vae_decoder", + use_trt=False, + precision_mode=paddle_infer.PrecisionType.Float32, + device_id=args.device_id, + disable_paddle_pass=paddle_delete_passes.get("vae_decoder", []), + tune=False, + ), + unet=create_paddle_inference_runtime( + model_dir=args.model_dir, + model_name="unet", + use_trt=args.use_trt, + precision_mode=precision_mode, + device_id=args.device_id, + disable_paddle_pass=no_need_passes, + tune=args.tune, + ), + ) + pipe = PaddleInferStableDiffusionMegaPipeline.from_pretrained( + args.model_dir, + infer_configs=infer_configs, + use_optim_cache=False, + ) + pipe.set_progress_bar_config(disable=False) + pipe.change_scheduler(args.scheduler) + parse_prompt_type = args.parse_prompt_type + width = args.width + height = args.height + + folder = f"results-{args.backend}" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + # text2img + prompt = "a photo of an astronaut riding a horse on mars" + time_costs = [] + # warmup + pipe.text2img( + prompt, + num_inference_steps=20, + height=height, + width=width, + # parse_prompt_type=parse_prompt_type, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe.text2img( + prompt, + output_type="pil", + num_inference_steps=args.inference_steps, + height=height, + width=width, + # parse_prompt_type=parse_prompt_type, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean 
latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/text2img.png") + + if args.task_name in ["img2img", "all"]: + # img2img + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) + init_image = load_image(img_url) + prompt = "A fantasy landscape, trending on artstation" + time_costs = [] + # warmup + pipe.img2img( + prompt, + image=init_image, + num_inference_steps=20, + height=height, + width=width, + strength=args.strength, + # parse_prompt_type=parse_prompt_type, + ) + print("==> Test img2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe.img2img( + prompt, + image=init_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + strength=args.strength, + # parse_prompt_type=parse_prompt_type, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/img2img.png") + + if args.task_name in ["inpaint", "inpaint_legacy", "all"]: + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) + mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + init_image = load_image(img_url) + mask_image = load_image(mask_url) + prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + time_costs = [] + # warmup + if args.task_name in ["inpaint_legacy", "all"]: + call_fn = pipe.inpaint_legacy + task_name = "inpaint_legacy" + else: + call_fn = pipe.inpaint + task_name = "inpaint" + call_fn( + prompt, + image=init_image, + mask_image=mask_image, + num_inference_steps=20, + height=height, + width=width, + strength=args.strength, + parse_prompt_type=parse_prompt_type, + ) + print(f"==> Test {task_name} performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = call_fn( + prompt, + image=init_image, + mask_image=mask_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + strength=args.strength, + parse_prompt_type=parse_prompt_type, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+ ) + + images[0].save(f"{folder}/{task_name}.png") + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py new file mode 100644 index 0000000000000000000000000000000000000000..06ffde0f7ddd1b75c3ada2a5f62c8e6165ae9056 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py @@ -0,0 +1,357 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time +import warnings + +import cv2 +import numpy as np +import paddle +from PIL import Image +from tqdm.auto import trange + +from ppdiffusers import ( + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionPipeline, + UniPCMultistepScheduler, +) +from ppdiffusers.utils import load_image + + +def get_canny_image(image, args): + if isinstance(image, Image.Image): + image = np.array(image) + image = cv2.Canny(image, args.low_threshold, args.high_threshold) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + canny_image = Image.fromarray(image) + return canny_image + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." 
+ ) + + +def change_scheduler(self, scheduler_type="ddim"): + self.orginal_scheduler_config = self.scheduler.config + scheduler_type = scheduler_type.lower() + if scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + return scheduler + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="runwayml/stable-diffusion-v1-5", + help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--task_name", + type=str, + default="all", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="raw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument( + "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type." + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="euler-ancestral", + choices=[ + "pndm", + "lms", + "euler", + "euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + return parser.parse_args() + + +def main(args): + + seed = 1024 + paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 + pipe = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + paddle_dtype=paddle_dtype, + ) + scheduler = change_scheduler(pipe, args.scheduler) + pipe.scheduler = scheduler + + if args.attention_type == "all": + args.attention_type = ["raw", "cutlass", "flash"] + else: + args.attention_type = [args.attention_type] + + for attention_type in args.attention_type: + if attention_type == "raw": + pipe.disable_xformers_memory_efficient_attention() + else: + try: + pipe.enable_xformers_memory_efficient_attention(attention_type) + except Exception as e: + if attention_type == "flash": + warnings.warn( + "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc." + ) + continue + else: + raise ValueError(e) + + if not args.use_fp16 and attention_type == "flash": + print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!") + continue + + width = args.width + height = args.height + pipe.set_progress_bar_config(disable=False) + + folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + init_image = load_image( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" + ) + # text2img + prompt = "bird" + time_costs = [] + # warmup + pipe( + prompt, + num_inference_steps=10, + height=height, + width=width, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe( + prompt, + num_inference_steps=args.inference_steps, + height=height, + width=width, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+ ) + images[0].save(f"{folder}/text2img.png") + + if args.task_name in ["img2img", "all"]: + pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components) + pipe_img2img.set_progress_bar_config(disable=False) + img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + init_image = load_image(img_url).resize((width, height)) + prompt = "A fantasy landscape, trending on artstation" + time_costs = [] + # warmup + pipe_img2img( + prompt, + image=init_image, + num_inference_steps=20, + height=height, + width=width, + strength=args.strength, + ) + print("==> Test img2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe_img2img( + prompt, + image=init_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + strength=args.strength, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/img2img.png") + + if args.task_name in ["inpaint_legacy", "all"]: + pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components) + pipe_inpaint.set_progress_bar_config(disable=False) + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) + mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + init_image = load_image(img_url).resize((width, height)) + mask_image = load_image(mask_url).resize((width, height)) + prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + time_costs = [] + task_name = "inpaint_legacy" + pipe_inpaint( + prompt, + image=init_image, + mask_image=mask_image, + num_inference_steps=20, + height=height, + width=width, + strength=args.strength, + ) + print(f"==> Test {task_name} performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe_inpaint( + prompt, + image=init_image, + mask_image=mask_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + strength=args.strength, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+ ) + images[0].save(f"{folder}/{task_name}.png") + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..febc46610eca3d524d182c8bc39495a202fdaaca --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py @@ -0,0 +1,417 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time + +import torch + +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention +delattr(torch.nn.functional, "scaled_dot_product_attention") + +import cv2 +import numpy as np +from diffusers import ( + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionPipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 +from diffusers.utils import load_image +from PIL import Image +from tqdm.auto import trange + + +def get_canny_image(image, args): + if isinstance(image, Image.Image): + image = np.array(image) + image = cv2.Canny(image, args.low_threshold, args.high_threshold) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + canny_image = Image.fromarray(image) + return canny_image + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." 
+ ) + + +def change_scheduler(self, scheduler_type="ddim"): + self.orginal_scheduler_config = self.scheduler.config + scheduler_type = scheduler_type.lower() + if scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + return scheduler + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="runwayml/stable-diffusion-v1-5", + help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--task_name", + type=str, + default="all", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="raw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument( + "--channels_last", + type=strtobool, + default=False, + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") + parser.add_argument( + "--attention_type", + type=str, + default="sdp", + choices=[ + "raw", + "sdp", + ], + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="euler-ancestral", + choices=[ + "pndm", + "lms", + "euler", + "euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + return parser.parse_args() + + +def attn_processors(self): + processors = {} + + def fn_recursive_add_processors(name: str, module, processors): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + +def set_attn_processor(self, processor): + count = len(attn_processors(self).keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + +def main(args): + if args.tf32: + torch.backends.cuda.matmul.allow_tf32 = True + else: + torch.backends.cuda.matmul.allow_tf32 = False + + seed = 1024 + torch_dtype = torch.float16 if args.use_fp16 else torch.float32 + pipe = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + torch_dtype=torch_dtype, + ) + scheduler = change_scheduler(pipe, args.scheduler) + pipe.scheduler = scheduler + if args.device_id >= 0: + pipe.to(f"cuda:{args.device_id}") + + if args.attention_type == "all": + args.attention_type = ["raw", "sdp"] + else: + args.attention_type = [args.attention_type] + + for attention_type in args.attention_type: + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 + if attention_type == "sdp": + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ + set_attn_processor(pipe.unet, attn_prrocessor_cls()) + set_attn_processor(pipe.vae, attn_prrocessor_cls()) + + if args.channels_last: + pipe.unet.to(memory_format=torch.channels_last) + + if args.compile: + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + width = args.width + height = args.height + pipe.set_progress_bar_config(disable=False) + + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + init_image = 
load_image( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" + ) + # text2img + prompt = "bird" + time_costs = [] + # warmup + pipe( + prompt, + num_inference_steps=10, + height=height, + width=width, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + torch.cuda.manual_seed(seed) + images = pipe( + prompt, + num_inference_steps=args.inference_steps, + height=height, + width=width, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/text2img.png") + + if args.task_name in ["img2img", "all"]: + pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components) + pipe_img2img.set_progress_bar_config(disable=False) + img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + init_image = load_image(img_url).resize((width, height)) + prompt = "A fantasy landscape, trending on artstation" + time_costs = [] + # warmup + pipe_img2img( + prompt, + image=init_image, + num_inference_steps=20, + height=height, + width=width, + strength=args.strength, + ) + print("==> Test img2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + torch.cuda.manual_seed(seed) + images = pipe_img2img( + prompt, + image=init_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + strength=args.strength, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+            )
+            images[0].save(f"{folder}/img2img.png")
+
+        if args.task_name in ["inpaint_legacy", "all"]:
+            pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
+            pipe_inpaint.set_progress_bar_config(disable=False)
+            img_url = (
+                "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+            )
+            mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
+            init_image = load_image(img_url).resize((width, height))
+            mask_image = load_image(mask_url).resize((width, height))
+            prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+            time_costs = []
+            task_name = "inpaint_legacy"
+            pipe_inpaint(
+                prompt,
+                image=init_image,
+                mask_image=mask_image,
+                num_inference_steps=20,
+                height=height,
+                width=width,
+                strength=args.strength,
+            )
+            print(f"==> Test {task_name} performance.")
+            for step in trange(args.benchmark_steps):
+                start = time.time()
+                torch.cuda.manual_seed(seed)
+                images = pipe_inpaint(
+                    prompt,
+                    image=init_image,
+                    mask_image=mask_image,
+                    num_inference_steps=args.inference_steps,
+                    height=height,
+                    width=width,
+                    strength=args.strength,
+                ).images
+                latency = time.time() - start
+                time_costs += [latency]
+                # print(f"No {step:3d} time cost: {latency:2f} s")
+            print(
+                f"Attention type: {attention_type}, "
+                f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+                f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+                f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+                f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+            )
+            images[0].save(f"{folder}/{task_name}.png")
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    main(args)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2804832d904d250bf1806d52c6f285f0652555d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md
@@ -0,0 +1,77 @@
+# Stable Diffusion 3 High-Performance Inference
+
+- Paddle Inference provides a high-performance inference implementation of the Stable Diffusion 3 model, improving inference performance by 70%+.
+Environment setup:
+```shell
+# Install triton and adapt it to paddle
+python -m pip install triton
+python -m pip install git+https://github.com/zhoutianzi666/UseTritonInPaddle.git
+python -c "import use_triton_in_paddle; use_triton_in_paddle.make_triton_compatible_with_paddle()"
+
+# Install the develop version of paddle. Pick the wheel matching your CUDA version; CUDA 12.3 is used here.
+python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/
+
+# Install the paddlemix package to use the custom operators integrated in it.
+python -m pip install paddlemix
+
+# Set the path of libCutlassGemmEpilogue.so
+# See https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/README.md for details
+export LD_LIBRARY_PATH=/your_dir/Paddle/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/build:$LD_LIBRARY_PATH
+# Note: this step enables the Cutlass fused operators during static-graph inference and is optional.
+# If you do not use Cutlass, set `exp_enable_use_cutlass` to False in `./text_to_image_generation-stable_diffusion_3.py`.
+```
+
+High-performance inference command:
+```shell
+# Run FP16 inference
+python text_to_image_generation-stable_diffusion_3.py --dtype float16 --height 512 --width 512 \
+--num-inference-steps 50 --inference_optimize 1 \
+--benchmark 1
+```
+Note: --inference_optimize 1 enables the inference optimizations, and --benchmark 1 enables the performance test.
+
+
+- Performance measured on an NVIDIA A100-SXM4-40GB:
+
+| Paddle Inference | PyTorch | Paddle dygraph |
+| ---------------- | ------- | -------------- |
+| 1.2 s            | 1.78 s  | 4.202 s        |
+
+
+## Multi-GPU inference for Paddle Stable Diffusion 3
+### How Data Parallel works
+- In SD3, when the input is a single prompt, CFG requires generating the unconditional guide and the text guide at the same time, so the MM-DiT blocks see an input with batch_size=2.
+In the multi-GPU scheme we therefore split this batch of 2 across two GPUs, halving the amount of floating-point work carried by each card.
+Once the computation finishes, the results from the two cards are gathered again, and the output is identical to single-card inference.
+
+### How Model Parallel works
+- In SD3 the Linear and Attention layers contain a large number of GEMMs (General Matrix Multiply), and when generating high-resolution images both the GEMM compute and the size of the pretrained weights grow linearly.
+In the multi-GPU scheme we therefore split these GEMMs across two GPUs, halving both the compute and the weight size per card, which reduces each card's floating-point workload as well as its memory footprint.
+
+### Enabling multi-GPU inference
+- Paddle Inference supports multi-GPU inference for the SD3 model: set `mp_size 2` to enable Model Parallel and `dp_size 2` to enable Data Parallel.
+Use `python -m paddle.distributed.launch --gpus "0,1,2,3"` to choose the cards used for inference, where `--gpus "0,1,2,3"` lists the enabled GPU ids.
+If you only need two cards, specify two of them, e.g. `python -m paddle.distributed.launch --gpus "0,1"`, and also specify the parallel method and degree, e.g. `mp_size 2` or `dp_size 2`.
+
+- Note that `mp_size` must not exceed the input batch_size, and the sum of `mp_size` and `dp_size` must not exceed the total number of cards on the machine.
+- High-performance multi-GPU inference command:
+```shell
+# Run multi-GPU inference
+python -m paddle.distributed.launch --gpus "0,1,2,3" text_to_image_generation-stable_diffusion_3.py \
+--dtype float16 \
+--height 1024 \
+--width 1024 \
+--num-inference-steps 20 \
+--inference_optimize 1 \
+--mp_size 2 \
+--dp_size 2 \
+--benchmark 1
+```
+Note: --inference_optimize 1 enables the inference optimizations, and --benchmark 1 enables the performance test.
+
+## Performance measured on an NVIDIA A800-SXM4-80GB
+
+| Paddle mp_size=2 & dp_size=2 | Paddle mp_size=2 | Paddle dp_size=2 | Paddle Single Card | Paddle dygraph |
+| ---------------------------- | ---------------- | ---------------- | ------------------ | -------------- |
+| 0.99 s                       | 1.581 s          | 1.319 s          | 2.376 s            | 3.2 s          |
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d1f5f24683cf98ff48ce2978666e3e7f91fb5d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py
@@ -0,0 +1,264 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
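+
+# Dygraph benchmark for Stable Diffusion 3 with ppdiffusers. It mirrors the sd15
+# benchmark script but builds a StableDiffusion3Pipeline, adds a "flow"
+# (FlowMatchEulerDiscreteScheduler) choice to --scheduler, and only exercises the
+# text2img task in main().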
+ +import argparse +import os +import time +import warnings + +import cv2 +import numpy as np +import paddle +from PIL import Image +from tqdm.auto import trange + +from ppdiffusers import ( + FlowMatchEulerDiscreteScheduler, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusion3Pipeline, + UniPCMultistepScheduler, +) +from ppdiffusers.utils import load_image + + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." + ) + + +def change_scheduler(self, scheduler_type="ddim"): + self.orginal_scheduler_config = self.scheduler.config + scheduler_type = scheduler_type.lower() + if scheduler_type == "flow": + scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + return scheduler + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="stabilityai/stable-diffusion-3-medium-diffusers", + help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + 
parser.add_argument( + "--task_name", + type=str, + default="all", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="raw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument( + "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type." + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="euler-ancestral", + choices=[ + "flow", + "pndm", + "lms", + "euler", + "euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + return parser.parse_args() + + +def main(args): + + seed = 1024 + paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 + pipe = StableDiffusion3Pipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + paddle_dtype=paddle_dtype, + ) + scheduler = change_scheduler(pipe, args.scheduler) + pipe.scheduler = scheduler + + if args.attention_type == "all": + args.attention_type = ["raw", "cutlass", "flash"] + else: + args.attention_type = [args.attention_type] + + for attention_type in args.attention_type: + if attention_type == "raw": + pipe.disable_xformers_memory_efficient_attention() + else: + try: + pipe.enable_xformers_memory_efficient_attention(attention_type) + except Exception as e: + if attention_type == "flash": + warnings.warn( + "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc." + ) + continue + else: + raise ValueError(e) + + if not args.use_fp16 and attention_type == "flash": + print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. 
We will skip this!") + continue + + width = args.width + height = args.height + pipe.set_progress_bar_config(disable=False) + + folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + init_image = load_image( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" + ) + # text2img + prompt = "bird" + time_costs = [] + # warmup + pipe( + prompt, + num_inference_steps=10, + height=height, + width=width, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe( + prompt, + num_inference_steps=args.inference_steps, + height=height, + width=width, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/text2img.png") + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..14c547b5605833d2c25b775136cea0b4112ee94d --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py @@ -0,0 +1,325 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
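
The PyTorch variant that follows mirrors the Paddle benchmark above and reports the same statistics (mean it/s plus mean/p50/p90/p95 latency). A minimal, framework-free sketch of that bookkeeping, with a hypothetical `run_once` callable standing in for the full `pipe(prompt, ...)` call:

```python
import time

import numpy as np


def summarize_latencies(run_once, benchmark_steps=10, inference_steps=50):
    """Collect per-call wall-clock latencies and derive the statistics the
    benchmark scripts print: mean it/s and mean/p50/p90/p95 latency."""
    time_costs = []
    for _ in range(benchmark_steps):
        start = time.time()
        run_once()  # stands in for one full pipeline call
        time_costs.append(time.time() - start)
    return {
        "mean_iter_per_sec": 1 / (np.mean(time_costs) / inference_steps),
        "mean_latency_s": float(np.mean(time_costs)),
        "p50_s": float(np.percentile(time_costs, 50)),
        "p90_s": float(np.percentile(time_costs, 90)),
        "p95_s": float(np.percentile(time_costs, 95)),
    }


if __name__ == "__main__":
    # Dummy workload so the sketch runs without a GPU or a diffusion model.
    print(summarize_latencies(lambda: time.sleep(0.01), benchmark_steps=5))
```
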
+ +import argparse +import os +import time + +import torch + +# torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention +# delattr(torch.nn.functional, "scaled_dot_product_attention") + +import cv2 +import numpy as np +from diffusers import ( + FlowMatchEulerDiscreteScheduler, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusion3Pipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 +from diffusers.utils import load_image +from PIL import Image +from tqdm.auto import trange + + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." + ) + + +def change_scheduler(self, scheduler_type="ddim"): + self.orginal_scheduler_config = self.scheduler.config + scheduler_type = scheduler_type.lower() + if scheduler_type == "flow": + scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + return scheduler + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="stabilityai/stable-diffusion-3-medium-diffusers", + help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + 
"--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--task_name", + type=str, + default="all", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="raw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument( + "--channels_last", + type=strtobool, + default=False, + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") + parser.add_argument( + "--attention_type", + type=str, + default="sdp", + choices=[ + "raw", + "sdp", + ], + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="euler-ancestral", + choices=[ + "flow", + "pndm", + "lms", + "euler", + "euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + return parser.parse_args() + + +def attn_processors(self): + processors = {} + + def fn_recursive_add_processors(name: str, module, processors): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + +def set_attn_processor(self, processor): + count = len(attn_processors(self).keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + +def main(args): + if args.tf32: + torch.backends.cuda.matmul.allow_tf32 = True + else: + torch.backends.cuda.matmul.allow_tf32 = False + + seed = 1024 + torch_dtype = torch.float16 if args.use_fp16 else torch.float32 + pipe = StableDiffusion3Pipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + torch_dtype=torch_dtype, + ) + scheduler = change_scheduler(pipe, args.scheduler) + pipe.scheduler = scheduler + if args.device_id >= 0: + pipe.to(f"cuda:{args.device_id}") + + if args.attention_type == "all": + args.attention_type = ["raw", "sdp"] + else: + args.attention_type = [args.attention_type] + + for attention_type in args.attention_type: + # attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 + # if attention_type == "sdp": + # torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ + # set_attn_processor(pipe.transformer, attn_prrocessor_cls()) + # set_attn_processor(pipe.vae, attn_prrocessor_cls()) + + # if args.channels_last: + # pipe.transformer.to(memory_format=torch.channels_last) + + # if args.compile: + # print("Run torch compile") + # pipe.unet = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True) + + width = args.width + height = args.height + pipe.set_progress_bar_config(disable=False) + + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + init_image = load_image( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" + ) + # text2img + prompt = "bird" + time_costs = [] + # warmup + pipe( + prompt, + num_inference_steps=10, + height=height, + width=width, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + torch.cuda.manual_seed(seed) + images = pipe( + prompt, + num_inference_steps=args.inference_steps, + height=height, + width=width, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+ ) + images[0].save(f"{folder}/text2img.png") + + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh new file mode 100644 index 0000000000000000000000000000000000000000..a0c2d8d45763db9d01e9a0245c02d55c6c0925ae --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# attention raw fp16 +python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10 + +# attention cutlass fp16 +python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10 + +# attention flash fp16 +python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10 + + +# attention raw fp32 +python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10 + +# attention cutlass fp32 +python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10 + +# attention flash fp32 +python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10 diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh new file mode 100644 index 0000000000000000000000000000000000000000..020c54969a75651f919585dab0e67beaf016306e --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
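
`benchmark_paddle.sh` above sweeps a 3×2 grid (raw/cutlass/flash × fp16/fp32), while the torch script below only covers the sdp backend. If you prefer driving the same sweep from Python rather than shell, a hedged sketch (script name and flags copied from the commands above; actually launching the processes is left to the caller):

```python
import itertools
import shlex

# Same grid as scripts/benchmark_paddle.sh: every attention backend at both precisions.
ATTENTION_TYPES = ["raw", "cutlass", "flash"]
USE_FP16 = [True, False]


def build_commands(script="infer_dygraph_paddle.py"):
    commands = []
    for attn, fp16 in itertools.product(ATTENTION_TYPES, USE_FP16):
        commands.append(
            f"python {script} --scheduler flow --task_name all "
            f"--attention_type {attn} --use_fp16 {fp16} "
            f"--inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10"
        )
    return commands


if __name__ == "__main__":
    for cmd in build_commands():
        print(shlex.split(cmd))  # argv lists, ready for subprocess.run if desired
```
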
+
+# sd3 does not support attention raw
+
+# attention sdp
+python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention sdp fp32
+python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
\ No newline at end of file
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..61d490d683af75b2fc0af87435f7656e9e6d9b42
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Use PaddleMIX to accelerate the Stable Diffusion 3 image generation model."
+    )
+    parser.add_argument(
+        "--benchmark",
+        type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
+        default=False,
+        help="If set to True, measure inference performance.",
+    )
+    parser.add_argument(
+        "--inference_optimize",
+        type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
+        default=False,
+        help="If set to True, all optimizations except Triton are enabled.",
+    )
+
+    parser.add_argument("--height", type=int, default=512, help="Height of the generated image.")
+    parser.add_argument("--width", type=int, default=512, help="Width of the generated image.")
+    parser.add_argument("--num-inference-steps", type=int, default=50, help="Number of inference steps.")
+    parser.add_argument("--dtype", type=str, default="float32", help="Inference data types.")
+    parser.add_argument(
+        "--mp_size", type=int, default=1, help="This size refers to the degree of parallelism using model parallel."
+    )
+    parser.add_argument(
+        "--dp_size", type=int, default=1, help="This size refers to the degree of parallelism using data parallel."
+ ) + + return parser.parse_args() + + +args = parse_args() + +if args.inference_optimize: + os.environ["INFERENCE_OPTIMIZE"] = "True" + os.environ["INFERENCE_OPTIMIZE_TRITON"] = "True" + os.environ["INFERENCE_MP_SIZE"] = str(args.mp_size) + os.environ["INFERENCE_DP_SIZE"] = str(args.dp_size) +if args.dtype == "float32": + inference_dtype = paddle.float32 +elif args.dtype == "float16": + inference_dtype = paddle.float16 + + +import paddle.distributed as dist +import paddle.distributed.fleet as fleet + +if args.mp_size > 1 or args.dp_size > 1: + strategy = fleet.DistributedStrategy() + model_parallel_size = args.mp_size + data_parallel_size = args.dp_size + strategy.hybrid_configs = {"dp_degree": data_parallel_size, "mp_degree": model_parallel_size, "pp_degree": 1} + fleet.init(is_collective=True, strategy=strategy) + hcg = fleet.get_hybrid_communicate_group() + mp_id = hcg.get_model_parallel_rank() + dp_id = hcg.get_data_parallel_rank() + rank_id = dist.get_rank() + mp_degree = hcg.get_model_parallel_world_size() + dp_degree = hcg.get_data_parallel_world_size() + assert mp_degree == args.mp_size + assert dp_degree == args.dp_size + + # this is for triton kernel cache for dynamic graph + # os.environ["TRITON_KERNEL_CACHE_DIR"] = f"./tmp/sd3_parallel/{rank_id}" + +import datetime + +from ppdiffusers import StableDiffusion3Pipeline + +pipe = StableDiffusion3Pipeline.from_pretrained( + "stabilityai/stable-diffusion-3-medium-diffusers", + paddle_dtype=inference_dtype, +) + +pipe.transformer = paddle.incubate.jit.inference( + pipe.transformer, + save_model_dir="./tmp/sd3", + enable_new_ir=True, + cache_static_model=True, + exp_enable_use_cutlass=True, + delete_pass_lists=["add_norm_fuse_pass"], +) + +generator = paddle.Generator().manual_seed(42) +prompt = "A cat holding a sign that says hello world" + + +image = pipe( + prompt, num_inference_steps=args.num_inference_steps, width=args.width, height=args.height, generator=generator +).images[0] + +if args.benchmark: + # warmup + for i in range(3): + image = pipe( + prompt, + num_inference_steps=args.num_inference_steps, + width=args.width, + height=args.height, + generator=generator, + ).images[0] + + repeat_times = 10 + sumtime = 0.0 + for i in range(repeat_times): + paddle.device.synchronize() + starttime = datetime.datetime.now() + image = pipe( + prompt, + num_inference_steps=args.num_inference_steps, + width=args.width, + height=args.height, + generator=generator, + ).images[0] + paddle.device.synchronize() + endtime = datetime.datetime.now() + duringtime = endtime - starttime + duringtime = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0 + sumtime += duringtime + print("SD3 end to end time : ", duringtime, "ms") + + print("SD3 ave end to end time : ", sumtime / repeat_times, "ms") + + cuda_mem_after_used = paddle.device.cuda.max_memory_allocated() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_after_used:.3f} GiB") + + +rank_id = dist.get_rank() +if rank_id == 0: + image.save("text_to_image_generation-stable_diffusion_3-result.png") diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..991712e0582c5dad1598e450eaef1c0b09873be1 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md @@ -0,0 +1,44 @@ +# PaddleInfer Stable Diffusion XL 模型高性能部署 + + **目录** + * [环境依赖](#环境依赖) + * [快速体验](#快速体验) + * [文图生成(Text-to-Image Generation)](#文图生成) + * 
[文本引导的图像变换(Image-to-Image Text-Guided Generation)](#文本引导的图像变换) + * [文本引导的图像编辑(Text-Guided Image Inpainting)](#文本引导的图像编辑) + +⚡️[PaddleInfer]是一款全场景、易用灵活、极致高效的AI推理部署工具,为开发者提供多硬件、多推理引擎后端的部署能力。开发者只需调用一行代码即可随意切换硬件、推理引擎后端。本示例展现如何通过 PaddleInfer 将我们 PPDiffusers 训练好的 Stable Diffusion XL模型进行多硬件、多推理引擎后端高性能部署。 + + + +## 环境依赖 + +在示例中使用了 PaddleInfer,需要执行以下命令安装依赖。 + +```shell +python -m pip install paddlepaddle-gpu==2.6.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +``` + + + +## 静态图模型导出 (static model export) +``` +export USE_PPXFORMERS=False +python export_model.py --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --output_path static_model/stable-diffusion-xl-base-1.0 +``` +导出模型在static_model/stable-diffusion-xl-base-1.0目录下。 + +### 文图生成(Text-to-Image Generation) +``` +python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name text2img +``` + +### 文本引导的图像变换(Image-to-Image Text-Guided Generation) +``` +python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name img2img +``` + +### 文本引导的图像编辑(Text-Guided Image Inpainting) +``` +python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name inpaint +``` diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..afe6428281af43f57efb59b68bd1f918bf3bbd4c --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .value_guided_sampling import ValueGuidedRLPipeline diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..7024c5c94358fb40b62f653b1d7891dff12cd762 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py @@ -0,0 +1,153 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
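
The value-guided RL pipeline defined below keeps per-key mean/std statistics of the offline dataset and converts observations and actions between raw and normalized space. A small standalone sketch of that bookkeeping (the dict-of-arrays `dataset` stands in for `env.get_dataset()`):

```python
import numpy as np


class KeyNormalizer:
    """Per-key mean/std normalization in the style of ValueGuidedRLPipeline.
    Keys whose values cannot be reduced are skipped, as in the pipeline."""

    def __init__(self, dataset):
        self.means, self.stds = {}, {}
        for key, val in dataset.items():
            try:
                self.means[key] = val.mean()
                self.stds[key] = val.std()
            except Exception:
                pass  # non-numeric entries are ignored

    def normalize(self, x, key):
        return (x - self.means[key]) / self.stds[key]

    def de_normalize(self, x, key):
        return x * self.stds[key] + self.means[key]


if __name__ == "__main__":
    data = {"observations": np.random.randn(100, 11), "actions": np.random.randn(100, 3)}
    norm = KeyNormalizer(data)
    obs = data["observations"][0]
    roundtrip = norm.de_normalize(norm.normalize(obs, "observations"), "observations")
    print(np.allclose(obs, roundtrip))  # True
```
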
+ +import numpy as np +import paddle + +from ...models.unet_1d import UNet1DModel +from ...pipelines import DiffusionPipeline +from ...utils.dummy_paddle_objects import DDPMScheduler +from ...utils.paddle_utils import randn_tensor + + +class ValueGuidedRLPipeline(DiffusionPipeline): + r""" + Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + value_function ([`UNet1DModel`]): + A specialized UNet for fine-tuning trajectories base on reward. + unet ([`UNet1DModel`]): + UNet architecture to denoise the encoded trajectories. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this + application is [`DDPMScheduler`]. + env (): + An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models. + """ + + def __init__( + self, + value_function: UNet1DModel, + unet: UNet1DModel, + scheduler: DDPMScheduler, + env, + ): + super().__init__() + self.value_function = value_function + self.unet = unet + self.scheduler = scheduler + self.env = env + self.data = env.get_dataset() + self.means = {} + for key in self.data.keys(): + try: + self.means[key] = self.data[key].mean() + except Exception: + pass + self.stds = {} + for key in self.data.keys(): + try: + self.stds[key] = self.data[key].std() + except Exception: + pass + self.state_dim = env.observation_space.shape[0] + self.action_dim = env.action_space.shape[0] + + def normalize(self, x_in, key): + return (x_in - self.means[key]) / self.stds[key] + + def de_normalize(self, x_in, key): + return x_in * self.stds[key] + self.means[key] + + def to_paddle(self, x_in): + if isinstance(x_in, dict): + return {k: self.to_paddle(v) for k, v in x_in.items()} + elif paddle.is_tensor(x_in): + return x_in + return paddle.to_tensor(x_in) + + def reset_x0(self, x_in, cond, act_dim): + for key, val in cond.items(): + x_in[:, key, act_dim:] = val.clone() + return x_in + + def run_diffusion(self, x, conditions, n_guide_steps, scale): + batch_size = x.shape[0] + y = None + for i in self.progress_bar(self.scheduler.timesteps): + # create batch of timesteps to pass into model + timesteps = paddle.full((batch_size,), i, dtype=paddle.int64) + for _ in range(n_guide_steps): + with paddle.set_grad_enabled(True): + x.stop_gradient = False + + # permute to match dimension for pre-trained models + y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample + grad = paddle.autograd.grad([y.sum()], [x])[0] + + posterior_variance = self.scheduler._get_variance(i) + model_std = paddle.exp(0.5 * posterior_variance) + grad = model_std * grad + + grad[timesteps < 2] = 0 + x = x.detach() + x = x + scale * grad + x = self.reset_x0(x, conditions, self.action_dim) + + prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1]) + + # TODO: verify deprecation of this kwarg + x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"] + + # apply conditions to the trajectory (set the initial state) + x = self.reset_x0(x, conditions, self.action_dim) + x = self.to_paddle(x) + return x, y + + def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1): + # normalize the observations and create batch dimension + obs = self.normalize(obs, 
"observations") + obs = obs[None].repeat(batch_size, axis=0) + + conditions = {0: self.to_paddle(obs)} + shape = (batch_size, planning_horizon, self.state_dim + self.action_dim) + + # generate initial noise and apply our conditions (to make the trajectories start at current state) + x1 = randn_tensor(shape, dtype=self.unet.dtype) + x = self.reset_x0(x1, conditions, self.action_dim) + x = self.to_paddle(x) + + # run the diffusion process + x, y = self.run_diffusion(x, conditions, n_guide_steps, scale) + + # sort output trajectories by value + sorted_idx = paddle.argsort(y, 0, descending=True).squeeze() + sorted_values = x[sorted_idx] + actions = sorted_values[:, :, : self.action_dim] + actions = actions.detach().cpu().numpy() + denorm_actions = self.de_normalize(actions, key="actions") + + # select the action with the highest value + if y is not None: + selected_index = 0 + else: + # if we didn't run value guiding, select a random action + selected_index = np.random.randint(0, batch_size) + + denorm_actions = denorm_actions[selected_index, 0] + return denorm_actions diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd36a6caa677ae6910f01acbb87777d8cfc1430 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Tuple + +import paddle + +from ppdiffusers.models.animate_anyone.motion_module import zero_module +from ppdiffusers.models.animate_anyone.resnet import InflatedConv3d +from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin + + +class PoseGuider(ModelMixin): + def __init__( + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int] = (16, 32, 64, 128), + weight_dtype=None, + ): + super().__init__() + + init_contexts = [] + if weight_dtype is not None: + init_contexts.append(paddle.dtype_guard(weight_dtype)) + + with ContextManagers(init_contexts): + self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.blocks = paddle.nn.LayerList(sublayers=[]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module( + InflatedConv3d( + block_out_channels[-1], + conditioning_embedding_channels, + kernel_size=3, + padding=1, + ) + ) + + def forward(self, conditioning): + embedding = self.conv_in(conditioning) + embedding = paddle.nn.functional.silu(x=embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = paddle.nn.functional.silu(x=embedding) + + embedding = self.conv_out(embedding) + + return embedding diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1aff93940aef9e99752f86fada1ce7cf8f96d69d --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py @@ -0,0 +1,235 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
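
PoseGuider above wraps its output convolution in `zero_module`, so the pose-conditioning branch contributes nothing at initialization and its effect is learned gradually. The helper itself is imported from `motion_module`, which this diff does not include; a plausible minimal sketch of what it does:

```python
import paddle


def zero_module(module: paddle.nn.Layer) -> paddle.nn.Layer:
    """Zero every parameter so the branch this layer terminates is a no-op at
    the start of training; the real helper may differ in detail."""
    for p in module.parameters():
        p.set_value(paddle.zeros_like(p))
    return module


if __name__ == "__main__":
    conv = zero_module(paddle.nn.Conv2D(4, 8, kernel_size=3, padding=1))
    x = paddle.randn([1, 4, 16, 16])
    print(conv(x).abs().sum().item())  # 0.0: weights and bias start at zero
```
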
+ +# Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/resnet.py + +import paddle +from einops import rearrange + + +class InflatedConv3d(paddle.nn.Conv2D): + def forward(self, x): + video_length = x.shape[2] + x = rearrange(x, "b c f h w -> (b f) c h w") + x = super().forward(x) + + x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) + + return x + + +class InflatedGroupNorm(paddle.nn.GroupNorm): + def forward(self, x): + video_length = x.shape[2] + + x = rearrange(x, "b c f h w -> (b f) c h w") + x = super().forward(x) + x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) + + return x + + +class Upsample3D(paddle.nn.Layer): + def __init__( + self, + channels, + use_conv=False, + use_conv_transpose=False, + out_channels=None, + name="conv", + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_conv_transpose = use_conv_transpose + self.name = name + + if use_conv_transpose: + raise NotImplementedError + elif use_conv: + self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) + + def forward(self, hidden_states, output_size=None): + assert hidden_states.shape[1] == self.channels + + if self.use_conv_transpose: + raise NotImplementedError + + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + dtype = hidden_states.dtype + if dtype == "bfloat16": + hidden_states = hidden_states.to("float32") + + # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/ppdiffusers/issues/984 + if hidden_states.shape[0] >= 64: + hidden_states = hidden_states.contiguous() + + if output_size is None: + hidden_states = paddle.nn.functional.interpolate( + x=hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest", data_format="NCDHW" + ) + else: + hidden_states = paddle.nn.functional.interpolate( + x=hidden_states, size=output_size, mode="nearest", data_format="NCDHW" + ) + + # If the input is bfloat16, we cast back to bfloat16 + if dtype == "bfloat16": + hidden_states = hidden_states.to(dtype) + + hidden_states = self.conv(hidden_states) + + return hidden_states + + +class Downsample3D(paddle.nn.Layer): + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.padding = padding + stride = 2 + self.name = name + + if use_conv: + self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding) + else: + raise NotImplementedError + + def forward(self, hidden_states): + assert hidden_states.shape[1] == self.channels + if self.use_conv and self.padding == 0: + raise NotImplementedError + + assert hidden_states.shape[1] == self.channels + hidden_states = self.conv(hidden_states) + + return hidden_states + + +class ResnetBlock3D(paddle.nn.Layer): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-6, + non_linearity="swish", + time_embedding_norm="default", + output_scale_factor=1.0, + use_in_shortcut=None, + use_inflated_groupnorm=None, + ): + super().__init__() + self.pre_norm = pre_norm + self.pre_norm = True + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = 
conv_shortcut + self.time_embedding_norm = time_embedding_norm + self.output_scale_factor = output_scale_factor + + if groups_out is None: + groups_out = groups + + assert use_inflated_groupnorm is not None + if use_inflated_groupnorm: + self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps) + else: + + self.norm1 = paddle.nn.GroupNorm( + num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True + ) + + self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + if temb_channels is not None: + if self.time_embedding_norm == "default": + time_emb_proj_out_channels = out_channels + elif self.time_embedding_norm == "scale_shift": + time_emb_proj_out_channels = out_channels * 2 + else: + raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") + + self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels) + else: + self.time_emb_proj = None + + if use_inflated_groupnorm: + self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps) + else: + self.norm2 = paddle.nn.GroupNorm( + num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True + ) + self.dropout = paddle.nn.Dropout(p=dropout) + self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + if non_linearity == "swish": + self.nonlinearity = lambda x: paddle.nn.functional.silu(x=x) + elif non_linearity == "mish": + self.nonlinearity = Mish() + elif non_linearity == "silu": + self.nonlinearity = paddle.nn.Silu() + + self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, input_tensor, temb): + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.conv1(hidden_states) + + if temb is not None: + temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None] + + if temb is not None and self.time_embedding_norm == "default": + hidden_states = hidden_states + temb + + hidden_states = self.norm2(hidden_states) + + if temb is not None and self.time_embedding_norm == "scale_shift": + scale, shift = paddle.chunk(x=temb, chunks=2, axis=1) + hidden_states = hidden_states * (1 + scale) + shift + + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + + return output_tensor + + +class Mish(paddle.nn.Layer): + def forward(self, hidden_states): + return hidden_states * paddle.nn.functional.tanh(x=paddle.nn.functional.softplus(x=hidden_states)) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..3f294f9afec90a6699bda8b9f7dc994cbad46654 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional + +import paddle +from einops import rearrange, repeat + +from ppdiffusers.configuration_utils import ConfigMixin, register_to_config +from ppdiffusers.models import ModelMixin +from ppdiffusers.utils import BaseOutput + +from .attention import TemporalBasicTransformerBlock + + +@dataclass +class Transformer3DModelOutput(BaseOutput): + sample: paddle.Tensor + + +class Transformer3DModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + # Define input layers + self.in_channels = in_channels + + self.norm = paddle.nn.GroupNorm( + num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06, weight_attr=True, bias_attr=True + ) + if use_linear_projection: + self.proj_in = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim) + else: + self.proj_in = paddle.nn.Conv2D( + in_channels=in_channels, out_channels=inner_dim, kernel_size=1, stride=1, padding=0 + ) + self.transformer_blocks = paddle.nn.LayerList( + sublayers=[ + TemporalBasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + ) + for d in range(num_layers) + ] + ) + if use_linear_projection: + self.proj_out = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim) + else: + self.proj_out = paddle.nn.Conv2D( + in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, padding=0 + ) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + return_dict: bool = True, + ): + # Input + assert hidden_states.dim() == 5, f"Expected 
hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." + video_length = hidden_states.shape[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + if encoder_hidden_states.shape[0] != hidden_states.shape[0]: + encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length) + + batch, channel, height, weight = hidden_states.shape + residual = hidden_states + + hidden_states = self.norm(hidden_states) + if not self.use_linear_projection: + hidden_states = self.proj_in(hidden_states) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim)) + else: + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim)) + hidden_states = self.proj_in(hidden_states) + + # Blocks + for i, block in enumerate(self.transformer_blocks): + hidden_states = block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + video_length=video_length, + ) + + # Output + if not self.use_linear_projection: + hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2]) + hidden_states = self.proj_out(hidden_states) + else: + hidden_states = self.proj_out(hidden_states) + hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2]) + + output = hidden_states + residual + + output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) + if not return_dict: + return (output,) + + return Transformer3DModelOutput(sample=output) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..c5e5645e7abe55191e5dfe004a3446270aa22df1 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py @@ -0,0 +1,615 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
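
The UNet assembled below, like the resnet and transformer blocks above, applies ordinary 2D layers to video by folding the frame axis into the batch axis and unfolding it again afterwards. A shape-only illustration of that round trip (einops, as used throughout these modules):

```python
import numpy as np
from einops import rearrange

b, c, f, h, w = 2, 4, 8, 16, 16
video = np.random.rand(b, c, f, h, w).astype("float32")

# Fold the frame axis into the batch axis so any 2D layer sees (b*f, c, h, w)...
frames = rearrange(video, "b c f h w -> (b f) c h w")
assert frames.shape == (b * f, c, h, w)

# ...then unfold afterwards; the round trip is lossless.
restored = rearrange(frames, "(b f) c h w -> b c f h w", f=f)
assert np.array_equal(video, restored)
print(frames.shape, restored.shape)
```
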
+ +# Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet_blocks.py + +from dataclasses import dataclass +from os import PathLike +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import paddle + +from ppdiffusers.configuration_utils import ConfigMixin, register_to_config +from ppdiffusers.models.attention_processor import AttentionProcessor +from ppdiffusers.models.embeddings import TimestepEmbedding, Timesteps +from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin +from ppdiffusers.utils import BaseOutput, logging + +from .resnet import InflatedConv3d, InflatedGroupNorm +from .unet_3d_blocks import UNetMidBlock3DCrossAttn, get_down_block, get_up_block + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet3DConditionOutput(BaseOutput): + sample: paddle.Tensor + + +class UNet3DConditionModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ), + mid_block_type: str = "UNetMidBlock3DCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: int = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + use_inflated_groupnorm=False, + # Additional + use_motion_module=False, + motion_module_resolutions=(1, 2, 4, 8), + motion_module_mid_block=False, + motion_module_decoder_only=False, + motion_module_type=None, + motion_module_kwargs={}, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + ): + super().__init__() + + self.sample_size = sample_size + time_embed_dim = block_out_channels[0] * 4 + + # input + self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) + + # time + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim) + else: + self.class_embedding = None + + self.down_blocks = paddle.nn.LayerList(sublayers=[]) + self.mid_block = None + self.up_blocks = paddle.nn.LayerList(sublayers=[]) + + if isinstance(only_cross_attention, bool): + 
only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + res = 2**i + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module + and (res in motion_module_resolutions) + and (not motion_module_decoder_only), + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock3DCrossAttn": + self.mid_block = UNetMidBlock3DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module and motion_module_mid_block, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the videos + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_attention_head_dim = list(reversed(attention_head_dim)) + only_cross_attention = list(reversed(only_cross_attention)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + res = 2 ** (3 - i) + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + 
resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=reversed_attention_head_dim[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module and (res in motion_module_resolutions), + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if use_inflated_groupnorm: + self.conv_norm_out = InflatedGroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + epsilon=norm_eps, + ) + else: + + self.conv_norm_out = paddle.nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps + ) + self.conv_act = paddle.nn.Silu() + self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1) + + @property + # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors( + name: str, + module: paddle.nn.Layer, + processors: Dict[str, AttentionProcessor], + ): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + if "temporal_transformer" not in sub_name: + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + if "temporal_transformer" not in name: + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_slicable_dims(module: paddle.nn.Layer): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_slicable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_slicable_dims(module) + + num_slicable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_slicable_layers * [1] + + slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+            )
+
+        def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                if "temporal_transformer" not in sub_name:
+                    fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            if "temporal_transformer" not in name:
+                fn_recursive_attn_processor(name, module, processor)
+
+    def forward(
+        self,
+        sample: paddle.Tensor,
+        timestep: Union[paddle.Tensor, float, int],
+        encoder_hidden_states: paddle.Tensor,
+        class_labels: Optional[paddle.Tensor] = None,
+        pose_cond_fea: Optional[paddle.Tensor] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+        mid_block_additional_residual: Optional[paddle.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        r"""
+        Args:
+            sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # By default samples have to be at least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # time
+        timesteps = timestep
+        if not paddle.is_tensor(timesteps):
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = "float32" if is_mps else "float64"
+            else:
+                dtype = "int32" if is_mps else "int64"
+            timesteps = paddle.to_tensor([timesteps], dtype=dtype)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None]
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
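As a side note on the timestep handling in this forward pass: a Python number, a 0-D tensor, or a 1-D tensor is normalized to shape `(batch,)` before the sinusoidal projection. A minimal standalone sketch of that normalization, assuming only that paddle is installed and ignoring the MPS special case (the helper name is illustrative):

```python
import paddle

def normalize_timesteps(timestep, batch_size: int) -> paddle.Tensor:
    # Python scalar -> 1-D tensor with an explicit dtype
    if not paddle.is_tensor(timestep):
        dtype = "float32" if isinstance(timestep, float) else "int64"
        timestep = paddle.to_tensor([timestep], dtype=dtype)
    # 0-D tensor -> 1-D tensor
    elif timestep.ndim == 0:
        timestep = timestep.unsqueeze(0)
    # broadcast to the batch so time_proj / time_embedding always see shape (batch,)
    return timestep.expand([batch_size])

print(normalize_timesteps(999, batch_size=4).shape)  # [4]
```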
+ t_emb = t_emb.to(dtype=self.dtype) + emb = self.time_embedding(t_emb) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + # pre-process + + sample = self.conv_in(sample) + + if pose_cond_fea is not None: + sample = sample + pose_cond_fea + + # down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + + else: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + ) + + down_block_res_samples += res_samples + + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # mid + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + + if mid_block_additional_residual is not None: + sample = sample + mid_block_additional_residual + + # up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + upsample_size=upsample_size, + attention_mask=attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + encoder_hidden_states=encoder_hidden_states, + ) + + # post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if not return_dict: + return (sample,) + + return UNet3DConditionOutput(sample=sample) + + @classmethod + def from_pretrained_2d( + cls, + denoising_unet_config_path: Optional[Union[str, PathLike]], + base_model_path: Optional[Union[str, PathLike]] = None, + motion_module_path: Optional[Union[str, PathLike]] = None, + weight_dtype=None, + unet_additional_kwargs=None, + ): + + config_file = denoising_unet_config_path + if not (Path(config_file).exists() and Path(config_file).is_file()): + raise RuntimeError(f"{config_file} does not exist or is not a file") + + unet_config = cls.load_config(config_file) + unet_config["_class_name"] = cls.__name__ + unet_config["down_block_types"] = [ + "CrossAttnDownBlock3D", + 
"CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ] + unet_config["up_block_types"] = [ + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ] + unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn" + + init_contexts = [] + if weight_dtype is not None: + init_contexts.append(paddle.dtype_guard(weight_dtype)) + + with ContextManagers(init_contexts): + model = cls.from_config(unet_config, **unet_additional_kwargs) + + state_dict = paddle.load(base_model_path) + + # motion module updating + if motion_module_path is not None: + motion_state_dict = paddle.load(motion_module_path) + state_dict.update(motion_state_dict) + + if weight_dtype is not None: + for k in state_dict.keys(): + state_dict[k] = state_dict[k].astype(weight_dtype) + + m, u = model.set_state_dict(state_dict) + print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};") + + return model diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef05224cf6aff170028d4e2e50ce4f3572bc9387 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Union + +import numpy as np +import paddle + +import ppdiffusers + +from .unet import UNet3DConditionModel # noqa: * + + +@dataclass +class HotshotPipelineXLOutput(ppdiffusers.utils.BaseOutput): + videos: Union[paddle.Tensor, np.ndarray] diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..85f2f60e155b2094be815f83b900548368027939 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py @@ -0,0 +1,124 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from einops import rearrange + +import ppdiffusers +from ppdiffusers.models import resnet + + +class Upsample3D(resnet.Upsample2D): + def forward(self, hidden_states, output_size=None, scale: float = 1.0): + f = tuple(hidden_states.shape)[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + hidden_states = super(Upsample3D, self).forward(hidden_states, output_size, scale) + return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f) + + +class Downsample3D(ppdiffusers.models.resnet.Downsample2D): + def forward(self, hidden_states, scale: float = 1.0): + f = tuple(hidden_states.shape)[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + hidden_states = super(Downsample3D, self).forward(hidden_states, scale) + return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f) + + +class Conv3d(ppdiffusers.models.resnet.LoRACompatibleConv): + def forward(self, hidden_states, scale: float = 1.0): + f = tuple(hidden_states.shape)[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + hidden_states = super().forward(hidden_states, scale) + return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f) + + +class ResnetBlock3D(paddle.nn.Layer): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-06, + non_linearity="silu", + time_embedding_norm="default", + output_scale_factor=1.0, + use_in_shortcut=None, + conv_shortcut_bias: bool = True + ): + super().__init__() + self.pre_norm = pre_norm + self.pre_norm = True + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.time_embedding_norm = time_embedding_norm + self.output_scale_factor = output_scale_factor + if groups_out is None: + groups_out = groups + self.norm1 = paddle.nn.GroupNorm( + num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True + ) + self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels is not None: + if self.time_embedding_norm == "default": + time_emb_proj_out_channels = out_channels + elif self.time_embedding_norm == "scale_shift": + time_emb_proj_out_channels = out_channels * 2 + else: + raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") + self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels) + else: + self.time_emb_proj = None + self.norm2 = paddle.nn.GroupNorm( + num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True + ) + self.dropout = paddle.nn.Dropout(p=dropout) + self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + assert non_linearity == "silu" + self.nonlinearity = paddle.nn.Silu() + self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = Conv3d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias + ) + + def forward(self, input_tensor, temb): + hidden_states = input_tensor + hidden_states = self.norm1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.conv1(hidden_states) + if temb is not None: + temb = self.nonlinearity(temb) + 
temb = self.time_emb_proj(temb)[:, :, None, None, None] + if temb is not None and self.time_embedding_norm == "default": + hidden_states = hidden_states + temb + hidden_states = self.norm2(hidden_states) + if temb is not None and self.time_embedding_norm == "scale_shift": + scale, shift = paddle.chunk(x=temb, chunks=2, axis=1) + hidden_states = hidden_states * (1 + scale) + shift + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + return output_tensor diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..3c387c4a905e5f7207cf576fc4a06bc88066d9ba --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import paddle +from einops import rearrange, repeat + +import ppdiffusers + + +@dataclass +class Transformer3DModelOutput(ppdiffusers.utils.BaseOutput): + """ + The output of [`Transformer3DModel`]. + + Args: + sample (`paddle.FloatTensor` of shape `(batch_size, num_channels, height, width)`: + The hidden states output conditioned on the `encoder_hidden_states` input. 
+ """ + + sample: paddle.float32 + + +class Transformer3DModel(ppdiffusers.models.transformer_2d.Transformer2DModel): + def __init__(self, *args, **kwargs): + super(Transformer3DModel, self).__init__(*args, **kwargs) + init_Constant = paddle.nn.initializer.Constant(value=0.0) + init_Constant(self.proj_out.weight.data) + init_Constant = paddle.nn.initializer.Constant(value=0.0) + init_Constant(self.proj_out.bias.data) + + def forward( + self, + hidden_states: paddle.Tensor, + encoder_hidden_states: Optional[paddle.Tensor] = None, + timestep: Optional[int] = None, + class_labels: Optional[int] = None, + cross_attention_kwargs: Dict[str, Any] = None, + attention_mask: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + enable_temporal_layers: bool = True, + positional_embedding: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ): + is_video = len(tuple(hidden_states.shape)) == 5 + if is_video: + f = tuple(hidden_states.shape)[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=f) + hidden_states = super(Transformer3DModel, self).forward( + hidden_states, + encoder_hidden_states, + timestep, + class_labels, + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + return_dict=False, + )[0] + if is_video: + hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f) + if not return_dict: + return (hidden_states,) + return Transformer3DModelOutput(sample=hidden_states) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..39fae6fe6ecdd4a6e619a415836d0aa57c0196ac --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py @@ -0,0 +1,778 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import paddle + +import ppdiffusers +from ppdiffusers import loaders, transformers # noqa: * + +from .resnet import Conv3d +from .unet_blocks import ( + CrossAttnDownBlock3D, + CrossAttnUpBlock3D, + DownBlock3D, + UNetMidBlock3DCrossAttn, + UpBlock3D, + get_down_block, + get_up_block, +) + +logger = ppdiffusers.utils.logging.get_logger(__name__) + + +@dataclass +class UNet3DConditionOutput(ppdiffusers.utils.BaseOutput): + """ + The output of [`UNet2DConditionModel`]. + + Args: + sample (`paddle.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. 
+ """ + + sample: paddle.float32 = None + + +class UNet3DConditionModel( + ppdiffusers.models.modeling_utils.ModelMixin, + ppdiffusers.configuration_utils.ConfigMixin, + loaders.UNet2DConditionLoadersMixin, +): + _supports_gradient_checkpointing = True + + @ppdiffusers.configuration_utils.register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"), + mid_block_type: Optional[str] = "UNetMidBlock3DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-05, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, + ): + super().__init__() + self.sample_size = sample_size + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + num_attention_heads = num_attention_heads or attention_head_dim + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. 
`down_block_types`: {down_block_types}." + ) + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." + ) + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding) + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") + self.time_proj = ppdiffusers.models.embeddings.GaussianFourierProjection( + time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + self.time_proj = ppdiffusers.models.embeddings.Timesteps( + block_out_channels[0], flip_sin_to_cos, freq_shift + ) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + self.time_embedding = ppdiffusers.models.embeddings.TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." 
+ ) + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = paddle.nn.Linear(in_features=encoder_hid_dim, out_features=cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + self.encoder_hid_proj = ppdiffusers.models.embeddings.TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + self.encoder_hid_proj = ppdiffusers.models.embeddings.ImageProjection( + image_embed_dim=encoder_hid_dim, cross_attention_dim=cross_attention_dim + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding( + timestep_input_dim, time_embed_dim, act_fn=act_fn + ) + elif class_embed_type == "identity": + self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim + ) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = paddle.nn.Linear( + in_features=projection_class_embeddings_input_dim, out_features=time_embed_dim + ) + else: + self.class_embedding = None + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + self.add_embedding = ppdiffusers.models.embeddings.TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type == "text_image": + self.add_embedding = ppdiffusers.models.embeddings.TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "text_time": + self.add_time_proj = ppdiffusers.models.embeddings.Timesteps( + addition_time_embed_dim, flip_sin_to_cos, freq_shift + ) + self.add_embedding = ppdiffusers.models.embeddings.TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim + ) + elif addition_embed_type == "image": + self.add_embedding = ppdiffusers.models.embeddings.ImageTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "image_hint": + self.add_embedding = ppdiffusers.models.embeddings.ImageHintTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = ppdiffusers.models.activations.get_activation(time_embedding_act_fn) + 
self.down_blocks = paddle.nn.LayerList(sublayers=[]) + self.up_blocks = paddle.nn.LayerList(sublayers=[]) + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + only_cross_attention = [only_cross_attention] * len(down_block_types) + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + if class_embeddings_concat: + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + res = 2**i + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + ) + self.down_blocks.append(down_block) + if mid_block_type == "UNetMidBlock3DCrossAttn": + self.mid_block = UNetMidBlock3DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + raise ValueError("UNetMidBlock2DSimpleCrossAttn not supported") + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + self.num_upsamplers = 0 + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + 
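To see what the down-block loop above produces with the default `block_out_channels=(320, 640, 1280, 1280)`, here is the channel bookkeeping in isolation (a sketch; the real loop also wires up attention heads and cross-attention settings):

```python
block_out_channels = (320, 640, 1280, 1280)

output_channel = block_out_channels[0]
for i, channels in enumerate(block_out_channels):
    input_channel = output_channel          # previous block's width
    output_channel = channels               # this block's width
    is_final_block = i == len(block_out_channels) - 1
    print(f"down block {i}: {input_channel} -> {output_channel}, "
          f"downsample={not is_final_block}")
# down block 0: 320 -> 320, downsample=True
# down block 1: 320 -> 640, downsample=True
# down block 2: 640 -> 1280, downsample=True
# down block 3: 1280 -> 1280, downsample=False
```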
reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) + only_cross_attention = list(reversed(only_cross_attention)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + res = 2 ** (len(up_block_types) - 1 - i) # noqa: * + is_final_block = i == len(block_out_channels) - 1 + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + if norm_num_groups is not None: + self.conv_norm_out = paddle.nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps + ) + self.conv_act = ppdiffusers.models.activations.get_activation(act_fn) + else: + self.conv_norm_out = None + self.conv_act = None + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = Conv3d( + block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + ) + + def temporal_parameters(self) -> list: + output = [] + all_blocks = list(self.down_blocks) + list(self.up_blocks) + [self.mid_block] + for block in all_blocks: + output.extend(block.temporal_parameters()) + return output + + @property + def attn_processors(self) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]: + return self.get_attn_processors(include_temporal_layers=False) + + def get_attn_processors( + self, include_temporal_layers=True + ) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]: + """ + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
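The `temporal_parameters()` helper above gathers only the temporal-attention weights, which can be convenient if one wants to fine-tune just those layers while keeping the spatial weights frozen. A hypothetical training-setup sketch, where `unet` is assumed to be an instance of the `UNet3DConditionModel` defined in this file:

```python
import paddle

for p in unet.parameters():
    p.stop_gradient = True            # freeze everything
for p in unet.temporal_parameters():
    p.stop_gradient = False           # unfreeze the temporal layers only

optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=unet.temporal_parameters(),
)
```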
+ """ + processors = {} + + def fn_recursive_add_processors( + name: str, + module: paddle.nn.Layer, + processors: Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor], + ): + if not include_temporal_layers: + if "temporal" in name: + return processors + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + return processors + + def set_attn_processor( + self, + processor: Union[ + ppdiffusers.models.attention_processor.AttentionProcessor, + Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor], + ], + include_temporal_layers=False, + ): + """ + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.get_attn_processors(include_temporal_layers=include_temporal_layers).keys()) + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor): + if not include_temporal_layers: + if "temporal" in name: + return + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + self.set_attn_processor(ppdiffusers.models.attention_processor.AttnProcessor()) + + def set_attention_slice(self, slice_size): + """ + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: paddle.nn.Layer): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + num_sliceable_layers = len(sliceable_head_dims) + if slice_size == "auto": + slice_size = [(dim // 2) for dim in sliceable_head_dims] + elif slice_size == "max": + slice_size = num_sliceable_layers * [1] + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: paddle.float32, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + return_dict: bool = True, + enable_temporal_attentions: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple]: + """ + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`paddle.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`paddle.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`paddle.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + encoder_attention_mask (`paddle.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. 
+ added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + default_overall_up_factor = 2**self.num_upsamplers + forward_upsample_size = False + upsample_size = None + if any(s % default_overall_up_factor != 0 for s in tuple(sample.shape)[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(axis=1) + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(axis=1) + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + timesteps = timestep + if not paddle.is_tensor(x=timesteps): + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = "float32" if is_mps else "float64" + else: + dtype = "int32" if is_mps else "int64" + timesteps = paddle.to_tensor(data=[timesteps], dtype=dtype, place=sample.place) + elif len(tuple(timesteps.shape)) == 0: + timesteps = timesteps[None].to(sample.place) + timesteps = timesteps.expand(shape=tuple(sample.shape)[0]) + t_emb = self.time_proj(timesteps) + t_emb = t_emb.to(dtype=sample.dtype) + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + class_labels = class_labels.to(dtype=sample.dtype) + class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) + if self.config.class_embeddings_concat: + emb = paddle.concat(x=[emb, class_emb], axis=-1) + else: + emb = emb + class_emb + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = 
time_embeds.reshape((tuple(text_embeds.shape)[0], -1)) + add_embeds = paddle.concat(x=[text_embeds, time_embeds], axis=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = paddle.concat(x=[sample, hint], axis=1) + emb = emb + aug_emb if aug_emb is not None else emb + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + sample = self.conv_in(sample) + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + enable_temporal_attentions=enable_temporal_attentions, + ) + else: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + enable_temporal_attentions=enable_temporal_attentions, + ) + down_block_res_samples += res_samples + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,) + down_block_res_samples = new_down_block_res_samples + if self.mid_block is not None: + sample = 
self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + enable_temporal_attentions=enable_temporal_attentions, + ) + if mid_block_additional_residual is not None: + sample = sample + mid_block_additional_residual + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + if not is_final_block and forward_upsample_size: + upsample_size = tuple(down_block_res_samples[-1].shape)[2:] + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + enable_temporal_attentions=enable_temporal_attentions, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + encoder_hidden_states=encoder_hidden_states, + enable_temporal_attentions=enable_temporal_attentions, + ) + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + if not return_dict: + return (sample,) + return UNet3DConditionOutput(sample=sample) + + @classmethod + def from_pretrained_spatial(cls, pretrained_model_path, subfolder=None): + import json + + if subfolder is not None: + pretrained_model_path = os.path.join(pretrained_model_path, subfolder) + config_file = os.path.join(pretrained_model_path, "config.json") + with open(config_file, "r") as f: + config = json.load(f) + config["_class_name"] = "UNet3DConditionModel" + config["down_block_types"] = ["DownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D"] + config["up_block_types"] = ["CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "UpBlock3D"] + config["mid_block_type"] = "UNetMidBlock3DCrossAttn" + model = cls.from_config(config) + model_files = [ + os.path.join(pretrained_model_path, "diffusion_paddle_model.bin"), + os.path.join(pretrained_model_path, "diffusion_paddle_model.safetensors"), + ] + model_file = None + for fp in model_files: + if os.path.exists(fp): + model_file = fp + if not model_file: + raise RuntimeError(f"{model_file} does not exist") + if model_file.split(".")[-1] == "safetensors": + from safetensors import safe_open + + state_dict = {} + with safe_open(model_file, framework="pt", device="cuda") as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key) + else: + state_dict = paddle.load(path=model_file) + model.set_state_dict(state_dict=state_dict, use_structured_name=False) + return model diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..093c3b912d7c5d4e382848fba1a984d7450bd1ad --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py @@ -0,0 +1,717 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
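For the `addition_embed_type == "text_time"` branch checked in the forward pass above, the caller is expected to supply pooled text embeddings and micro-conditioning `time_ids` via `added_cond_kwargs`. A hedged sketch of what that dictionary might look like; the 1280 width and the six `time_ids` values are typical SDXL choices used purely for illustration:

```python
import paddle

batch = 2
added_cond_kwargs = {
    # pooled text embedding; 1280 is a common SDXL width, shown only as an example
    "text_embeds": paddle.randn([batch, 1280]),
    # (orig_h, orig_w, crop_top, crop_left, target_h, target_w) per sample
    "time_ids": paddle.to_tensor([[1024, 1024, 0, 0, 1024, 1024]] * batch, dtype="float32"),
}
# passed to the model as:
# unet(sample, timestep, encoder_hidden_states, added_cond_kwargs=added_cond_kwargs)
```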
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed.fleet.utils import recompute + +from .resnet import Downsample3D, ResnetBlock3D, Upsample3D +from .transformer_3d import Transformer3DModel +from .transformer_temporal import TransformerTemporal + + +def get_down_block( + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + transformer_layers_per_block=1, + num_attention_heads=None, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + resnet_out_scale_factor=1.0, + cross_attention_norm=None, + attention_head_dim=None, + downsample_type=None, +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type + if down_block_type == "DownBlock3D": + return DownBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "CrossAttnDownBlock3D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D") + return CrossAttnDownBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + transformer_layers_per_block=transformer_layers_per_block, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise ValueError(f"{down_block_type} does not exist.") + + +def get_up_block( + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + transformer_layers_per_block=1, + num_attention_heads=None, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + resnet_out_scale_factor=1.0, + cross_attention_norm=None, + attention_head_dim=None, + upsample_type=None, +): + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + if up_block_type == "UpBlock3D": + return UpBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + 
add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "CrossAttnUpBlock3D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D") + return CrossAttnUpBlock3D( + num_layers=num_layers, + in_channels=in_channels, + transformer_layers_per_block=transformer_layers_per_block, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise ValueError(f"{up_block_type} does not exist.") + + +class UNetMidBlock3DCrossAttn(paddle.nn.Layer): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads=1, + output_scale_factor=1.0, + cross_attention_dim=1280, + dual_cross_attention=False, + use_linear_projection=False, + upcast_attention=False, + ): + super().__init__() + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + resnets = [ + ResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + for _ in range(num_layers): + if dual_cross_attention: + raise NotImplementedError + attentions.append( + Transformer3DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + ) + resnets.append( + ResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + self.attentions = paddle.nn.LayerList(sublayers=attentions) + self.resnets = paddle.nn.LayerList(sublayers=resnets) + + def forward( + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + enable_temporal_attentions: bool = True, + ): + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample + hidden_states = resnet(hidden_states, temb) + return hidden_states + + def temporal_parameters(self) -> list: + return 
[] + + +class CrossAttnDownBlock3D(paddle.nn.Layer): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): + super().__init__() + resnets = [] + attentions = [] + temporal_attentions = [] + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock3D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if dual_cross_attention: + raise NotImplementedError + attentions.append( + Transformer3DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + ) + temporal_attentions.append( + TransformerTemporal( + num_attention_heads=8, + attention_head_dim=out_channels // 8, + in_channels=out_channels, + cross_attention_dim=None, + ) + ) + self.attentions = paddle.nn.LayerList(sublayers=attentions) + self.resnets = paddle.nn.LayerList(sublayers=resnets) + self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions) + if add_downsample: + self.downsamplers = paddle.nn.LayerList( + sublayers=[ + Downsample3D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + enable_temporal_attentions: bool = True, + ): + output_states = () + for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False) + hidden_states = recompute( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + use_reentrant=False, + )[0] + if enable_temporal_attentions and temporal_attention is not None: + hidden_states = recompute( + create_custom_forward(temporal_attention), + hidden_states, + encoder_hidden_states, + use_reentrant=False, + ) + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn(hidden_states, 
encoder_hidden_states=encoder_hidden_states).sample + if temporal_attention and enable_temporal_attentions: + hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states) + output_states += (hidden_states,) + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + output_states += (hidden_states,) + return hidden_states, output_states + + def temporal_parameters(self) -> list: + output = [] + for block in self.temporal_attentions: + if block: + output.extend(block.parameters()) + return output + + +class DownBlock3D(paddle.nn.Layer): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_downsample=True, + downsample_padding=1, + ): + super().__init__() + resnets = [] + temporal_attentions = [] + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock3D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temporal_attentions.append( + TransformerTemporal( + num_attention_heads=8, + attention_head_dim=out_channels // 8, + in_channels=out_channels, + cross_attention_dim=None, + ) + ) + self.resnets = paddle.nn.LayerList(sublayers=resnets) + self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions) + if add_downsample: + self.downsamplers = paddle.nn.LayerList( + sublayers=[ + Downsample3D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + self.gradient_checkpointing = False + + def forward(self, hidden_states, temb=None, encoder_hidden_states=None, enable_temporal_attentions: bool = True): + output_states = () + for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False) + if enable_temporal_attentions and temporal_attention is not None: + hidden_states = recompute( + create_custom_forward(temporal_attention), + hidden_states, + encoder_hidden_states, + use_reentrant=False, + ) + else: + hidden_states = resnet(hidden_states, temb) + if enable_temporal_attentions and temporal_attention: + hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states) + output_states += (hidden_states,) + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + output_states += (hidden_states,) + return hidden_states, output_states + + def temporal_parameters(self) -> list: + output = [] + for block in self.temporal_attentions: + if block: + output.extend(block.parameters()) + return output + + +class CrossAttnUpBlock3D(paddle.nn.Layer): + def __init__( + self, + in_channels: int, + out_channels: int, 
+ prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + add_upsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): + super().__init__() + resnets = [] + attentions = [] + temporal_attentions = [] + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + for i in range(num_layers): + res_skip_channels = in_channels if i == num_layers - 1 else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + resnets.append( + ResnetBlock3D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if dual_cross_attention: + raise NotImplementedError + attentions.append( + Transformer3DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + ) + temporal_attentions.append( + TransformerTemporal( + num_attention_heads=8, + attention_head_dim=out_channels // 8, + in_channels=out_channels, + cross_attention_dim=None, + ) + ) + self.attentions = paddle.nn.LayerList(sublayers=attentions) + self.resnets = paddle.nn.LayerList(sublayers=resnets) + self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions) + if add_upsample: + self.upsamplers = paddle.nn.LayerList( + sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + upsample_size=None, + cross_attention_kwargs=None, + attention_mask=None, + enable_temporal_attentions: bool = True, + ): + for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions): + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1) + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False) + hidden_states = recompute( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + use_reentrant=False, + )[0] + if enable_temporal_attentions and temporal_attention is not None: + hidden_states = recompute( + create_custom_forward(temporal_attention), + hidden_states, + encoder_hidden_states, + 
use_reentrant=False, + ) + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample + if enable_temporal_attentions and temporal_attention: + hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states) + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + return hidden_states + + def temporal_parameters(self) -> list: + output = [] + for block in self.temporal_attentions: + if block: + output.extend(block.parameters()) + return output + + +class UpBlock3D(paddle.nn.Layer): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_upsample=True, + ): + super().__init__() + resnets = [] + temporal_attentions = [] + for i in range(num_layers): + res_skip_channels = in_channels if i == num_layers - 1 else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + resnets.append( + ResnetBlock3D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temporal_attentions.append( + TransformerTemporal( + num_attention_heads=8, + attention_head_dim=out_channels // 8, + in_channels=out_channels, + cross_attention_dim=None, + ) + ) + self.resnets = paddle.nn.LayerList(sublayers=resnets) + self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions) + if add_upsample: + self.upsamplers = paddle.nn.LayerList( + sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None, + encoder_hidden_states=None, + enable_temporal_attentions: bool = True, + ): + for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions): + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1) + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False) + if enable_temporal_attentions and temporal_attention is not None: + hidden_states = recompute( + create_custom_forward(temporal_attention), + hidden_states, + encoder_hidden_states, + use_reentrant=False, + ) + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = ( + temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states) + if enable_temporal_attentions and temporal_attention is not None + else hidden_states + ) + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + return 
hidden_states + + def temporal_parameters(self) -> list: + output = [] + for block in self.temporal_attentions: + if block: + output.extend(block.parameters()) + return output diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b2cc31a03a59f05a9bc2b53fb21829b2dbd83cbe --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py @@ -0,0 +1,162 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import paddle +import paddle_aux + +import ppdiffusers + +from .loss_weights import * +from .noise_conditions import * +from .samplers import * +from .scalers import * +from .schedulers import * +from .targets import * + + +class GDF: + def __init__(self, schedule, input_scaler, target, noise_cond, loss_weight, offset_noise=0): + self.schedule = schedule + self.input_scaler = input_scaler + self.target = target + self.noise_cond = noise_cond + self.loss_weight = loss_weight + self.offset_noise = offset_noise + + def setup_limits(self, stretch_max=True, stretch_min=True, shift=1): + stretched_limits = self.input_scaler.setup_limits( + self.schedule, self.input_scaler, stretch_max, stretch_min, shift + ) + return stretched_limits + + def diffuse(self, x0, epsilon=None, t=None, shift=1, loss_shift=1, offset=None): + if epsilon is None: + epsilon = paddle.randn(shape=x0.shape, dtype=x0.dtype) + + if self.offset_noise > 0: + if offset is None: + offset = paddle.randn( + shape=[x0.shape[0], x0.shape[1]] + [1] * (len(x0.shape) - 2), + ) + epsilon = epsilon + offset * self.offset_noise + logSNR = self.schedule(x0.shape[0] if t is None else t, shift=shift) + a, b = self.input_scaler(logSNR) + if len(a.shape) == 1: + a, b = a.reshape([-1, *([1] * (len(x0.shape) - 1))]), b.reshape([-1, *([1] * (len(x0.shape) - 1))]) + target = self.target(x0, epsilon, logSNR, a, b) + return ( + x0 * a + epsilon * b, + epsilon, + target, + logSNR, + self.noise_cond(logSNR), + self.loss_weight(logSNR, shift=loss_shift), + ) + + def undiffuse(self, x, logSNR, pred): + a, b = self.input_scaler(logSNR) + if len(a.shape) == 1: + a, b = a.reshape([-1, *([1] * (len(x.shape) - 1))]), b.reshape([-1, *([1] * (len(x.shape) - 1))]) + return self.target.x0(x, pred, logSNR, a, b), self.target.epsilon(x, pred, logSNR, a, b) + + def sample( + self, + model, + model_inputs, + shape, + unconditional_inputs=None, + sampler=None, + schedule=None, + t_start=1.0, + t_end=0.0, + timesteps=20, + x_init=None, + cfg=3.0, + cfg_t_stop=None, + cfg_t_start=None, + cfg_rho=0.7, + sampler_params=None, + shift=1, + device="cpu", + ): + sampler_params = {} if sampler_params is None else sampler_params + if sampler is None: + sampler = DDPMSampler(self) # noqa + r_range = paddle.linspace(start=t_start, stop=t_end, num=timesteps + 1) + 
schedule = self.schedule if schedule is None else schedule + logSNR_range = ( + schedule(r_range, shift=shift)[:, None] + .expand(shape=[-1, shape[0] if x_init is None else x_init.shape[0]]) + .to(device) + ) + x = sampler.init_x(shape).to(device) if x_init is None else x_init.clone() + if cfg is not None: + if unconditional_inputs is None: + unconditional_inputs = {k: paddle.zeros_like(x=v) for k, v in model_inputs.items()} + model_inputs = { + k: ( + paddle.concat(x=[v, v_u], axis=0) + if isinstance(v, paddle.Tensor) + else [ + ( + paddle.concat(x=[vi, vi_u], axis=0) + if isinstance(vi, paddle.Tensor) and isinstance(vi_u, paddle.Tensor) + else None + ) + for vi, vi_u in zip(v, v_u) + ] + if isinstance(v, list) + else {vk: paddle.concat(x=[v[vk], v_u.get(vk, paddle.zeros_like(x=v[vk]))], axis=0) for vk in v} + if isinstance(v, dict) + else None + ) + for (k, v), (k_u, v_u) in zip(model_inputs.items(), unconditional_inputs.items()) + } + for i in range(0, timesteps): + noise_cond = self.noise_cond(logSNR_range[i]) + if ( + cfg is not None + and (cfg_t_stop is None or r_range[i].item() >= cfg_t_stop) + and (cfg_t_start is None or r_range[i].item() <= cfg_t_start) + ): + cfg_val = cfg + if isinstance(cfg_val, (list, tuple)): + assert len(cfg_val) == 2, "cfg must be a float or a list/tuple of length 2" + cfg_val = cfg_val[0] * r_range[i].item() + cfg_val[1] * (1 - r_range[i].item()) + + pred, pred_unconditional = model( + paddle.concat(x=[x, x], axis=0), noise_cond.repeat(2), **model_inputs + ).chunk(chunks=2) + + pred_cfg = paddle.lerp(pred_unconditional, pred, paddle.to_tensor(cfg_val, dtype=paddle.float32)) + if cfg_rho > 0: + std_pos, std_cfg = pred.std(), pred_cfg.std() + pred = cfg_rho * (pred_cfg * std_pos / (std_cfg + 1e-9)) + pred_cfg * (1 - cfg_rho) + else: + pred = pred_cfg + else: + pred = model(x, noise_cond, **model_inputs) + + x0, epsilon = self.undiffuse(x, logSNR_range[i], pred) + x = sampler(x, x0, epsilon, logSNR_range[i], logSNR_range[i + 1], **sampler_params) + altered_vars = yield x0, x, pred + if altered_vars is not None: + cfg = altered_vars.get("cfg", cfg) + cfg_rho = altered_vars.get("cfg_rho", cfg_rho) + sampler = altered_vars.get("sampler", sampler) + model_inputs = altered_vars.get("model_inputs", model_inputs) + x = altered_vars.get("x", x) + x_init = altered_vars.get("x_init", x_init) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py new file mode 100644 index 0000000000000000000000000000000000000000..e2fefb2dd19a63300881b315e085661da7ca16a2 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py @@ -0,0 +1,128 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
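Illustrative sketch of how a GDF instance is composed and driven in the forward (noising) direction. The schedule and noise-conditioning callables below are toy stand-ins written only for this example; VPScaler, EpsilonTarget and P2LossWeight are the classes defined in the scalers.py, targets.py and loss_weights.py files added in this diff, and the real pipeline wires in classes from schedulers.py and noise_conditions.py instead of the stand-ins.

import paddle

def toy_schedule(t, shift=1):
    # GDF.diffuse passes the batch size when t is None; sample t uniformly then.
    if isinstance(t, int):
        t = paddle.rand(shape=[t])
    return (1 - t) * 10.0 - 5.0  # map t in [0, 1] to a logSNR in [-5, 5]

toy_noise_cond = lambda logSNR: (logSNR / 10.0 + 0.5).clip(0, 1)

gdf = GDF(
    schedule=toy_schedule,
    input_scaler=VPScaler(),    # variance preserving: a^2 + b^2 = 1
    target=EpsilonTarget(),     # the network is trained to predict epsilon
    noise_cond=toy_noise_cond,
    loss_weight=P2LossWeight(),
)

x0 = paddle.randn(shape=[2, 4, 8, 8])
noised, epsilon, target, logSNR, noise_cond, loss_w = gdf.diffuse(x0)

# undiffuse() inverts the forward process given a prediction; feeding the
# (perfect) target back in recovers x0 exactly for EpsilonTarget.
x0_rec, _ = gdf.undiffuse(noised, logSNR, target)
print(float((x0_rec - x0).abs().max()))  # ~0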
+ +import numpy as np +import paddle +import paddle_aux # noqa + + +class BaseLossWeight: + def weight(self, logSNR): + raise NotImplementedError("this method needs to be overridden") + + def __call__(self, logSNR, *args, shift=1, clamp_range=None, **kwargs): + clamp_range = [-1000000000.0, 1000000000.0] if clamp_range is None else clamp_range + if shift != 1: + logSNR = logSNR.clone() + 2 * np.log(shift) + return self.weight(logSNR, *args, **kwargs).clip(*clamp_range) + + +class ComposedLossWeight(BaseLossWeight): + def __init__(self, div, mul): + self.mul = [mul] if isinstance(mul, BaseLossWeight) else mul + self.div = [div] if isinstance(div, BaseLossWeight) else div + + def weight(self, logSNR): + prod, div = 1, 1 + for m in self.mul: + prod *= m.weight(logSNR) + for d in self.div: + div *= d.weight(logSNR) + return prod / div + + +class ConstantLossWeight(BaseLossWeight): + def __init__(self, v=1): + self.v = v + + def weight(self, logSNR): + return paddle.ones_like(x=logSNR) * self.v + + +class SNRLossWeight(BaseLossWeight): + def weight(self, logSNR): + return logSNR.exp() + + +class P2LossWeight(BaseLossWeight): + def __init__(self, k=1.0, gamma=1.0, s=1.0): + self.k, self.gamma, self.s = k, gamma, s + + def weight(self, logSNR): + return (self.k + (logSNR * self.s).exp()) ** -self.gamma + + +class SNRPlusOneLossWeight(BaseLossWeight): + def weight(self, logSNR): + return logSNR.exp() + 1 + + +class MinSNRLossWeight(BaseLossWeight): + def __init__(self, max_snr=5): + self.max_snr = max_snr + + def weight(self, logSNR): + return logSNR.exp().clip(max=self.max_snr) + + +class MinSNRPlusOneLossWeight(BaseLossWeight): + def __init__(self, max_snr=5): + self.max_snr = max_snr + + def weight(self, logSNR): + return (logSNR.exp() + 1).clip(max=self.max_snr) + + +class TruncatedSNRLossWeight(BaseLossWeight): + def __init__(self, min_snr=1): + self.min_snr = min_snr + + def weight(self, logSNR): + return logSNR.exp().clip(min=self.min_snr) + + +class SechLossWeight(BaseLossWeight): + def __init__(self, div=2): + self.div = div + + def weight(self, logSNR): + return 1 / (logSNR / self.div).cosh() + + +class DebiasedLossWeight(BaseLossWeight): + def weight(self, logSNR): + return 1 / logSNR.exp().sqrt() + + +class SigmoidLossWeight(BaseLossWeight): + def __init__(self, s=1): + self.s = s + + def weight(self, logSNR): + return (logSNR * self.s).sigmoid() + + +class AdaptiveLossWeight(BaseLossWeight): + def __init__(self, logsnr_range=[-10, 10], buckets=300, weight_range=[1e-07, 10000000.0]): + self.bucket_ranges = paddle.linspace(start=logsnr_range[0], stop=logsnr_range[1], num=buckets - 1) + self.bucket_losses = paddle.ones(shape=buckets) + self.weight_range = weight_range + + def weight(self, logSNR): + indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR) + return (1 / self.bucket_losses.to(logSNR.place)[indices]).clip([*self.weight_range]) + + def update_buckets(self, logSNR, loss, beta=0.99): + indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR).cpu() + self.bucket_losses[indices] = self.bucket_losses[indices] * beta + loss.detach().cpu() * (1 - beta) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py new file mode 100644 index 0000000000000000000000000000000000000000..6ea70592b8882b8261a52a8e6d2717fb7c28c3cb --- /dev/null +++ 
b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +class BaseScaler: + def __init__(self): + self.stretched_limits = None + + def setup_limits(self, schedule, input_scaler, stretch_max=True, stretch_min=True, shift=1): + min_logSNR = schedule(paddle.ones(shape=[1]), shift=shift) + max_logSNR = schedule(paddle.zeros(shape=[1]), shift=shift) + min_a, max_b = [v.item() for v in input_scaler(min_logSNR)] if stretch_max else [0, 1] + max_a, min_b = [v.item() for v in input_scaler(max_logSNR)] if stretch_min else [1, 0] + self.stretched_limits = [min_a, max_a, min_b, max_b] + return self.stretched_limits + + def stretch_limits(self, a, b): + min_a, max_a, min_b, max_b = self.stretched_limits + return (a - min_a) / (max_a - min_a), (b - min_b) / (max_b - min_b) + + def scalers(self, logSNR): + raise NotImplementedError("this method needs to be overridden") + + def __call__(self, logSNR): + a, b = self.scalers(logSNR) + if self.stretched_limits is not None: + a, b = self.stretch_limits(a, b) + return a, b + + +class VPScaler(BaseScaler): + def scalers(self, logSNR): + a_squared = logSNR.sigmoid() + a = a_squared.sqrt() + b = (1 - a_squared).sqrt() + return a, b + + +class LERPScaler(BaseScaler): + def scalers(self, logSNR): + _a = logSNR.exp() - 1 + _a[_a == 0] = 0.001 + a = 1 + (2 - (2**2 + 4 * _a) ** 0.5) / (2 * _a) + b = 1 - a + return a, b diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py new file mode 100644 index 0000000000000000000000000000000000000000..51fb2e2e4601cbff4910892b861f06b2040d6e2d --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
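Quick sanity check of the two scalers above (illustrative only): VPScaler is the variance-preserving parameterisation (a^2 + b^2 = 1), while LERPScaler linearly interpolates between data and noise (a + b = 1), which pairs naturally with the RectifiedFlowsTarget defined below.

import paddle

logSNR = paddle.to_tensor([-4.0, 0.0, 4.0])

a, b = VPScaler()(logSNR)
print((a**2 + b**2).numpy())  # ~[1. 1. 1.]

a, b = LERPScaler()(logSNR)
print((a + b).numpy())        # [1. 1. 1.] by construction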
+ + +class EpsilonTarget: + def __call__(self, x0, epsilon, logSNR, a, b): + return epsilon + + def x0(self, noised, pred, logSNR, a, b): + return (noised - pred * b) / a + + def epsilon(self, noised, pred, logSNR, a, b): + return pred + + +class X0Target: + def __call__(self, x0, epsilon, logSNR, a, b): + return x0 + + def x0(self, noised, pred, logSNR, a, b): + return pred + + def epsilon(self, noised, pred, logSNR, a, b): + return (noised - pred * a) / b + + +class VTarget: + def __call__(self, x0, epsilon, logSNR, a, b): + return a * epsilon - b * x0 + + def x0(self, noised, pred, logSNR, a, b): + squared_sum = a**2 + b**2 + return a / squared_sum * noised - b / squared_sum * pred + + def epsilon(self, noised, pred, logSNR, a, b): + squared_sum = a**2 + b**2 + return b / squared_sum * noised + a / squared_sum * pred + + +class RectifiedFlowsTarget: + def __call__(self, x0, epsilon, logSNR, a, b): + return epsilon - x0 + + def x0(self, noised, pred, logSNR, a, b): + return noised - pred * b + + def epsilon(self, noised, pred, logSNR, a, b): + return noised + pred * a diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..063951a2f34e2da6d2ac9dd82221183876e22354 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .effnet import EfficientNetEncoder +from .previewer import Previewer +from .stage_c import AttnBlock, FeedForwardBlock, ResBlock, StageC, TimestepBlock diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py new file mode 100644 index 0000000000000000000000000000000000000000..93724d128cab9e8b7d34438c1ae1f0bc467cc963 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py @@ -0,0 +1,151 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
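Each target class defines the training target together with the inverse mappings back to x0 and epsilon that GDF.undiffuse relies on. A small self-consistency check for VTarget under a variance-preserving scaler (illustrative sketch, not part of the diff):

import paddle

a = paddle.to_tensor(0.8)
b = (1 - a**2).sqrt()              # variance preserving: a^2 + b^2 = 1
x0, eps = paddle.randn(shape=[4]), paddle.randn(shape=[4])
noised = a * x0 + b * eps

target = VTarget()
v = target(x0, eps, None, a, b)    # v = a*eps - b*x0
print(float((target.x0(noised, v, None, a, b) - x0).abs().max()))        # ~0
print(float((target.epsilon(noised, v, None, a, b) - eps).abs().max()))  # ~0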
+ +import numpy as np +import paddle +import paddle.nn as nn + + +def load(path="../x.npy"): + return paddle.to_tensor(np.load(path)) + + +def diff(a, b): + return (a - b).abs().mean() + + +class Linear(nn.Linear): + def reset_parameters(self): + return None + + +class Conv2d(nn.Conv2D): + def reset_parameters(self): + return None + + +class Attention2D(nn.Layer): + def __init__(self, c, nhead, dropout=0.0): + super().__init__() + self.attn = nn.MultiHeadAttention(c, nhead, dropout=dropout) + + def forward(self, x, kv, self_attn=False): + orig_shape = x.shape + x = x.reshape([x.shape[0], x.shape[1], -1]).transpose([0, 2, 1]) + if self_attn: + kv = paddle.concat([x, kv], axis=1) + x = self.attn(x, kv, kv) + x = x.transpose([0, 2, 1]).reshape(orig_shape) + return x + + +class LayerNorm2d(nn.LayerNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x): + return super().forward(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) + + +class GlobalResponseNorm(nn.Layer): + def __init__(self, dim): + super(GlobalResponseNorm, self).__init__() + self.gamma = self.create_parameter( + shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0) + ) + self.beta = self.create_parameter( + shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0) + ) + self.gamma.stop_gradient = False + self.beta.stop_gradient = False + + def forward(self, x): + Gx = paddle.norm(x, p=2, axis=(1, 2), keepdim=True) + Nx = Gx / (paddle.mean(Gx, axis=-1, keepdim=True) + 1e-6) + x = self.gamma * (x * Nx) + self.beta + x + return x + + +class ResBlock(nn.Layer): + def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0): + super().__init__() + self.depthwise = Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c) + self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06) + self.channelwise = nn.Sequential( + Linear(c + c_skip, c * 4), + nn.GELU(), + GlobalResponseNorm(c * 4), + nn.Dropout(p=dropout), + Linear(c * 4, c), + ) + + def forward(self, x, x_skip=None): + x_res = x + x = self.depthwise(x) + x = self.norm(x) + if x_skip is not None: + x = paddle.concat(x=[x, x_skip], axis=1) + + x = self.channelwise(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) + return x + x_res + + +class AttnBlock(nn.Layer): + def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0): + super().__init__() + self.self_attn = self_attn + self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06) + self.attention = Attention2D(c, nhead, dropout) + self.kv_mapper = nn.Sequential(nn.Silu(), Linear(c_cond, c)) + + def forward(self, x, kv): + kv = self.kv_mapper(kv) + x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn) + return x + + +class FeedForwardBlock(nn.Layer): + def __init__(self, c, dropout=0.0): + super().__init__() + self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06) + self.channelwise = nn.Sequential( + Linear(c, c * 4), + nn.GELU(), + GlobalResponseNorm(c * 4), + nn.Dropout(p=dropout), + Linear(c * 4, c), + ) + + def forward(self, x): + x = x + self.channelwise(self.norm(x).transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) + return x + + +class TimestepBlock(nn.Layer): + def __init__(self, c, c_timestep, conds=["sca"], trainable=True): + super(TimestepBlock, self).__init__() + self.mapper = nn.Linear(c_timestep, c * 2, bias_attr=trainable) + self.conds = conds + for cname in conds: + setattr(self, 
f"mapper_{cname}", nn.Linear(c_timestep, c * 2, bias_attr=trainable)) + + def forward(self, x, t): + t = paddle.split(t, num_or_sections=len(self.conds) + 1, axis=1) + a_b = self.mapper(t[0]) + a, b = a_b[:, : a_b.shape[1] // 2, None, None], a_b[:, a_b.shape[1] // 2 :, None, None] + for i, c in enumerate(self.conds): + ac_bc = getattr(self, f"mapper_{c}")(t[i + 1]) + ac, bc = ac_bc[:, : ac_bc.shape[1] // 2, None, None], ac_bc[:, ac_bc.shape[1] // 2 :, None, None] + a, b = a + ac, b + bc + return x * (1 + a) + b diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py new file mode 100644 index 0000000000000000000000000000000000000000..c9497b6373f4b8f289fbadc9b318ff4bd14a1741 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py @@ -0,0 +1,561 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import math +from dataclasses import dataclass +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union + +import paddle +import paddle.nn as nn +from paddle import Tensor +from paddle.nn import ( + AdaptiveAvgPool2D, + BatchNorm, + BatchNorm2D, + Conv2D, + Dropout, + GroupNorm, + Layer, + Linear, + ReLU, + Sequential, + Sigmoid, + Silu, +) +from paddle.nn.initializer import Constant, KaimingNormal, Uniform +from paddle.utils.download import get_weights_path_from_url + +__all__ = ["EfficientNet", "EfficientNet_V2_S_Weights", "efficientnet_v2_s"] + + +class SqueezeExcitation(paddle.nn.Layer): + """ + This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). + Parameters ``activation`` and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3. + + Args: + input_channels (int): Number of channels in the input feature maps + squeeze_channels (int): Number of squeeze channels + activation (Callable[[Tensor], Tensor], optional): ``delta`` activation. Default: ReLU + scale_activation (Callable[[Tensor], Tensor], optional): ``sigma`` activation. 
Default: Sigmoid + """ + + def __init__( + self, + input_channels: int, + squeeze_channels: int, + activation: Callable[[Tensor], Tensor] = ReLU(), + scale_activation: Callable[[Tensor], Tensor] = Sigmoid(), + ) -> None: + super(SqueezeExcitation, self).__init__() + self.avgpool = AdaptiveAvgPool2D(1) + self.fc1 = Conv2D(in_channels=input_channels, out_channels=squeeze_channels, kernel_size=1) + self.fc2 = Conv2D(in_channels=squeeze_channels, out_channels=input_channels, kernel_size=1) + self.activation = activation + self.scale_activation = scale_activation + + def forward(self, input: paddle.Tensor) -> paddle.Tensor: + scale = self.avgpool(input) + scale = self.fc1(scale) + scale = self.activation(scale) + scale = self.fc2(scale) + scale = self.scale_activation(scale) + return scale * input + + +def stochastic_depth(input, p, mode, training=True): + """ + Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth" + `_ used for randomly dropping residual + branches of residual architectures. + + Args: + input (paddle.Tensor): The input tensor or arbitrary dimensions with the first one + being its batch i.e. a batch with ``N`` rows. + p (float): probability of the input to be zeroed. + mode (str): ``"batch"`` or ``"row"``. + ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes + randomly selected rows from the batch. + training (bool): apply stochastic depth if is ``True``. Default: ``True`` + + Returns: + paddle.Tensor: The randomly zeroed tensor. + """ + if p < 0.0 or p > 1.0: + raise ValueError(f"drop probability has to be between 0 and 1, but got {p}") + if mode not in ["batch", "row"]: + raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}") + if not training or p == 0.0: + return input + + survival_rate = 1.0 - p + if mode == "row": + size = [input.shape[0]] + [1] * (input.ndim - 1) + else: + size = [1] * input.ndim + noise = paddle.empty(size, dtype=input.dtype) + survival_rate = paddle.to_tensor(survival_rate, dtype=input.dtype) + paddle.assign(paddle.bernoulli(paddle.broadcast_to(survival_rate, noise.shape)), noise) + if survival_rate > 0.0: + noise /= survival_rate + return input * noise + + +class StochasticDepth(Layer): + """ + See :func:`stochastic_depth`. 
+ """ + + def __init__(self, p: float, mode: str) -> None: + super(StochasticDepth, self).__init__() + self.p = p + self.mode = mode + + def forward(self, input): + return stochastic_depth(input, self.p, self.mode, self.training) + + def __repr__(self): + s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})" + return s + + +def _make_ntuple(value, n): + """Helper function to create a tuple of size n with the given value.""" + if isinstance(value, int): + return (value,) * n + return value + + +class ConvNormActivation(Sequential): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Sequence[int]] = 3, + stride: Union[int, Sequence[int]] = 1, + padding: Optional[Union[int, Sequence[int], str]] = None, + groups: int = 1, + norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm, + activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU, + dilation: Union[int, Sequence[int]] = 1, + inplace: Optional[bool] = True, + bias: Optional[bool] = None, + conv_layer: Callable[..., Conv2D] = Conv2D, + ) -> None: + if padding is None: + padding = (kernel_size - 1) // 2 * dilation + else: + padding = _make_ntuple(padding, len(kernel_size)) + + layers = [ + conv_layer( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias_attr=False if bias is None else bias, + ) + ] + + if norm_layer is not None: + norm_layer_instance = norm_layer(out_channels, use_global_stats=True) + layers.append(norm_layer_instance) + + if activation_layer is not None: + layers.append(activation_layer) + + super(ConvNormActivation, self).__init__(*layers) + self.out_channels = out_channels + + +class Conv2DNormActivation(ConvNormActivation): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]] = 3, + stride: Union[int, Tuple[int, int]] = 1, + padding: Optional[Union[int, Tuple[int, int], str]] = None, + groups: int = 1, + norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm, + activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU, + dilation: Union[int, Tuple[int, int]] = 1, + inplace: Optional[bool] = True, + bias: Optional[bool] = None, + ) -> None: + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups, + norm_layer, + activation_layer, + dilation, + inplace, + bias, + Conv2D, + ) + + +class EfficientNet_V2_S_Weights: + IMAGENET1K_V1 = "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" + + def __init__(self, url: str, transforms: Callable[..., Any], meta: Dict[str, Any]) -> None: + self.url = url + self.transforms = transforms + self.meta = meta + + def state_dict(self, progress: bool = True, check_hash: bool = False) -> Dict[str, Any]: + path = get_weights_path_from_url(self.url, progress=progress, check_hash=check_hash) + return paddle.load(path) + + @classmethod + def verify(cls, weights): + if weights is None: + return None + if not isinstance(weights, EfficientNet_V2_S_Weights): + raise ValueError(f"weights must be an instance of EfficientNet_V2_S_Weights, but got {type(weights)}") + return weights + + +@dataclass +class _MBConvConfig: + expand_ratio: float + kernel: int + stride: int + input_channels: int + out_channels: int + num_layers: int + block: Callable[..., paddle.nn.Layer] + + @staticmethod + def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int: + return _make_divisible(channels * width_mult, 8, min_value) + + +class 
MBConvConfig(_MBConvConfig): + def __init__( + self, + expand_ratio: float, + kernel: int, + stride: int, + input_channels: int, + out_channels: int, + num_layers: int, + width_mult: float = 1.0, + depth_mult: float = 1.0, + block: Optional[Callable[..., paddle.nn.Layer]] = None, + ) -> None: + input_channels = self.adjust_channels(input_channels, width_mult) + out_channels = self.adjust_channels(out_channels, width_mult) + num_layers = self.adjust_depth(num_layers, depth_mult) + if block is None: + block = MBConv + super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block) + + @staticmethod + def adjust_depth(num_layers: int, depth_mult: float): + return int(math.ceil(num_layers * depth_mult)) + + +class FusedMBConvConfig(_MBConvConfig): + def __init__( + self, + expand_ratio: float, + kernel: int, + stride: int, + input_channels: int, + out_channels: int, + num_layers: int, + block: Optional[Callable[..., paddle.nn.Layer]] = None, + ) -> None: + if block is None: + block = FusedMBConv + super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block) + + +class MBConv(Layer): + def __init__( + self, + cnf, + stochastic_depth_prob: float, + norm_layer: Callable[..., Layer], + se_layer: Callable[..., Layer] = SqueezeExcitation, + ) -> None: + super(MBConv, self).__init__() + + if not (1 <= cnf.stride <= 2): + raise ValueError("illegal stride value") + + self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels + + layers = [] + activation_layer = nn.Silu() + + # expand + expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio) + if expanded_channels != cnf.input_channels: + layers.append( + Conv2DNormActivation( + cnf.input_channels, + expanded_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # depthwise + layers.append( + Conv2DNormActivation( + expanded_channels, + expanded_channels, + kernel_size=cnf.kernel, + stride=cnf.stride, + groups=expanded_channels, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # squeeze and excitation + squeeze_channels = max(1, cnf.input_channels // 4) + layers.append(se_layer(expanded_channels, squeeze_channels, activation=nn.Silu())) + + # project + layers.append( + Conv2DNormActivation( + expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None + ) + ) + + self.block = Sequential(*layers) + self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") + self.out_channels = cnf.out_channels + + def forward(self, input) -> paddle.Tensor: + result = self.block(input) + if self.use_res_connect: + result = self.stochastic_depth(result) + result += input + return result + + +class FusedMBConv(Layer): + def __init__( + self, + cnf: "FusedMBConvConfig", + stochastic_depth_prob: float, + norm_layer: Callable[..., Layer], + ) -> None: + super(FusedMBConv, self).__init__() + + if not (1 <= cnf.stride <= 2): + raise ValueError("illegal stride value") + + self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels + + layers: List[Layer] = [] + activation_layer = nn.Silu() + + expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio) + if expanded_channels != cnf.input_channels: + # fused expand and project + layers.append( + Conv2DNormActivation( + cnf.input_channels, + expanded_channels, + kernel_size=cnf.kernel, + stride=cnf.stride, + norm_layer=norm_layer, + 
activation_layer=activation_layer, + ) + ) + # project + layers.append( + Conv2DNormActivation( + expanded_channels, + cnf.out_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=None, + ) + ) + else: + layers.append( + Conv2DNormActivation( + cnf.input_channels, + cnf.out_channels, + kernel_size=cnf.kernel, + stride=cnf.stride, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + self.block = Sequential(*layers) + self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") + self.out_channels = cnf.out_channels + + def forward(self, input: Tensor) -> Tensor: + result = self.block(input) + if self.use_res_connect: + result = self.stochastic_depth(result) + result += input + return result + + +class EfficientNet(Layer): + def __init__( + self, + inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]], + dropout: float, + stochastic_depth_prob: float = 0.2, + num_classes: int = 1000, + norm_layer: Optional[Callable[..., paddle.nn.Layer]] = None, + last_channel: Optional[int] = None, + ) -> None: + super().__init__() + if not inverted_residual_setting: + raise ValueError("The inverted_residual_setting should not be empty") + elif not ( + isinstance(inverted_residual_setting, Sequence) + and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting]) + ): + raise TypeError("The inverted_residual_setting should be List[MBConvConfig]") + if norm_layer is None: + norm_layer = BatchNorm2D + layers: List[paddle.nn.Layer] = [] + firstconv_output_channels = inverted_residual_setting[0].input_channels + layers.append( + Conv2DNormActivation( + 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=Silu() + ) + ) + total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting) + stage_block_id = 0 + for cnf in inverted_residual_setting: + stage: List[paddle.nn.Layer] = [] + for _ in range(cnf.num_layers): + block_cnf = copy.copy(cnf) + if stage: + block_cnf.input_channels = block_cnf.out_channels + block_cnf.stride = 1 + sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks + stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer)) + stage_block_id += 1 + layers.append(Sequential(*stage)) + lastconv_input_channels = inverted_residual_setting[-1].out_channels + lastconv_output_channels = last_channel if last_channel is not None else 4 * lastconv_input_channels + layers.append( + Conv2DNormActivation( + lastconv_input_channels, + lastconv_output_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=Silu(), + ) + ) + self.features = Sequential(*layers) + self.avgpool = AdaptiveAvgPool2D(output_size=1) + self.classifier = Sequential( + Dropout(p=dropout), Linear(in_features=lastconv_output_channels, out_features=num_classes) + ) + + for m in self.sublayers(): + if isinstance(m, Conv2D): + KaimingNormal()(m.weight) + if m.bias is not None: + Constant(value=0.0)(m.bias) + elif isinstance(m, (BatchNorm2D, GroupNorm)): + Constant(value=1.0)(m.weight) + Constant(value=0.0)(m.bias) + elif isinstance(m, Linear): + init_range = 1.0 / math.sqrt(m.weight.shape[1]) + Uniform(low=-init_range, high=init_range)(m.weight) + Constant(value=0.0)(m.bias) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.features(x) + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + x = self.classifier(x) + return x + + +def _make_divisible(value: float, divisor: int, min_value: Optional[int] = None) -> int: + if min_value is None: 
+ min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + if new_value < 0.9 * value: + new_value += divisor + return new_value + + +def _efficientnet( + inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]], + dropout: float, + last_channel: Optional[int], + weights: Optional[EfficientNet_V2_S_Weights], + progress: bool, + **kwargs: Any +) -> EfficientNet: + if weights is not None: + kwargs["num_classes"] = len(weights.meta["categories"]) + model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs) + if weights is not None: + model.set_state_dict(weights.state_dict(progress=progress, check_hash=True)) + return model + + +def _efficientnet_conf( + arch: str, **kwargs: Any +) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]: + inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]] + if arch.startswith("efficientnet_v2_s"): + inverted_residual_setting = [ + FusedMBConvConfig(1, 3, 1, 24, 24, 2), + FusedMBConvConfig(4, 3, 2, 24, 48, 4), + FusedMBConvConfig(4, 3, 2, 48, 64, 4), + MBConvConfig(4, 3, 2, 64, 128, 6), + MBConvConfig(6, 3, 1, 128, 160, 9), + MBConvConfig(6, 3, 2, 160, 256, 15), + ] + last_channel = 1280 + else: + raise ValueError(f"Unsupported model type {arch}") + return inverted_residual_setting, last_channel + + +def efficientnet_v2_s( + *, weights: Optional[EfficientNet_V2_S_Weights] = None, progress: bool = True, **kwargs: Any +) -> EfficientNet: + weights = EfficientNet_V2_S_Weights.verify(weights) + inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s") + return _efficientnet( + inverted_residual_setting, + kwargs.pop("dropout", 0.2), + last_channel, + weights, + progress, + norm_layer=partial(BatchNorm2D, epsilon=0.001), + **kwargs, + ) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py new file mode 100644 index 0000000000000000000000000000000000000000..84bc3fb0f907f802a807e51102ecd6bbba7ea338 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
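The width/depth scaling helpers round channel counts to multiples of 8 (never dropping more than 10% below the requested value) and round layer counts up. A short illustration, plus an untrained forward pass through the backbone as laid out above (illustrative only; no pretrained weights are downloaded since weights defaults to None):

import paddle

print(_make_divisible(24 * 1.1, 8))                  # 26.4 -> 24 (within 10%)
print(_make_divisible(24 * 1.3, 8))                  # 31.2 -> 32
print(MBConvConfig.adjust_depth(4, depth_mult=1.2))  # ceil(4.8) = 5

model = efficientnet_v2_s()                          # weights=None: random init
model.eval()
out = model(paddle.randn(shape=[1, 3, 224, 224]))
print(out.shape)                                     # [1, 1000]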
+ +import paddle +import paddle.nn as nn + +from .efficientnet_v2_s import efficientnet_v2_s + + +class BatchNorm2D(nn.Layer): + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True): + super(BatchNorm2D, self).__init__() + self.num_features = num_features + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + + if self.affine: + self.weight = self.create_parameter( + shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0) + ) + self.bias = self.create_parameter( + shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0) + ) + else: + self.weight = None + self.bias = None + + if self.track_running_stats: + self._mean = self.create_parameter( + shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0), is_bias=False + ) + self._variance = self.create_parameter( + shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0), is_bias=False + ) + self._mean.stop_gradient = True + self._variance.stop_gradient = True + else: + self._mean = None + self._variance = None + + def forward(self, input): + mean = self._mean + variance = self._variance + + output = (input - paddle.unsqueeze(mean, axis=[0, 2, 3])) / paddle.unsqueeze( + paddle.sqrt(variance + self.eps), axis=[0, 2, 3] + ) + if self.affine: + output = output * paddle.unsqueeze(self.weight, axis=[0, 2, 3]) + paddle.unsqueeze( + self.bias, axis=[0, 2, 3] + ) + return output + + +class EfficientNetEncoder(nn.Layer): + def __init__(self, c_latent=16): + super().__init__() + self.backbone = efficientnet_v2_s().features + self.backbone.eval() + self.mapper = nn.Sequential( + nn.Conv2D(1280, c_latent, kernel_size=1, bias_attr=False), + BatchNorm2D(c_latent, affine=False), + ) + self.mapper.eval() + + def forward(self, x): + + x = self.backbone(x) + x = self.mapper(x) + return x diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py new file mode 100644 index 0000000000000000000000000000000000000000..d26ef68dd319d993bf3bc51881441fb657170a62 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
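EfficientNetEncoder reuses the (frozen) efficientnet_v2_s feature trunk and projects its 1280-channel output down to c_latent channels. A quick shape check with randomly initialised weights (illustrative only; in practice the encoder weights come from a checkpoint):

import paddle

encoder = EfficientNetEncoder(c_latent=16)
encoder.eval()
with paddle.no_grad():
    latents = encoder(paddle.randn(shape=[1, 3, 768, 768]))
print(latents.shape)  # [1, 16, 24, 24]: the trunk downsamples by 32x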
+ +import paddle + + +class Previewer(paddle.nn.Layer): + def __init__(self, c_in=16, c_hidden=512, c_out=3): + super().__init__() + self.blocks = paddle.nn.Sequential( + paddle.nn.Conv2D(in_channels=c_in, out_channels=c_hidden, kernel_size=1), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden), + paddle.nn.Conv2D(in_channels=c_hidden, out_channels=c_hidden, kernel_size=3, padding=1), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden), + paddle.nn.Conv2DTranspose( + in_channels=c_hidden, + out_channels=c_hidden // 2, + kernel_size=2, + stride=2, + ), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden // 2), + paddle.nn.Conv2D( + in_channels=c_hidden // 2, + out_channels=c_hidden // 2, + kernel_size=3, + padding=1, + ), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden // 2), + paddle.nn.Conv2DTranspose( + in_channels=c_hidden // 2, + out_channels=c_hidden // 4, + kernel_size=2, + stride=2, + ), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden // 4), + paddle.nn.Conv2D( + in_channels=c_hidden // 4, + out_channels=c_hidden // 4, + kernel_size=3, + padding=1, + ), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden // 4), + paddle.nn.Conv2DTranspose( + in_channels=c_hidden // 4, + out_channels=c_hidden // 4, + kernel_size=2, + stride=2, + ), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden // 4), + paddle.nn.Conv2D( + in_channels=c_hidden // 4, + out_channels=c_hidden // 4, + kernel_size=3, + padding=1, + ), + paddle.nn.GELU(), + paddle.nn.BatchNorm2D(num_features=c_hidden // 4), + paddle.nn.Conv2D(in_channels=c_hidden // 4, out_channels=c_out, kernel_size=1), + ) + + def forward(self, x): + return self.blocks(x) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py new file mode 100644 index 0000000000000000000000000000000000000000..24861c58f4ddf14f4ac88af18d6d8d59f6f6edc6 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py @@ -0,0 +1,206 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
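The Previewer is a small convolutional decoder that upsamples the 16-channel latent by 8x (three stride-2 transposed convolutions) into an RGB preview. Shape sketch, illustrative only:

import paddle

previewer = Previewer(c_in=16, c_hidden=512, c_out=3)
previewer.eval()
with paddle.no_grad():
    preview = previewer(paddle.randn(shape=[1, 16, 24, 24]))
print(preview.shape)  # [1, 3, 192, 192]: three 2x transposed convs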
+ +import paddle +from torchtools.nn import VectorQuantize + + +class ResBlock(paddle.nn.Layer): + def __init__(self, c, c_hidden): + super().__init__() + self.norm1 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06) + self.depthwise = paddle.nn.Sequential( + paddle.nn.Pad2D(padding=1, mode="replicate"), + paddle.nn.Conv2D(in_channels=c, out_channels=c, kernel_size=3, groups=c), + ) + self.norm2 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06) + self.channelwise = paddle.nn.Sequential( + paddle.nn.Linear(in_features=c, out_features=c_hidden), + paddle.nn.GELU(), + paddle.nn.Linear(in_features=c_hidden, out_features=c), + ) + out_19 = paddle.create_parameter( + shape=paddle.zeros(shape=[6]).shape, + dtype=paddle.zeros(shape=[6]).numpy().dtype, + default_initializer=paddle.nn.initializer.Assign(paddle.zeros(shape=[6])), + ) + out_19.stop_gradient = not True + self.gammas = out_19 + + def _basic_init(module): + if isinstance(module, paddle.nn.Linear) or isinstance(module, paddle.nn.Conv2D): + init_XavierUniform = paddle.nn.initializer.XavierUniform() + init_XavierUniform(module.weight) + if module.bias is not None: + init_Constant = paddle.nn.initializer.Constant(value=0) + init_Constant(module.bias) + + self.apply(_basic_init) + + def _norm(self, x, norm): + return norm(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) + + def forward(self, x): + mods = self.gammas + x_temp = self._norm(x, self.norm1) * (1 + mods[0]) + mods[1] + x = x + self.depthwise(x_temp) * mods[2] + x_temp = self._norm(x, self.norm2) * (1 + mods[3]) + mods[4] + x = x + self.channelwise(x_temp.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) * mods[5] + return x + + +class StageA(paddle.nn.Layer): + def __init__( + self, + levels=2, + bottleneck_blocks=12, + c_hidden=384, + c_latent=4, + codebook_size=8192, + scale_factor=0.43, + ): + super().__init__() + self.c_latent = c_latent + self.scale_factor = scale_factor + c_levels = [(c_hidden // 2**i) for i in reversed(range(levels))] + self.in_block = paddle.nn.Sequential( + paddle.nn.PixelUnshuffle(downscale_factor=2), + paddle.nn.Conv2D(in_channels=3 * 4, out_channels=c_levels[0], kernel_size=1), + ) + down_blocks = [] + for i in range(levels): + if i > 0: + down_blocks.append( + paddle.nn.Conv2D( + in_channels=c_levels[i - 1], + out_channels=c_levels[i], + kernel_size=4, + stride=2, + padding=1, + ) + ) + block = ResBlock(c_levels[i], c_levels[i] * 4) + down_blocks.append(block) + down_blocks.append( + paddle.nn.Sequential( + paddle.nn.Conv2D( + in_channels=c_levels[-1], + out_channels=c_latent, + kernel_size=1, + bias_attr=False, + ), + paddle.nn.BatchNorm2D(num_features=c_latent), + ) + ) + self.down_blocks = paddle.nn.Sequential(*down_blocks) + self.down_blocks[0] + self.codebook_size = codebook_size + self.vquantizer = VectorQuantize(c_latent, k=codebook_size) + up_blocks = [ + paddle.nn.Sequential(paddle.nn.Conv2D(in_channels=c_latent, out_channels=c_levels[-1], kernel_size=1)) + ] + for i in range(levels): + for j in range(bottleneck_blocks if i == 0 else 1): + block = ResBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4) + up_blocks.append(block) + if i < levels - 1: + up_blocks.append( + paddle.nn.Conv2DTranspose( + in_channels=c_levels[levels - 1 - i], + out_channels=c_levels[levels - 2 - i], + kernel_size=4, + stride=2, + padding=1, + ) + ) + self.up_blocks = paddle.nn.Sequential(*up_blocks) + self.out_block = paddle.nn.Sequential( + 
paddle.nn.Conv2D(in_channels=c_levels[0], out_channels=3 * 4, kernel_size=1), + paddle.nn.PixelShuffle(upscale_factor=2), + ) + + def encode(self, x, quantize=False): + x = self.in_block(x) + x = self.down_blocks(x) + if quantize: + qe, (vq_loss, commit_loss), indices = self.vquantizer.forward(x, dim=1) + return ( + qe / self.scale_factor, + x / self.scale_factor, + indices, + vq_loss + commit_loss * 0.25, + ) + else: + return x / self.scale_factor, None, None, None + + def decode(self, x): + x = x * self.scale_factor + x = self.up_blocks(x) + x = self.out_block(x) + return x + + def forward(self, x, quantize=False): + qe, x, _, vq_loss = self.encode(x, quantize) + x = self.decode(qe) + return x, vq_loss + + +class Discriminator(paddle.nn.Layer): + def __init__(self, c_in=3, c_cond=0, c_hidden=512, depth=6): + super().__init__() + d = max(depth - 3, 3) + layers = [ + paddle.nn.utils.spectral_norm( + layer=paddle.nn.Conv2D( + in_channels=c_in, + out_channels=c_hidden // 2**d, + kernel_size=3, + stride=2, + padding=1, + ) + ), + paddle.nn.LeakyReLU(negative_slope=0.2), + ] + for i in range(depth - 1): + c_in = c_hidden // 2 ** max(d - i, 0) + c_out = c_hidden // 2 ** max(d - 1 - i, 0) + layers.append( + paddle.nn.utils.spectral_norm( + layer=paddle.nn.Conv2D( + in_channels=c_in, + out_channels=c_out, + kernel_size=3, + stride=2, + padding=1, + ) + ) + ) + layers.append(paddle.nn.InstanceNorm2D(num_features=c_out, momentum=1 - 0.1)) + layers.append(paddle.nn.LeakyReLU(negative_slope=0.2)) + self.encoder = paddle.nn.Sequential(*layers) + self.shuffle = paddle.nn.Conv2D( + in_channels=c_hidden + c_cond if c_cond > 0 else c_hidden, + out_channels=1, + kernel_size=1, + ) + self.logits = paddle.nn.Sigmoid() + + def forward(self, x, cond=None): + x = self.encoder(x) + if cond is not None: + cond = cond.reshape([cond.shape[0], cond.shape[1], 1, 1]).expand(shape=[-1, -1, x.shape[-2], x.shape[-1]]) + x = paddle.concat(x=[x, cond], axis=1) + x = self.shuffle(x) + x = self.logits(x) + return x diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_b.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_b.py new file mode 100644 index 0000000000000000000000000000000000000000..34a9fd7abc8b43658437d367b56ef064dab746fc --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_b.py @@ -0,0 +1,349 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
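+# Stage B is a U-Net-style diffusion decoder. It is conditioned on 16-channel
+# EfficientNet latents (c_effnet) via effnet_mapper, on CLIP embeddings
+# (c_clip=1280, expanded to c_clip_seq=4 tokens each) via clip_mapper, and
+# optionally on low-resolution pixels via pixels_mapper; the timestep r and the
+# extra t_conds are encoded as sinusoidal embeddings of size c_r=64.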
+ +import math + +import numpy as np +import paddle +import paddle_aux # noqa + +from .common import AttnBlock, FeedForwardBlock, LayerNorm2d, ResBlock, TimestepBlock + + +class StageB(paddle.nn.Layer): + def __init__( + self, + c_in=4, + c_out=4, + c_r=64, + patch_size=2, + c_cond=1280, + c_hidden=[320, 640, 1280, 1280], + nhead=[-1, -1, 20, 20], + blocks=[[2, 6, 28, 6], [6, 28, 6, 2]], + block_repeat=[[1, 1, 1, 1], [3, 3, 2, 2]], + level_config=["CT", "CT", "CTA", "CTA"], + c_clip=1280, + c_clip_seq=4, + c_effnet=16, + c_pixels=3, + kernel_size=3, + dropout=[0, 0, 0.1, 0.1], + self_attn=True, + t_conds=["sca"], + ): + super().__init__() + self.c_r = c_r + self.t_conds = t_conds + self.c_clip_seq = c_clip_seq + if not isinstance(dropout, list): + dropout = [dropout] * len(c_hidden) + if not isinstance(self_attn, list): + self_attn = [self_attn] * len(c_hidden) + self.effnet_mapper = paddle.nn.Sequential( + paddle.nn.Conv2D(in_channels=c_effnet, out_channels=c_hidden[0] * 4, kernel_size=1), + paddle.nn.GELU(), + paddle.nn.Conv2D(in_channels=c_hidden[0] * 4, out_channels=c_hidden[0], kernel_size=1), + LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06), + ) + self.pixels_mapper = paddle.nn.Sequential( + paddle.nn.Conv2D(in_channels=c_pixels, out_channels=c_hidden[0] * 4, kernel_size=1), + paddle.nn.GELU(), + paddle.nn.Conv2D(in_channels=c_hidden[0] * 4, out_channels=c_hidden[0], kernel_size=1), + LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06), + ) + self.clip_mapper = paddle.nn.Linear(in_features=c_clip, out_features=c_cond * c_clip_seq) + self.clip_norm = paddle.nn.LayerNorm( + normalized_shape=c_cond, weight_attr=False, bias_attr=False, epsilon=1e-06 + ) + self.embedding = paddle.nn.Sequential( + paddle.nn.PixelUnshuffle(downscale_factor=patch_size), + paddle.nn.Conv2D( + in_channels=c_in * patch_size**2, + out_channels=c_hidden[0], + kernel_size=1, + ), + LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06), + ) + + def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0, self_attn=True): + if block_type == "C": + return ResBlock(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout) + elif block_type == "A": + return AttnBlock(c_hidden, c_cond, nhead, self_attn=self_attn, dropout=dropout) + elif block_type == "F": + return FeedForwardBlock(c_hidden, dropout=dropout) + elif block_type == "T": + return TimestepBlock(c_hidden, c_r, conds=t_conds) + else: + raise Exception(f"Block type {block_type} not supported") + + self.down_blocks = paddle.nn.LayerList() + self.down_downscalers = paddle.nn.LayerList() + self.down_repeat_mappers = paddle.nn.LayerList() + for i in range(len(c_hidden)): + if i > 0: + self.down_downscalers.append( + paddle.nn.Sequential( + LayerNorm2d( + c_hidden[i - 1], + weight_attr=False, + bias_attr=False, + epsilon=1e-06, + ), + paddle.nn.Conv2D( + in_channels=c_hidden[i - 1], + out_channels=c_hidden[i], + kernel_size=2, + stride=2, + ), + ) + ) + else: + self.down_downscalers.append(paddle.nn.Identity()) + down_block = paddle.nn.LayerList() + for _ in range(blocks[0][i]): + for block_type in level_config[i]: + block = get_block( + block_type, + c_hidden[i], + nhead[i], + dropout=dropout[i], + self_attn=self_attn[i], + ) + down_block.append(block) + self.down_blocks.append(down_block) + if block_repeat is not None: + block_repeat_mappers = paddle.nn.LayerList() + for _ in range(block_repeat[0][i] - 1): + block_repeat_mappers.append( + paddle.nn.Conv2D( + in_channels=c_hidden[i], + 
out_channels=c_hidden[i], + kernel_size=1, + ) + ) + self.down_repeat_mappers.append(block_repeat_mappers) + self.up_blocks = paddle.nn.LayerList() + self.up_upscalers = paddle.nn.LayerList() + self.up_repeat_mappers = paddle.nn.LayerList() + for i in reversed(range(len(c_hidden))): + if i > 0: + self.up_upscalers.append( + paddle.nn.Sequential( + LayerNorm2d( + c_hidden[i], + weight_attr=False, + bias_attr=False, + epsilon=1e-06, + ), + paddle.nn.Conv2DTranspose( + in_channels=c_hidden[i], + out_channels=c_hidden[i - 1], + kernel_size=2, + stride=2, + ), + ) + ) + else: + self.up_upscalers.append(paddle.nn.Identity()) + up_block = paddle.nn.LayerList() + for j in range(blocks[1][::-1][i]): + for k, block_type in enumerate(level_config[i]): + c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0 + block = get_block( + block_type, + c_hidden[i], + nhead[i], + c_skip=c_skip, + dropout=dropout[i], + self_attn=self_attn[i], + ) + up_block.append(block) + self.up_blocks.append(up_block) + if block_repeat is not None: + block_repeat_mappers = paddle.nn.LayerList() + for _ in range(block_repeat[1][::-1][i] - 1): + block_repeat_mappers.append( + paddle.nn.Conv2D( + in_channels=c_hidden[i], + out_channels=c_hidden[i], + kernel_size=1, + ) + ) + self.up_repeat_mappers.append(block_repeat_mappers) + self.clf = paddle.nn.Sequential( + LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06), + paddle.nn.Conv2D( + in_channels=c_hidden[0], + out_channels=c_out * patch_size**2, + kernel_size=1, + ), + paddle.nn.PixelShuffle(upscale_factor=patch_size), + ) + self.apply(self._init_weights) + init_Normal = paddle.nn.initializer.Normal(std=0.02) + init_Normal(self.clip_mapper.weight) + init_Normal = paddle.nn.initializer.Normal(std=0.02) + init_Normal(self.effnet_mapper[0].weight) + init_Normal = paddle.nn.initializer.Normal(std=0.02) + init_Normal(self.effnet_mapper[2].weight) + init_Normal = paddle.nn.initializer.Normal(std=0.02) + init_Normal(self.pixels_mapper[0].weight) + init_Normal = paddle.nn.initializer.Normal(std=0.02) + init_Normal(self.pixels_mapper[2].weight) + paddle.nn.initializer.XavierUniform()(self.embedding[1].weight) + init_Constant = paddle.nn.initializer.Constant(value=0) + init_Constant(self.clf[1].weight) + for level_list in (self.down_blocks, self.up_blocks): + for level_block in level_list: + for block in level_block: + if isinstance(block, ResBlock) or isinstance(block, FeedForwardBlock): + block.channelwise[-1].weight.data *= np.sqrt(1 / sum(blocks[0])) + elif isinstance(block, TimestepBlock): + for layer in block.sublayers(): + if isinstance(layer, paddle.nn.Linear): + init_Constant = paddle.nn.initializer.Constant(value=0) + init_Constant(layer.weight) + + def _init_weights(self, m): + if isinstance(m, (paddle.nn.Conv2D, paddle.nn.Linear)): + init_XavierUniform = paddle.nn.initializer.XavierUniform() + init_XavierUniform(m.weight) + if m.bias is not None: + init_Constant = paddle.nn.initializer.Constant(value=0) + init_Constant(m.bias) + + def gen_r_embedding(self, r, max_positions=10000): + r = r * max_positions + half_dim = self.c_r // 2 + emb = math.log(max_positions) / (half_dim - 1) + emb = paddle.arange(end=half_dim).astype(dtype="float32").mul(-emb).exp() + emb = r[:, None] * emb[None, :] + emb = paddle.concat(x=[emb.sin(), emb.cos()], axis=1) + if self.c_r % 2 == 1: + emb = paddle.nn.functional.pad(emb, [0, 1], mode="constant") + return emb + + def gen_c_embeddings(self, clip): + if len(clip.shape) == 2: + clip = clip.unsqueeze(axis=1) + 
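+        # clip_mapper projects each CLIP vector from c_clip to c_cond * c_clip_seq;
+        # the reshape below splits that into c_clip_seq conditioning tokens of width
+        # c_cond before the parameter-free LayerNorm.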
clip = self.clip_mapper(clip).reshape([clip.shape[0], clip.shape[1] * self.c_clip_seq, -1]) + + clip = self.clip_norm(clip) + return clip + + def _down_encode(self, x, r_embed, clip): + level_outputs = [] + block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers) + for down_block, downscaler, repmap in block_group: + x = downscaler(x) + for i in range(len(repmap) + 1): + for block in down_block: + if ( + isinstance(block, ResBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, ResBlock) + ): + x = block(x) + elif ( + isinstance(block, AttnBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, AttnBlock) + ): + x = block(x, clip) + elif ( + isinstance(block, TimestepBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, TimestepBlock) + ): + x = block(x, r_embed) + else: + x = block(x) + if i < len(repmap): + x = repmap[i](x) + level_outputs.insert(0, x) + return level_outputs + + def _up_decode(self, level_outputs, r_embed, clip): + x = level_outputs[0] + block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers) + for i, (up_block, upscaler, repmap) in enumerate(block_group): + for j in range(len(repmap) + 1): + for k, block in enumerate(up_block): + if ( + isinstance(block, ResBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, ResBlock) + ): + skip = level_outputs[i] if k == 0 and i > 0 else None + if skip is not None and (x.shape[-1] != skip.shape[-1] or x.shape[-2] != skip.shape[-2]): + x = paddle.nn.functional.interpolate( + x=x.astype(dtype="float32"), + size=skip.shape[-2:], + mode="bilinear", + align_corners=True, + ) + x = block(x, skip) + elif ( + isinstance(block, AttnBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, AttnBlock) + ): + x = block(x, clip) + elif ( + isinstance(block, TimestepBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, TimestepBlock) + ): + x = block(x, r_embed) + else: + x = block(x) + if j < len(repmap): + x = repmap[j](x) + x = upscaler(x) + return x + + def forward(self, x, r, effnet, clip, pixels=None, **kwargs): + if pixels is None: + pixels = paddle.zeros(shape=[x.shape[0], 3, 8, 8], dtype=x.dtype) + r_embed = self.gen_r_embedding(r) + for c in self.t_conds: + t_cond = kwargs.get(c, paddle.zeros_like(x=r)) + r_embed = paddle.concat(x=[r_embed, self.gen_r_embedding(t_cond)], axis=1) + clip = self.gen_c_embeddings(clip) + x = self.embedding(x) + x = x + self.effnet_mapper( + paddle.nn.functional.interpolate( + x=effnet.astype(dtype="float32"), + size=x.shape[-2:], + mode="bilinear", + align_corners=True, + ) + ) + x = x + paddle.nn.functional.interpolate( + x=self.pixels_mapper(pixels).astype(dtype="float32"), + size=x.shape[-2:], + mode="bilinear", + align_corners=True, + ) + level_outputs = self._down_encode(x, r_embed, clip) + x = self._up_decode(level_outputs, r_embed, clip) + return self.clf(x) + + def update_weights_ema(self, src_model, beta=0.999): + for self_params, src_params in zip(self.parameters(), src_model.parameters()): + self_params.data = self_params.data * beta + src_params.data.clone() * (1 - beta) + for self_buffers, src_buffers in zip(self.buffers(), src_model.buffers()): + self_buffers.data = self_buffers.data * beta + src_buffers.data.clone() * (1 - beta) diff --git 
a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_c.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_c.py new file mode 100644 index 0000000000000000000000000000000000000000..c868be56c475de877c6cc02b44c03a47e81db102 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_c.py @@ -0,0 +1,368 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import paddle +import paddle.nn as nn +import paddle_aux # noqa + +from .common import AttnBlock, FeedForwardBlock, LayerNorm2d, ResBlock, TimestepBlock + + +def load(path="../x.npy"): + return paddle.to_tensor(np.load(path)) + + +def diff(a, b): + return (a - b).abs().mean() + + +class UpDownBlock2d(nn.Layer): + def __init__(self, c_in, c_out, mode, enabled=True): + super().__init__() + assert mode in ["up", "down"] + interpolation = ( + nn.Upsample( + scale_factor=2 if mode == "up" else 0.5, + mode="bilinear", + align_corners=True, + ) + if enabled + else nn.Identity() + ) + mapping = nn.Conv2D(in_channels=c_in, out_channels=c_out, kernel_size=1) + self.blocks = nn.LayerList(sublayers=[interpolation, mapping] if mode == "up" else [mapping, interpolation]) + + def forward(self, x): + for block in self.blocks: + x = block(x.astype(paddle.float32)) + return x + + +class StageC(nn.Layer): + def __init__( + self, + c_in=16, + c_out=16, + c_r=64, + patch_size=1, + c_cond=2048, + c_hidden=[2048, 2048], + nhead=[32, 32], + blocks=[[8, 24], [24, 8]], + block_repeat=[[1, 1], [1, 1]], + level_config=["CTA", "CTA"], + c_clip_text=1280, + c_clip_text_pooled=1280, + c_clip_img=768, + c_clip_seq=4, + kernel_size=3, + dropout=[0.1, 0.1], + # dropout=[0, 0], + self_attn=True, + t_conds=["sca", "crp"], + switch_level=[False], + ): + super().__init__() + self.c_r = c_r + self.t_conds = t_conds + self.c_clip_seq = c_clip_seq + if not isinstance(dropout, list): + dropout = [dropout] * len(c_hidden) + if not isinstance(self_attn, list): + self_attn = [self_attn] * len(c_hidden) + # CONDITIONING + self.clip_txt_mapper = nn.Linear(c_clip_text, c_cond) + self.clip_txt_pooled_mapper = nn.Linear(c_clip_text_pooled, c_cond * c_clip_seq) + self.clip_img_mapper = nn.Linear(c_clip_img, c_cond * c_clip_seq) + self.clip_norm = nn.LayerNorm(c_cond, weight_attr=False, bias_attr=False, epsilon=1e-6) + + self.embedding = nn.Sequential( + nn.PixelUnshuffle(patch_size), + nn.Conv2D(c_in * (patch_size**2), c_hidden[0], kernel_size=1), + LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-6), + ) + + def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0, self_attn=True): + if block_type == "C": + return ResBlock(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout) + elif block_type == "A": + return AttnBlock(c_hidden, c_cond, nhead, self_attn=self_attn, dropout=dropout) + elif block_type == "F": + return FeedForwardBlock(c_hidden, 
dropout=dropout) + elif block_type == "T": + return TimestepBlock(c_hidden, c_r, conds=t_conds) + else: + raise Exception(f"Block type {block_type} not supported") + + self.down_blocks = nn.LayerList() + self.down_downscalers = nn.LayerList() + self.down_repeat_mappers = nn.LayerList() + for i in range(len(c_hidden)): + if i > 0: + self.down_downscalers.append( + nn.Sequential( + LayerNorm2d( + c_hidden[i - 1], + weight_attr=False, + bias_attr=False, + epsilon=1e-06, + ), + UpDownBlock2d( + c_hidden[i - 1], + c_hidden[i], + mode="down", + enabled=switch_level[i - 1], + ), + ) + ) + else: + self.down_downscalers.append(nn.Identity()) + down_block = nn.LayerList() + for _ in range(blocks[0][i]): + for block_type in level_config[i]: + block = get_block( + block_type, + c_hidden[i], + nhead[i], + dropout=dropout[i], + self_attn=self_attn[i], + ) + down_block.append(block) + self.down_blocks.append(down_block) + if block_repeat is not None: + block_repeat_mappers = nn.LayerList() + for _ in range(block_repeat[0][i] - 1): + block_repeat_mappers.append(nn.Conv2D(c_hidden[i], c_hidden[i], kernel_size=1)) + self.down_repeat_mappers.append(block_repeat_mappers) + self.up_blocks = nn.LayerList() + self.up_upscalers = nn.LayerList() + self.up_repeat_mappers = nn.LayerList() + for i in reversed(range(len(c_hidden))): + if i > 0: + self.up_upscalers.append( + nn.Sequential( + LayerNorm2d(c_hidden[i], weight_attr=False, bias_attr=False, epsilon=1e-6), + UpDownBlock2d( + c_hidden[i], + c_hidden[i - 1], + mode="up", + enabled=switch_level[i - 1], + ), + ) + ) + else: + self.up_upscalers.append(nn.Identity()) + up_block = nn.LayerList() + for j in range(blocks[1][::-1][i]): + for k, block_type in enumerate(level_config[i]): + c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0 + block = get_block( + block_type, + c_hidden[i], + nhead[i], + c_skip=c_skip, + dropout=dropout[i], + self_attn=self_attn[i], + ) + up_block.append(block) + self.up_blocks.append(up_block) + if block_repeat is not None: + block_repeat_mappers = nn.LayerList() + for _ in range(block_repeat[1][::-1][i] - 1): + block_repeat_mappers.append(nn.Conv2D(c_hidden[i], c_hidden[i], kernel_size=1)) + self.up_repeat_mappers.append(block_repeat_mappers) + self.clf = nn.Sequential( + LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06), + nn.Conv2D(c_hidden[0], c_out * (patch_size**2), kernel_size=1), + nn.PixelShuffle(upscale_factor=patch_size), + ) + self.apply(self._init_weights) + init_Normal = nn.initializer.Normal(std=0.02) + init_Normal(self.clip_txt_mapper.weight) + init_Normal = nn.initializer.Normal(std=0.02) + init_Normal(self.clip_txt_pooled_mapper.weight) + init_Normal = nn.initializer.Normal(std=0.02) + init_Normal(self.clip_img_mapper.weight) + init_Xavier = nn.initializer.XavierUniform() + self.embedding[1].weight = self.create_parameter( + shape=self.embedding[1].weight.shape, default_initializer=init_Xavier + ) + init_Constant = nn.initializer.Constant(value=0) + init_Constant(self.clf[1].weight) + + for level_list in (self.down_blocks, self.up_blocks): + for level_block in level_list: + for block in level_block: + if isinstance(block, ResBlock) or isinstance(block, FeedForwardBlock): + block.channelwise[-1].weight.multiply(np.sqrt(1 / sum(blocks[0]))) + elif isinstance(block, TimestepBlock): + for layer in block.sublayers(): + if isinstance(layer, nn.Linear): + init_Constant = nn.initializer.Constant(value=0) + init_Constant(layer.weight) + + def _init_weights(self, m): + if isinstance(m, 
(nn.Conv2D, nn.Linear)): + init_XavierUniform = nn.initializer.XavierUniform() + init_XavierUniform(m.weight) + if m.bias is not None: + init_Constant = nn.initializer.Constant(value=0) + init_Constant(m.bias) + + def gen_r_embedding(self, r, max_positions=10000): + r = r * max_positions + half_dim = self.c_r // 2 + emb = math.log(max_positions) / (half_dim - 1) + emb = paddle.arange(end=half_dim).astype(dtype="float32").mul(-emb).exp() + emb = r[:, None] * emb[None, :] + emb = paddle.concat(x=[emb.sin(), emb.cos()], axis=1) + if self.c_r % 2 == 1: + emb = nn.functional.pad(emb, [0, 1], mode="constant") + return emb + + def gen_c_embeddings(self, clip_txt, clip_txt_pooled, clip_img): + clip_txt = self.clip_txt_mapper(clip_txt) + if len(clip_txt_pooled.shape) == 2: + clip_txt_pool = clip_txt_pooled.unsqueeze(axis=1) + if len(clip_img.shape) == 2: + clip_img = paddle.unsqueeze(clip_img, axis=1) + + clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).reshape( + [clip_txt_pooled.shape[0], clip_txt_pooled.shape[1] * self.c_clip_seq, -1] + ) + + clip_img = self.clip_img_mapper(clip_img).reshape([clip_img.shape[0], clip_img.shape[1] * self.c_clip_seq, -1]) + + clip = paddle.concat(x=[clip_txt, clip_txt_pool, clip_img], axis=1) + clip = self.clip_norm(clip) + + return clip + + def _down_encode(self, x, r_embed, clip, cnet=None): + level_outputs = [] + block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers) + for down_block, downscaler, repmap in block_group: + x = downscaler(x) + for i in range(len(repmap) + 1): + for block in down_block: + if ( + isinstance(block, ResBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, ResBlock) + ): + if cnet is not None: + next_cnet = cnet() + if next_cnet is not None: + x = x + nn.functional.interpolate( + next_cnet, + size=x.shape[-2:], + mode="bilinear", + align_corners=True, + ) + x = block(x) + + elif ( + isinstance(block, AttnBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, AttnBlock) + ): + x = block(x, clip) + + elif ( + isinstance(block, TimestepBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, TimestepBlock) + ): + x = block(x, r_embed) + else: + x = block(x) + + if i < len(repmap): + x = repmap[i](x) + level_outputs.insert(0, x) + return level_outputs + + def _up_decode(self, level_outputs, r_embed, clip, cnet=None): + x = level_outputs[0] + block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers) + count_i = 0 + for i, (up_block, upscaler, repmap) in enumerate(block_group): + count_i += 1 + count_j = 0 + for j in range(len(repmap) + 1): + count_j += 1 + count_k = 0 + for k, block in enumerate(up_block): + count_k += 1 + + if ( + isinstance(block, ResBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, ResBlock) + ): + skip = level_outputs[i] if k == 0 and i > 0 else None + if skip is not None and (x.shape[-1] != skip.shape[-1] or x.shape[-2] != skip.shape[-2]): + x = nn.functional.interpolate( + x=x.astype(paddle.float32), + size=skip.shape[-2:], + mode="bilinear", + align_corners=True, + ) + x = block(x, skip) + elif ( + isinstance(block, AttnBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, AttnBlock) + ): + x = block(x, clip) + elif ( + isinstance(block, TimestepBlock) + or hasattr(block, "_fsdp_wrapped_module") + and isinstance(block._fsdp_wrapped_module, 
TimestepBlock) + ): + x = block(x, r_embed) + else: + x = block(x) + + if j < len(repmap): + x = repmap[j](x) + + x = upscaler(x) + + return x + + def forward(self, x, r, clip_text, clip_text_pooled, clip_img, cnet=None, **kwargs): + + r_embed = self.gen_r_embedding(r) + for c in self.t_conds: + t_cond = kwargs.get(c, paddle.zeros_like(r)) + r_embed = paddle.concat(x=[r_embed, self.gen_r_embedding(t_cond)], axis=1) + clip = self.gen_c_embeddings(clip_text, clip_text_pooled, clip_img) + + x = self.embedding(x) + level_outputs = self._down_encode(x, r_embed, clip, cnet) + x = self._up_decode(level_outputs, r_embed, clip, cnet) + x = self.clf(x) + # x.register_hook(lambda grad: print("@@@ before-clf-x @@@", grad.shape, grad.abs().mean())) + + return x + + def update_weights_ema(self, src_model, beta=0.999): + for self_params, src_params in zip(self.parameters(), src_model.parameters()): + self_params.data = self_params.data * beta + src_params.data.clone() * (1 - beta) + for self_buffers, src_buffers in zip(self.buffers(), src_model.buffers()): + self_buffers.data = self_buffers.data * beta + src_buffers.data.clone() * (1 - beta) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b68e33e511a5ed3eee62e9397a9abd6c05d54086 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL +from PIL import Image + +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) + +try: + if not (is_paddlenlp_available() and is_paddle_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_paddle_and_paddlenlp_objects import ShapEPipeline +else: + from .blip_image_processing import BlipImageProcessor + from .modeling_blip2 import Blip2QFormerModel + from .modeling_ctx_clip import ContextCLIPTextModel + from .pipeline_blip_diffusion import BlipDiffusionPipeline diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_blip2.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_blip2.py new file mode 100644 index 0000000000000000000000000000000000000000..484577c2d8ec3c86d85cc0afb335db649d88fa14 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_blip2.py @@ -0,0 +1,659 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Tuple, Union + +import paddle +from paddle import nn +from paddlenlp.transformers.activations import QuickGELUActivation as QuickGELU +from paddlenlp.transformers.blip_2.configuration import Blip2Config, Blip2VisionConfig +from paddlenlp.transformers.blip_2.modeling import ( + Blip2Encoder, + Blip2QFormerAttention, + Blip2QFormerIntermediate, + Blip2QFormerOutput, +) +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from paddlenlp.transformers.model_utils import apply_chunking_to_forward + +from ppdiffusers.transformers import BertTokenizer, PretrainedModel + +from ...utils import logging + +logger = logging.get_logger(__name__) + + +class Blip2PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Blip2Config + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + r"language_model.encoder.embed_tokens.weight", + r"language_model.decoder.embed_tokens.weight", + ] + _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] + _keep_in_fp32_modules = ["wo"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "padding_idx") and module.padding_idx is not None: + module.weight[module.padding_idx] = 0.0 + if hasattr(module, "bias") and module.bias is not None: + nn.init.zeros_(module.bias) + if isinstance(module, Blip2VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + nn.init.zeros_(module.bias) + + +# There is an implementation of Blip2 in `transformers` : https://github.com/huggingface/transformers/blob/main/src/transformers/models/blip_2/modeling_blip_2.py. +# But it doesn't support getting multimodal embeddings. So, this module can be +# replaced with a future `transformers` version supports that. 
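+# Blip2TextEmbeddings prepends the learnable Q-Former query embeddings to the
+# word + position embeddings of the tokenized text, then applies LayerNorm and
+# dropout, yielding the (query + text) sequence the Q-Former encoder attends over.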
+class Blip2TextEmbeddings(nn.Layer): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size + ) # padding_idx=config.pad_token_id NOTE, donot set padding_idx + self.word_embeddings.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype=paddle.int64).expand((1, -1)) + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.shape[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + batch_size = embeddings.shape[0] + # repeat the query embeddings for batch size + query_embeds = query_embeds.tile([batch_size, 1, 1]) + embeddings = paddle.concat((query_embeds, embeddings), axis=1) + else: + embeddings = query_embeds + embeddings = embeddings.cast(query_embeds.dtype) + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Blip2 +class Blip2VisionEmbeddings(nn.Layer): + def __init__(self, config: Blip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(paddle.randn([1, 1, self.embed_dim])) + + self.patch_embedding = nn.Conv2D( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias_attr=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(paddle.randn([1, self.num_positions, self.embed_dim])) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.cast(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +# The Qformer encoder, which takes the visual embeddings, and the text 
input, to get multimodal embeddings +class Blip2QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and not hidden_states.stop_gradient: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# The layers making up the Qformer encoder +class Blip2QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = Blip2QFormerIntermediate(config) + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + self.output = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional 
self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +# ProjLayer used to project the multimodal Blip2 embeddings to be used in the text encoder +class ProjLayer(nn.Layer): + def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12): + super().__init__() + + # Dense1 -> Act -> Dense2 -> Drop -> Res -> Norm + self.dense1 = nn.Linear(in_dim, hidden_dim) + self.act_fn = QuickGELU() + self.dense2 = nn.Linear(hidden_dim, out_dim) + self.dropout = nn.Dropout(drop_p) + + self.LayerNorm = nn.LayerNorm(out_dim, epsilon=eps) + + def forward(self, x): + x_in = x + + x = self.LayerNorm(x) + x = self.dropout(self.dense2(self.act_fn(self.dense1(x)))) + x_in + + return x + + +# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2 +class Blip2VisionModel(Blip2PretrainedModel): + main_input_name = "pixel_values" + config_class = Blip2VisionConfig + + def __init__(self, config: Blip2VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + self.embeddings = Blip2VisionEmbeddings(config) + self.pre_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + self.encoder = Blip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + self.post_init() + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + 
output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layernorm(hidden_states) + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +# Qformer model, used to get multimodal embeddings from the text and image inputs +class Blip2QFormerModel(Blip2PretrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: Blip2Config): + super().__init__(config) + self.config = config + self.embeddings = Blip2TextEmbeddings(config.qformer_config) + self.visual_encoder = Blip2VisionModel(config.vision_config) + self.query_tokens = nn.Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + if not hasattr(config, "tokenizer") or config.tokenizer is None: + self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") + else: + self.tokenizer = BertTokenizer.from_pretrained(config.tokenizer, truncation_side="right") + self.tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + self.proj_layer = ProjLayer( + in_dim=config.qformer_config.hidden_size, + out_dim=config.qformer_config.hidden_size, + hidden_dim=config.qformer_config.hidden_size * 4, + drop_p=0.1, + eps=1e-12, + ) + + self.encoder = Blip2QFormerEncoder(config.qformer_config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
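+        # e.g. a [batch_size, seq_len] padding mask becomes a [batch_size, 1, 1, seq_len]
+        # additive bias of 0.0 (attend) / -10000.0 (ignore), built below.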
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + text_input=None, + image_input=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + + text = self.tokenizer(text_input, return_tensors="pd", padding=True, return_attention_mask=True) + input_ids = text.input_ids + batch_size = input_ids.shape[0] + query_atts = paddle.ones((batch_size, self.query_tokens.shape[1]), dtype=paddle.int64) + attention_mask = paddle.concat([query_atts, text.attention_mask], axis=1) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = self.query_tokens.shape[1] + + embedding_output = self.embeddings( + input_ids=input_ids, + query_embeds=self.query_tokens, + past_key_values_length=past_key_values_length, + ) + + # embedding_output = self.layernorm(query_embeds) + # embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + image_embeds_frozen = self.visual_encoder(image_input).last_hidden_state + # image_embeds_frozen = paddle.ones_like(image_embeds_frozen) + encoder_hidden_states = image_embeds_frozen + + if attention_mask is None: + attention_mask = paddle.ones( + ((batch_size, seq_length + past_key_values_length)), + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return self.proj_layer(sequence_output[:, :query_length, :]) + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + 
hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_ctx_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..b78442c52e0b777e6a601f0b205cf9ccb5c75991 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_ctx_clip.py @@ -0,0 +1,248 @@ +# Copyright 2023 Salesforce.com, inc. +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Tuple, Union + +import paddle +from paddle import nn +from paddlenlp.transformers.model_outputs import BaseModelOutputWithPooling + +from ppdiffusers.transformers import CLIPPretrainedModel +from ppdiffusers.transformers.clip.configuration import CLIPTextConfig +from ppdiffusers.transformers.clip.modeling import CLIPEncoder + + +def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype) + + inverted_mask = 1.0 - expanded_mask + + return paddle.masked_fill(inverted_mask, inverted_mask.cast(paddle.bool), paddle.finfo(dtype).min) + + +# This is a modified version of the CLIPTextModel from transformers.models.clip.modeling_clip +# Which allows for an extra input of "context embeddings", which are the query embeddings used in Qformer +# They pass through the clip model, along with the text embeddings, and interact with them using self attention +class ContextCLIPTextModel(CLIPPretrainedModel): + config_class = CLIPTextConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = ContextCLIPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + ctx_embeddings: paddle.Tensor = None, + ctx_begin_pos: list = None, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.text_model( + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=ctx_begin_pos, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class ContextCLIPTextTransformer(nn.Layer): + def __init__(self, config: CLIPTextConfig): + 
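+        # Same layout as CLIP's text transformer, except that the embeddings layer
+        # splices the Q-Former context tokens into the sequence and forward() builds
+        # its causal mask over seq_len + ctx_embeddings.shape[1] to cover them.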
super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = ContextCLIPTextEmbeddings(config) + self.encoder = CLIPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + self.eos_token_id = config.eos_token_id + + def forward( + self, + ctx_embeddings: paddle.Tensor, + ctx_begin_pos: list, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.shape + input_ids = input_ids.reshape([-1, input_shape[-1]]) + + hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=ctx_begin_pos, + ) + + bsz, seq_len = input_shape + if ctx_embeddings is not None: + seq_len += ctx_embeddings.shape[1] + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask( + bsz, + seq_len, + hidden_states.dtype, + ) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. 
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to paddle.int32 for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state.gather_nd( + paddle.stack( + [paddle.arange(last_hidden_state.shape[0], dtype="int32"), input_ids.argmax(-1, dtype="int32")], + axis=-1, + ) + ) + else: + # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible) + # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) + pooled_output = last_hidden_state.gather_nd( + paddle.stack( + [ + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + (input_ids == paddle.to_tensor([self.eos_token_id])) + .cast("int32") + .argmax(axis=-1, dtype="int32"), + ], + axis=-1, + ) + ) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len, dtype): + mask = paddle.triu( + # paddle.full((bsz, 1, seq_len, seq_len), paddle.finfo(dtype).min, dtype=dtype), + paddle.ones((bsz, paddle.to_tensor([1]), seq_len, seq_len), dtype=dtype) * paddle.finfo(dtype).min, + diagonal=1, + ) + return mask + + +class ContextCLIPTextEmbeddings(nn.Layer): + def __init__(self, config: CLIPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype=paddle.int64).expand((1, -1)) + ) + + def forward( + self, + ctx_embeddings: paddle.Tensor, + ctx_begin_pos: list, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + if ctx_embeddings is None: + ctx_len = 0 + else: + ctx_len = ctx_embeddings.shape[1] + + seq_length = (input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]) + ctx_len + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length].cast(paddle.int64) + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + # for each input embeddings, add the ctx embeddings at the correct position + input_embeds_ctx = [] + bsz = inputs_embeds.shape[0] + + if ctx_embeddings is not None: + for i in range(bsz): + cbp = ctx_begin_pos[i] + + prefix = inputs_embeds[i, :cbp] + # remove the special token embedding + suffix = inputs_embeds[i, cbp:] + + input_embeds_ctx.append(paddle.concat([prefix, ctx_embeddings[i], suffix], axis=0)) + + inputs_embeds = paddle.stack(input_embeds_ctx, axis=0) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/consistency_models/__init__.py 
b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/consistency_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0de04fa38c3109ea181a0c289564b10ae9e49a92 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/consistency_models/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule + +_import_structure = { + "pipeline_consistency_models": ["ConsistencyModelPipeline"], +} + +if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT: + from .pipeline_consistency_models import ConsistencyModelPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..4f22c95d57a1680ad45f763c5bd76591c8d56d2c --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -0,0 +1,1308 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import paddle +import PIL.Image + +from ppdiffusers.transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_5, + LoRAAttnProcessor2_5, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + is_pp_invisible_watermark_available, + logging, + replace_example_docstring, +) +from ...utils.paddle_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput + +if is_pp_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + +from .multicontrolnet import MultiControlNetModel + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install opencv-python paddlenlp ppdiffusers + >>> from ppdiffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL + >>> from ppdiffusers.utils import load_image + >>> import numpy as np + >>> import paddle + + >>> import cv2 + >>> from PIL import Image + + >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" + >>> negative_prompt = "low quality, bad quality, sketches" + + >>> # download an image + >>> image = load_image( + ... "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" + ... ) + + >>> # initialize the models and pipeline + >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization + >>> controlnet = ControlNetModel.from_pretrained( + ... "diffusers/controlnet-canny-sdxl-1.0", paddle_dtype=paddle.float16 + ... ) + >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", paddle_dtype=paddle.float16) + >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, paddle_dtype=paddle.float16 + ... ) + + >>> # get canny image + >>> image = np.array(image) + >>> image = cv2.Canny(image, 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> # generate image + >>> image = pipe( + ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image + ... ).images[0] + ``` +""" + + +class StableDiffusionXLControlNetPipeline( + DiffusionPipeline, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]): + Second frozen text-encoder + ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + tokenizer_2 ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the `unet` during the denoising process. If you set multiple + ControlNets as a list, the outputs from each ControlNet are added together to create one combined + additional conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings should always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [pp_invisible_watermark](https://github.com/junnyu/pp-invisible-watermark/) library to + watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no + watermarker is used. 
+ """ + + # leave controlnet out on purpose because it iterates with unet + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "feature_extractor", + "image_encoder", + ] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + add_watermarker = add_watermarker if add_watermarker is not None else is_pp_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + + # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + pooled_prompt_embeds: Optional[paddle.Tensor] = None, + negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = 
text_encoder(text_input_ids, output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = paddle.concat(prompt_embeds_list, axis=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = paddle.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = paddle.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pd", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids, + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = paddle.concat(negative_prompt_embeds_list, axis=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder_2.dtype) + else: + prompt_embeds = prompt_embeds.cast(dtype=self.unet.dtype) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder_2.dtype) + else: + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.unet.dtype) + + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + + pooled_prompt_embeds = pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape( + [bs_embed * num_images_per_prompt, -1] + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape( + [bs_embed * num_images_per_prompt, -1] + ) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, num_images_per_prompt): + dtype = next(self.image_encoder.named_parameters())[1].dtype + + if not isinstance(image, paddle.Tensor): + image = self.feature_extractor(image, return_tensors="pd").pixel_values + + image = image.cast(dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, axis=0) + + uncond_image_embeds = paddle.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
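+ # `inspect.signature(self.scheduler.step)` is inspected below so that `eta` and `generator`
+ # are only forwarded to schedulers whose `step()` actually accepts them.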
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + image, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + if isinstance(self.controlnet, ControlNetModel): + self.check_image(image, prompt, prompt_embeds) + elif isinstance(self.controlnet, MultiControlNetModel): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if isinstance(self.controlnet, ControlNetModel): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif isinstance(self.controlnet, MultiControlNetModel): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. 
Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, paddle.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, axis=0) + + image = image.cast(dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = paddle.concat([image] * 2) + + return image + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, dtype=dtype) + else: + latents = latents.cast(dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = paddle.to_tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=paddle.float32) + use_paddle_2_5_or_ppxformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_5, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_5, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_paddle_2_5_or_ppxformers: + self.vae.post_quant_conv.to(dtype=dtype) + self.vae.decoder.conv_in.to(dtype=dtype) + self.vae.decoder.mid_block.to(dtype=dtype) + + # Copied from ppdiffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=paddle.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`paddle.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `paddle.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = paddle.log(paddle.to_tensor(10000.0)) / (half_dim - 1) + emb = paddle.exp(paddle.arange(half_dim, dtype=dtype) * -emb) + emb = w.cast(dtype=dtype)[:, None] * emb[None, :] + emb = paddle.concat([paddle.sin(emb), paddle.cos(emb)], axis=1) + if embedding_dim % 2 == 1: + emb = paddle.concat(emb, paddle.zeros([emb.shape[0], 1]), axis=-1) + assert emb.shape == [w.shape[0], embedding_dim] + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
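+ # When `do_classifier_free_guidance` is True, the denoising loop below runs the UNet on a doubled
+ # batch and combines the two predictions as
+ #     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ # so guidance is effectively off for `guidance_scale <= 1`, or when the UNet is conditioned on the
+ # guidance scale directly via `time_cond_proj_dim`.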
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @paddle.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + pooled_prompt_embeds: Optional[paddle.Tensor] = None, + negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders. + image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single ControlNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. 
Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 5.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2` + and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): + A [`paddle.Generator`] to make generation deterministic. + + latents (`paddle.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, pooled text embeddings are generated from `prompt` input argument. + negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt + weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input + argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. 
Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned containing the output images. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + image, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3.1 Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt, + prompt_2, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # 3.2 Encode ip_adapter_image + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = paddle.concat([negative_image_embeds, image_embeds]) + + # 4. Prepare image + if isinstance(controlnet, ControlNetModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = image.shape[-2:] + elif isinstance(controlnet, MultiControlNetModel): + images = [] + + for image_ in image: + image_ = self.prepare_image( + image=image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + images.append(image_) + + image = images + height, width = image[0].shape[-2:] + else: + assert False + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) + + # 6. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = paddle.to_tensor([self.guidance_scale - 1]).tile( + [ + batch_size * num_images_per_prompt, + ] + ) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).cast(dtype=latents.dtype) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 7.2 Prepare added time ids & embeddings + if isinstance(image, list): + original_size = original_size or tuple(image[0].shape[-2:]) + else: + original_size = original_size or tuple(image.shape[-2:]) + target_size = target_size or (height, width) + + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds], axis=0) + add_text_embeds = paddle.concat([negative_pooled_prompt_embeds, add_text_embeds], axis=0) + add_time_ids = paddle.concat([negative_add_time_ids, add_time_ids], axis=0) + + add_time_ids = add_time_ids.tile([batch_size * num_images_per_prompt, 1]) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. 
+ control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + controlnet_added_cond_kwargs = { + "text_embeds": add_text_embeds.chunk(2)[1], + "time_ids": add_time_ids.chunk(2)[1], + } + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_added_cond_kwargs = added_cond_kwargs + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + added_cond_kwargs=controlnet_added_cond_kwargs, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat( + [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] + ) + + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # manually for max memory savings + # if self.vae.dtype in [paddle.float16, "float16"] and self.vae.config.force_upcast: + # self.upcast_vae() + # latents = latents.cast(dtype=next(iter(self.vae.post_quant_conv.named_parameters()))[1].dtype) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype in 
[paddle.float16, "float16"] and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.cast(dtype=next(iter(self.vae.post_quant_conv.named_parameters()))[1].dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=paddle.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..62eae6ad873171dc0f578593d9dabb88271c519f --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule + +_import_structure = {"pipeline_dit": ["DiTPipeline"]} + +if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT: + from .pipeline_dit import DiTPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab67b337fc3f0f4ab5030f5ea3f81110734f191 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/__init__.py @@ -0,0 +1,63 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
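# A minimal usage sketch for the SDXL + ControlNet `__call__` added earlier in this diff.
# The pipeline class, checkpoint names, and helper imports are assumptions that mirror the
# diffusers-style API ppdiffusers follows; adjust them to what this package actually exports.
import paddle
from ppdiffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
from ppdiffusers.utils import load_image

controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0", paddle_dtype=paddle.float16)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, paddle_dtype=paddle.float16
)
canny_image = load_image("path/or/url/to/canny_edges.png")  # hypothetical conditioning image
image = pipe(
    prompt="a futuristic city at dusk",
    image=canny_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=0.5,  # scales the ControlNet residuals
    control_guidance_start=0.0,         # apply ControlNet from the first step...
    control_guidance_end=0.8,           # ...and drop it for the last 20% of steps
).images[0]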
+ +from typing import TYPE_CHECKING + +from ...utils import ( + PPDIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_paddle_available, + is_paddlenlp_available, +) + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_paddlenlp_available() and is_paddle_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_paddle_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_paddle_objects)) +else: + _import_structure["scheduling_karras_ve"] = ["KarrasVeScheduler"] + _import_structure["scheduling_sde_vp"] = ["ScoreSdeVpScheduler"] + +if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT: + try: + if not is_paddle_available(): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ..utils.dummy_pd_objects import * # noqa F403 + else: + from .scheduling_karras_ve import KarrasVeScheduler + from .scheduling_sde_vp import ScoreSdeVpScheduler + + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_karras_ve.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_karras_ve.py new file mode 100644 index 0000000000000000000000000000000000000000..3d50991869b7224e45d57950da5f44dc369adeac --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_karras_ve.py @@ -0,0 +1,243 @@ +# Copyright 2023 NVIDIA and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import paddle + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ...utils.paddle_utils import randn_tensor +from ..scheduling_utils import SchedulerMixin + + +@dataclass +class KarrasVeOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + derivative (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Derivative of predicted original image sample (x_0). + pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. 
+ """ + + prev_sample: paddle.Tensor + derivative: paddle.Tensor + pred_original_sample: Optional[paddle.Tensor] = None + + +class KarrasVeScheduler(SchedulerMixin, ConfigMixin): + """ + A stochastic scheduler tailored to variance-expanding models. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + + + For more details on the parameters, see [Appendix E](https://arxiv.org/abs/2206.00364). The grid search values used + to find the optimal `{s_noise, s_churn, s_min, s_max}` for a specific model are described in Table 5 of the paper. + + + + Args: + sigma_min (`float`, defaults to 0.02): + The minimum noise magnitude. + sigma_max (`float`, defaults to 100): + The maximum noise magnitude. + s_noise (`float`, defaults to 1.007): + The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000, + 1.011]. + s_churn (`float`, defaults to 80): + The parameter controlling the overall amount of stochasticity. A reasonable range is [0, 100]. + s_min (`float`, defaults to 0.05): + The start value of the sigma range to add noise (enable stochasticity). A reasonable range is [0, 10]. + s_max (`float`, defaults to 50): + The end value of the sigma range to add noise. A reasonable range is [0.2, 80]. + """ + + order = 2 + + @register_to_config + def __init__( + self, + sigma_min: float = 0.02, + sigma_max: float = 100, + s_noise: float = 1.007, + s_churn: float = 80, + s_min: float = 0.05, + s_max: float = 50, + ): + # standard deviation of the initial noise distribution + self.init_noise_sigma = sigma_max + + # setable values + self.num_inference_steps: int = None + self.timesteps: paddle.Tensor = None + self.schedule: paddle.Tensor = None # sigma(t_i) + + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`paddle.Tensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `paddle.Tensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() + self.timesteps = paddle.to_tensor(timesteps) + schedule = [ + ( + self.config.sigma_max**2 + * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1)) + ) + for i in self.timesteps + ] + self.schedule = paddle.to_tensor(schedule, dtype=paddle.float32) + + def add_noise_to_input( + self, sample: paddle.Tensor, sigma: float, generator: Optional[paddle.Generator] = None + ) -> Tuple[paddle.Tensor, float]: + """ + Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a + higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`. + + Args: + sample (`paddle.Tensor`): + The input sample. 
+ sigma (`float`): + generator (`paddle.Generator`, *optional*): + A random number generator. + """ + if self.config.s_min <= sigma <= self.config.s_max: + gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1) + else: + gamma = 0 + + # sample eps ~ N(0, S_noise^2 * I) + eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator) + sigma_hat = sigma + gamma * sigma + sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps) + + return sample_hat, sigma_hat + + def step( + self, + model_output: paddle.Tensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: paddle.Tensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`paddle.Tensor`): + The direct output from learned diffusion model. + sigma_hat (`float`): + sigma_prev (`float`): + sample_hat (`paddle.Tensor`): + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] is returned, + otherwise a tuple is returned where the first element is the sample tensor. + + """ + + pred_original_sample = sample_hat + sigma_hat * model_output + derivative = (sample_hat - pred_original_sample) / sigma_hat + sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative + + if not return_dict: + return (sample_prev, derivative) + + return KarrasVeOutput( + prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample + ) + + def step_correct( + self, + model_output: paddle.Tensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: paddle.Tensor, + sample_prev: paddle.Tensor, + derivative: paddle.Tensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: + """ + Corrects the predicted sample based on the `model_output` of the network. + + Args: + model_output (`paddle.Tensor`): + The direct output from learned diffusion model. + sigma_hat (`float`): TODO + sigma_prev (`float`): TODO + sample_hat (`paddle.Tensor`): TODO + sample_prev (`paddle.Tensor`): TODO + derivative (`paddle.Tensor`): TODO + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. + + Returns: + prev_sample (TODO): updated sample in the diffusion chain. 
derivative (TODO): TODO + + """ + pred_original_sample = sample_prev + sigma_prev * model_output + derivative_corr = (sample_prev - pred_original_sample) / sigma_prev + sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr) + + if not return_dict: + return (sample_prev, derivative) + + return KarrasVeOutput( + prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample + ) + + def add_noise(self, original_samples, noise, timesteps): + raise NotImplementedError() diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_sde_vp.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_sde_vp.py new file mode 100644 index 0000000000000000000000000000000000000000..c63036c9f4b894e8ca1f33701b9082e0606ff52e --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_sde_vp.py @@ -0,0 +1,110 @@ +# Copyright 2023 Google Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch + +import math + +import paddle + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils.paddle_utils import randn_tensor +from ..scheduling_utils import SchedulerMixin + + +class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin): + """ + `ScoreSdeVpScheduler` is a variance preserving stochastic differential equation (SDE) scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 2000): + The number of diffusion steps to train the model. + beta_min (`int`, defaults to 0.1): + beta_max (`int`, defaults to 20): + sampling_eps (`int`, defaults to 1e-3): + The end value of sampling where timesteps decrease progressively from 1 to epsilon. + """ + + order = 1 + + @register_to_config + def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3): + self.sigmas = None + self.discrete_sigmas = None + self.timesteps = None + + def set_timesteps(self, num_inference_steps): + """ + Sets the continuous timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.timesteps = paddle.linspace(1, self.config.sampling_eps, num_inference_steps) + + def step_pred(self, score, x, t, generator=None): + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). 
+ + Args: + score (): + x (): + t (): + generator (`paddle.Generator`, *optional*): + A random number generator. + """ + if self.timesteps is None: + raise ValueError( + "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" + ) + + # TODO(Patrick) better comments + non-Paddle + # postprocess model score + log_mean_coeff = ( + -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min + ) + std = paddle.sqrt(1.0 - paddle.exp(2.0 * log_mean_coeff)) + std = std.flatten() + while len(std.shape) < len(score.shape): + std = std.unsqueeze(-1) + score = -score / std + + # compute + dt = -1.0 / len(self.timesteps) + + beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min) + beta_t = beta_t.flatten() + while len(beta_t.shape) < len(x.shape): + beta_t = beta_t.unsqueeze(-1) + drift = -0.5 * beta_t * x + + diffusion = paddle.sqrt(beta_t) + drift = drift - diffusion**2 * score + x_mean = x + drift * dt + + # add noise + noise = randn_tensor(x.shape, generator=generator, dtype=x.dtype) + x = x_mean + diffusion * math.sqrt(-dt) * noise + + return x, x_mean + + def __len__(self): + return self.config.num_train_timesteps diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ecff93753b32dea4e0625006b6d457681611a8d6 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
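# A brief sketch of how the `ScoreSdeVpScheduler.step_pred` reverse VP-SDE update above
# can be driven. `score_model` is a hypothetical callable (a trained score network), not
# an API of this repository; everything else uses only methods defined in the file above.
import paddle
from ppdiffusers.schedulers.deprecated import ScoreSdeVpScheduler

def sample_vp_sde(score_model, shape=(1, 3, 32, 32), num_inference_steps=1000):
    scheduler = ScoreSdeVpScheduler()
    scheduler.set_timesteps(num_inference_steps)
    x = paddle.randn(shape)
    for t in scheduler.timesteps:
        t_batch = paddle.full([shape[0]], float(t))  # continuous time, one value per sample
        score = score_model(x, t_batch)              # hypothetical score-network call
        x, x_mean = scheduler.step_pred(score, x, t_batch)
    return x_mean  # noise-free mean from the final reverse step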
+# flake8: noqa + +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_scipy_available, +) + +try: + if not is_paddle_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_paddle_objects import * # noqa F403 +else: + from .preconfig_scheduling_euler_ancestral_discrete import ( + PreconfigEulerAncestralDiscreteScheduler, + ) +try: + if not (is_paddle_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_paddle_and_scipy_objects import * # noqa F403 +else: + from .preconfig_scheduling_lms_discrete import PreconfigLMSDiscreteScheduler diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py new file mode 100644 index 0000000000000000000000000000000000000000..b45428de9e12c35acf25d98c53df0f773e1e5ed1 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py @@ -0,0 +1,313 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput, logging, randn_tensor +from ..scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete +class PreconfigEulerAncestralDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: paddle.Tensor + pred_original_sample: Optional[paddle.Tensor] = None + + +# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. 
+ + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + + def alpha_bar(time_step): + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return paddle.to_tensor(betas, dtype=paddle.float32) + + +class PreconfigEulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson: + https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + preconfig: bool = True, + ): + if trained_betas is not None: + self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) + elif beta_schedule == "linear": + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+ self.betas = ( + paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 + ) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = paddle.cumprod(self.alphas, 0) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) + self.sigmas = paddle.to_tensor(sigmas) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = self.sigmas.max() + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() + self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) + self.is_scale_input_called = False + self.preconfig = preconfig + self.step_index_offset = 0 + + def scale_model_input( + self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs + ) -> paddle.Tensor: + """ + Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + + Args: + sample (`paddle.Tensor`): input sample + timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain + + Returns: + `paddle.Tensor`: scaled input sample + """ + self.is_scale_input_called = True + if kwargs.get("step_index") is not None: + step_index = kwargs["step_index"] + self.step_index_offset + else: + step_index = (self.timesteps == timestep).nonzero().item() + + if not self.preconfig: + sigma = self.sigmas[step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + else: + if step_index > (len(self.latent_scales) - 1): + step_index = -1 + return sample * self.latent_scales[step_index] + + def set_timesteps(self, num_inference_steps: int): + """ + Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
+ """ + self.num_inference_steps = num_inference_steps + self.step_index_offset = 0 + + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + self.sigmas = paddle.to_tensor(sigmas) + self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) + if self.preconfig: + self.sigma_up = [] + self.sigma_down = [] + for step_index_i in range(len(self.timesteps)): + sigma_from = self.sigmas[step_index_i] + sigma_to = self.sigmas[step_index_i + 1] + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 + self.sigma_up.append(sigma_up) + self.sigma_down.append(sigma_down) + self.latent_scales = 1 / ((self.sigmas**2 + 1) ** 0.5) + + def step( + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + **kwargs + ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`paddle.Tensor`): direct output from learned diffusion model. + timestep (`float`): current timestep in the diffusion chain. + sample (`paddle.Tensor`): + current instance of sample being created by diffusion process. + generator (`paddle.Generator`, optional): Random number generator. + return_dict (`bool`): option for returning tuple rather than PreconfigEulerAncestralDiscreteSchedulerOutput class + + Returns: + [`~schedulers.scheduling_utils.PreconfigEulerAncestralDiscreteSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.PreconfigEulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise + a `tuple`. When returning a tuple, the first element is the sample tensor. + + """ + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + if kwargs.get("return_pred_original_sample") is not None: + return_pred_original_sample = kwargs["return_pred_original_sample"] + else: + return_pred_original_sample = True + if kwargs.get("step_index") is not None: + step_index = kwargs["step_index"] + self.step_index_offset + else: + step_index = (self.timesteps == timestep).nonzero().item() + sigma = self.sigmas[step_index] + if self.config.prediction_type == "epsilon" and not return_pred_original_sample: + derivative = model_output + pred_original_sample = None + else: + # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma * model_output + elif self.config.prediction_type == "v_prediction": + # * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + elif self.config.prediction_type == "sample": + raise NotImplementedError("prediction_type not implemented yet: sample") + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + derivative = (sample - pred_original_sample) / sigma + if not self.preconfig: + sigma_from = self.sigmas[step_index] + sigma_to = self.sigmas[step_index + 1] + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 + else: + sigma_up = self.sigma_up[step_index] + sigma_down = self.sigma_down[step_index] + # 2. Convert to an ODE derivative + dt = sigma_down - sigma + prev_sample = sample + derivative * dt + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) + prev_sample = prev_sample + noise * sigma_up + if not return_dict: + if not return_pred_original_sample: + return (prev_sample,) + else: + return (prev_sample, pred_original_sample) + + return PreconfigEulerAncestralDiscreteSchedulerOutput( + prev_sample=prev_sample, pred_original_sample=pred_original_sample + ) + + def add_noise( + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + # Fix 0D tensor + if paddle.is_tensor(timesteps) and timesteps.ndim == 0: + timesteps = timesteps.unsqueeze(0) + # Make sure sigmas and timesteps have the same dtype as original_samples + self.sigmas = self.sigmas.cast(original_samples.dtype) + + schedule_timesteps = self.timesteps + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = self.sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py new file mode 100644 index 0000000000000000000000000000000000000000..450dcb635843e07edd7737d2230b9c6ab7502cd3 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py @@ -0,0 +1,340 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
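# A short sketch of the fast path enabled by `preconfig=True` above: once `set_timesteps()`
# has cached sigma_up/sigma_down and the latent scales, the denoising loop can pass
# `step_index` and skip the per-step timestep lookup. `unet` is a hypothetical denoiser
# callable, not part of this file.
import paddle
from ppdiffusers.schedulers.preconfig import PreconfigEulerAncestralDiscreteScheduler

def denoise_euler_ancestral(unet, latents, num_inference_steps=50):
    scheduler = PreconfigEulerAncestralDiscreteScheduler(preconfig=True)
    scheduler.set_timesteps(num_inference_steps)
    latents = latents * scheduler.init_noise_sigma
    for i, t in enumerate(scheduler.timesteps):
        model_input = scheduler.scale_model_input(latents, t, step_index=i)
        noise_pred = unet(model_input, t)  # hypothetical UNet call
        latents = scheduler.step(noise_pred, t, latents, step_index=i, return_dict=False)[0]
    return latents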
+import math +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +from scipy import integrate + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ..scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete +class PreconfigLMSDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: paddle.Tensor + pred_original_sample: Optional[paddle.Tensor] = None + + +# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + + def alpha_bar(time_step): + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return paddle.to_tensor(betas, dtype=paddle.float32) + + +class PreconfigLMSDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by + Katherine Crowson: + https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. 
+ prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + preconfig=True, + ): + if trained_betas is not None: + self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) + elif beta_schedule == "linear": + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = ( + paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 + ) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = paddle.cumprod(self.alphas, 0) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) + self.sigmas = paddle.to_tensor(sigmas) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = self.sigmas.max() + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() + self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) + self.derivatives = [] + self.is_scale_input_called = False + self.preconfig = preconfig + + def scale_model_input( + self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs + ) -> paddle.Tensor: + """ + Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. + + Args: + sample (`paddle.Tensor`): input sample + timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain + + Returns: + `paddle.Tensor`: scaled input sample + """ + if kwargs.get("step_index") is not None: + step_index = kwargs["step_index"] + else: + step_index = (self.timesteps == timestep).nonzero().item() + self.is_scale_input_called = True + if not self.preconfig: + sigma = self.sigmas[step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + else: + return sample * self.latent_scales[step_index] + + def get_lms_coefficient(self, order, t, current_order): + """ + Compute a linear multistep coefficient. + + Args: + order (TODO): + t (TODO): + current_order (TODO): + """ + + def lms_derivative(tau): + prod = 1.0 + for k in range(order): + if current_order == k: + continue + prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) + return prod + + integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + + return integrated_coeff + + def set_timesteps(self, num_inference_steps: int, preconfig_order: int = 4): + """ + Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
+ + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + """ + self.num_inference_steps = num_inference_steps + + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + self.sigmas = paddle.to_tensor(sigmas) + self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) + + self.derivatives = [] + if self.preconfig: + self.order = preconfig_order + self.lms_coeffs = [] + self.latent_scales = [1.0 / ((sigma**2 + 1) ** 0.5) for sigma in self.sigmas] + for step_index in range(self.num_inference_steps): + order = min(step_index + 1, preconfig_order) + self.lms_coeffs.append( + [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] + ) + + def step( + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + order: int = 4, + return_dict: bool = True, + **kwargs + ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`paddle.Tensor`): direct output from learned diffusion model. + timestep (`float`): current timestep in the diffusion chain. + sample (`paddle.Tensor`): + current instance of sample being created by diffusion process. + order: coefficient for multi-step inference. + return_dict (`bool`): option for returning tuple rather than PreconfigLMSDiscreteSchedulerOutput class + Args in kwargs: + step_index (`int`): + return_pred_original_sample (`bool`): option for return pred_original_sample + + Returns: + [`~schedulers.scheduling_utils.PreconfigLMSDiscreteSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.PreconfigLMSDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is the sample tensor. + + """ + if not self.is_scale_input_called: + warnings.warn( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + if kwargs.get("return_pred_original_sample") is not None: + return_pred_original_sample = kwargs["return_pred_original_sample"] + else: + return_pred_original_sample = True + if kwargs.get("step_index") is not None: + step_index = kwargs["step_index"] + else: + step_index = (self.timesteps == timestep).nonzero().item() + if self.config.prediction_type == "epsilon" and not return_pred_original_sample: + # if pred_original_sample is no need + self.derivatives.append(model_output) + pred_original_sample = None + else: + sigma = self.sigmas[step_index] + # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma * model_output + elif self.config.prediction_type == "v_prediction": + # * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + # 2. Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + self.derivatives.append(derivative) + + if len(self.derivatives) > order: + self.derivatives.pop(0) + + if not self.preconfig: + # 3. If not preconfiged, compute linear multistep coefficients. + order = min(step_index + 1, order) + lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] + # 4. Compute previous sample based on the derivatives path + prev_sample = sample + sum( + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) + ) + else: + # 3. If preconfiged, direct compute previous sample based on the derivatives path + prev_sample = sample + sum( + coeff * derivative + for coeff, derivative in zip(self.lms_coeffs[step_index], reversed(self.derivatives)) + ) + + if not return_dict: + if not return_pred_original_sample: + return (prev_sample,) + else: + return (prev_sample, pred_original_sample) + + return PreconfigLMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def add_noise( + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + # Fix 0D tensor + if paddle.is_tensor(timesteps) and timesteps.ndim == 0: + timesteps = timesteps.unsqueeze(0) + # Make sure sigmas and timesteps have the same dtype as original_samples + sigmas = self.sigmas.cast(original_samples.dtype) + schedule_timesteps = self.timesteps + + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f82ac7ab81aa87600e3dfab5ecd9550fee617c4f --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/__init__.py @@ -0,0 +1,218 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
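# A small sketch of what `preconfig=True` precomputes for the LMS scheduler above:
# per-step linear-multistep coefficients and latent scales, so `step()` reduces to
# `sample + sum(coeff * derivative)` over the cached derivative history.
from ppdiffusers.schedulers.preconfig import PreconfigLMSDiscreteScheduler

scheduler = PreconfigLMSDiscreteScheduler(preconfig=True)
scheduler.set_timesteps(num_inference_steps=25, preconfig_order=4)
print(len(scheduler.lms_coeffs))          # 25 coefficient lists, one per inference step
print(len(scheduler.lms_coeffs[10]))      # min(step_index + 1, preconfig_order) -> 4 here
print(float(scheduler.latent_scales[0]))  # 1 / sqrt(sigma_0**2 + 1), used by scale_model_input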
+ +import os +from pathlib import Path +from typing import Dict, Literal, Optional, Union + +from huggingface_hub.utils import ( + EntryNotFoundError, + LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) +from requests import HTTPError + +from .aistudio_hub_download import ( + aistudio_hub_download, + aistudio_hub_file_exists, + aistudio_hub_try_to_load_from_cache, +) +from .bos_download import bos_download, bos_file_exists, bos_try_to_load_from_cache +from .hf_hub_download import ( + hf_hub_download, + hf_hub_file_exists, + hf_hub_try_to_load_from_cache, +) + + +def bos_aistudio_hf_download( + repo_id: str = None, + filename: str = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + etag_timeout: float = 10, + resume_download: bool = False, + token: Union[bool, str, None] = None, + local_files_only: bool = False, + endpoint: Optional[str] = None, + url: Optional[str] = None, + from_bos: bool = True, + from_aistudio: bool = False, + from_hf_hub: bool = False, +) -> str: + assert repo_id is not None, "repo_id cannot be None" + assert filename is not None, "filename cannot be None" + + download_kwargs = dict( + repo_id=repo_id, + filename=filename, + subfolder=subfolder if subfolder is not None else "", + repo_type=repo_type, + revision=revision, + library_name=library_name, + library_version=library_version, + cache_dir=cache_dir, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + user_agent=user_agent, + force_download=force_download, + proxies=proxies, + etag_timeout=etag_timeout, + resume_download=resume_download, + token=token, + local_files_only=local_files_only, + endpoint=endpoint, + ) + cached_file = None + log_endpoint = "N/A" + log_filename = os.path.join(download_kwargs["subfolder"], filename) + try: + if from_aistudio: + log_endpoint = "Aistudio Hub" + cached_file = aistudio_hub_download( + **download_kwargs, + ) + elif from_hf_hub: + log_endpoint = "Huggingface Hub" + cached_file = hf_hub_download( + **download_kwargs, + ) + else: + log_endpoint = "BOS" + download_kwargs["url"] = url + cached_file = bos_download( + **download_kwargs, + ) + except LocalEntryNotFoundError: + raise EnvironmentError( + "Cannot find the requested files in the cached path and" + " outgoing traffic has been disabled. To enable model look-ups" + " and downloads online, set 'local_files_only' to False." + ) + except RepositoryNotFoundError: + raise EnvironmentError( + f"{repo_id} is not a local folder and is not a valid model identifier " + f"listed on '{log_endpoint}'\nIf this is a private repository, make sure to pass a " + "token having permission to this repo." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " + "this model name. Check the model page at " + f"'{log_endpoint}' for available revisions." 
+ ) + except EntryNotFoundError: + raise EnvironmentError(f"{repo_id} does not appear to have a file named {log_filename}.") + except HTTPError as err: + raise EnvironmentError(f"There was a specific connection error when trying to load {repo_id}:\n{err}") + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{log_endpoint}' to load this model, couldn't find it" + f" in the cached files and it looks like {repo_id} is not the path to a" + f" directory containing a file named {log_filename} or" + " \nCheckout your internet connection or see how to run the library in offline mode." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load the model for '{repo_id}'. If you were trying to load it from " + f"'{log_endpoint}', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{repo_id}' is the correct path to a directory " + f"containing a file named {log_filename}" + ) + return cached_file + + +def bos_aistudio_hf_file_exist( + repo_id: str = None, + filename: str = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Optional[str] = None, + endpoint: Optional[str] = None, + from_bos: bool = True, + from_aistudio: bool = False, + from_hf_hub: bool = False, +): + assert repo_id is not None, "repo_id cannot be None" + assert filename is not None, "filename cannot be None" + + if subfolder is None: + subfolder = "" + filename = os.path.join(subfolder, filename) + if from_aistudio: + out = aistudio_hub_file_exists( + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + token=token, + endpoint=endpoint, + ) + elif from_hf_hub: + out = hf_hub_file_exists( + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + token=token, + ) + else: + out = bos_file_exists( + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + token=token, # donot need token + endpoint=endpoint, + ) + return out + + +def bos_aistudio_hf_try_to_load_from_cache( + repo_id: str, + filename: str, + cache_dir: Union[str, Path, None] = None, + subfolder: str = None, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + from_bos: bool = True, + from_aistudio: bool = False, + from_hf_hub: bool = False, +): + if subfolder is None: + subfolder = "" + load_kwargs = dict( + repo_id=repo_id, + filename=os.path.join(subfolder, filename), + cache_dir=cache_dir, + revision=revision, + repo_type=repo_type, + ) + if from_aistudio: + return aistudio_hub_try_to_load_from_cache(**load_kwargs) + elif from_hf_hub: + return hf_hub_try_to_load_from_cache(**load_kwargs) + else: + return bos_try_to_load_from_cache(**load_kwargs) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/aistudio_hub_download.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/aistudio_hub_download.py new file mode 100644 index 0000000000000000000000000000000000000000..de8f4bc00cb4db6efbdb36248877f452e216434f --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/aistudio_hub_download.py @@ -0,0 +1,729 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import logging +import os +import re +import shutil +import tempfile +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Dict, Generator, Literal, Optional, Union +from urllib.parse import quote + +import requests +from filelock import FileLock +from huggingface_hub.utils import ( + EntryNotFoundError, + FileMetadataError, + GatedRepoError, + HfHubHTTPError, + LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) + +logger = logging.getLogger(__name__) + +from .common import ( + _CACHED_NO_EXIST, + DEFAULT_ETAG_TIMEOUT, + DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD, + DEFAULT_REQUEST_TIMEOUT, + AistudioBosFileMetadata, + OfflineModeIsEnabled, + _cache_commit_hash_for_specific_revision, + _check_disk_space, + _chmod_and_replace, + _create_symlink, + _get_pointer_path, + _is_true, + _normalize_etag, + _request_wrapper, + _to_local_dir, + http_get, + raise_for_status, + repo_folder_name, +) + +VERSION = "0.1.5" +ENDPOINT = os.getenv("AISTUDIO_ENDPOINT", "http://git.aistudio.baidu.com") + +AISTUDIO_URL_TEMPLATE = ENDPOINT + "/api/v1/repos/{user_name}/{repo_name}/contents/{filename}" + + +default_home = os.path.join(os.path.expanduser("~"), ".cache") +AISTUDIO_HOME = os.path.expanduser( + os.getenv( + "AISTUDIO_HOME", + os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "paddle"), + ) +) +default_cache_path = os.path.join(AISTUDIO_HOME, "aistudio") +AISTUDIO_HUB_CACHE = os.getenv("AISTUDIO_HUB_CACHE", default_cache_path) + + +DEFAULT_REVISION = "master" +REPO_TYPE_MODEL = "model" +REPO_TYPES = [None, REPO_TYPE_MODEL] + + +REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") + + +# TOKEN +AISTUDIO_TOKEN_PATH = os.path.join(AISTUDIO_HOME, "token") +AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(os.environ.get("AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN")) + + +class LocalTokenNotFoundError(EnvironmentError): + """Raised if local token is required but not found.""" + + +def _clean_token(token: Optional[str]) -> Optional[str]: + """Clean token by removing trailing and leading spaces and newlines. + + If token is an empty string, return None. + """ + if token is None: + return None + return token.replace("\r", "").replace("\n", "").strip() or None + + +def _get_token_from_environment() -> Optional[str]: + return _clean_token(os.environ.get("AISTUDIO_ACCESS_TOKEN") or os.environ.get("AISTUDIO_TOKEN")) + + +def _get_token_from_file() -> Optional[str]: + try: + return _clean_token(Path(AISTUDIO_TOKEN_PATH).read_text()) + except FileNotFoundError: + return None + + +def get_token() -> Optional[str]: + """ + Get token if user is logged in. + + Note: in most cases, you should use [`build_aistudio_headers`] instead. This method is only useful + if you want to retrieve the token for other purposes than sending an HTTP request. + + Token is retrieved in priority from the `AISTUDIO_ACCESS_TOKEN` environment variable. Otherwise, we read the token file located + in the Aistudio home folder. Returns None if user is not logged in. + + Returns: + `str` or `None`: The token, `None` if it doesn't exist. 
+ """ + return _get_token_from_environment() or _get_token_from_file() + + +def get_token_to_send(token: Optional[Union[bool, str]]) -> Optional[str]: + """Select the token to send from either `token` or the cache.""" + # Case token is explicitly provided + if isinstance(token, str): + return token + + # Case token is explicitly forbidden + if token is False: + return None + + # Token is not provided: we get it from local cache + cached_token = get_token() + + # Case token is explicitly required + if token is True: + if cached_token is None: + raise LocalTokenNotFoundError( + "Token is required (`token=True`), but no token found. You" + " to provide a token or be logged in to Aistudio Hub . See" + "https://ai.baidu.com/ai-doc/AISTUDIO/slmkadt9z#2-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8%E8%AE%BF%E9%97%AE%E4%BB%A4%E7%89%8C." + ) + return cached_token + + # Case implicit use of the token is forbidden by env variable + if AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN: + return None + + # Otherwise: we use the cached token as the user has not explicitly forbidden it + return cached_token + + +def _validate_token_to_send(token: Optional[str], is_write_action: bool) -> None: + if is_write_action: + if token is None: + raise ValueError( + "Token is required (write-access action) but no token found. You need" + " to provide a token or be logged in to Aistudio Hub . See" + "https://ai.baidu.com/ai-doc/AISTUDIO/slmkadt9z#2-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8%E8%AE%BF%E9%97%AE%E4%BB%A4%E7%89%8C." + ) + + +def build_aistudio_headers( + *, + token: Optional[Union[bool, str]] = None, + is_write_action: bool = False, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, +) -> Dict[str, str]: + # Get auth token to send + token_to_send = get_token_to_send(token) + _validate_token_to_send(token_to_send, is_write_action=is_write_action) + + # Combine headers + headers = {"Content-Type": "application/json", "SDK-Version": str(VERSION)} + if token_to_send is not None: + headers["Authorization"] = f"token {token_to_send}" + return headers + + +def get_aistudio_file_metadata( + url: str, + token: Union[bool, str, None] = None, + proxies: Optional[Dict] = None, + timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, +): + """Fetch metadata of a file versioned on the Hub for a given url. + + Args: + url (`str`): + File url, for example returned by [`aistudio_hub_url`]. + token (`str` or `bool`, *optional*): + A token to be used for the download. + - If `True`, the token is read from the Aistudio config + folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + proxies (`dict`, *optional*): + Dictionary mapping protocol to the URL of the proxy passed to + `requests.request`. + timeout (`float`, *optional*, defaults to 10): + How many seconds to wait for the server to send metadata before giving up. + library_name (`str`, *optional*): + The name of the library to which the object corresponds. + library_version (`str`, *optional*): + The version of the library. + user_agent (`dict`, `str`, *optional*): + The user-agent info in the form of a dictionary or a string. + + Returns: + A [`AistudioBosFileMetadata`] object containing metadata such as location, etag, size and + commit_hash. 
+ """ + headers = build_aistudio_headers( + token=token, library_name=library_name, library_version=library_version, user_agent=user_agent + ) + headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file + + # Retrieve metadata + r = _request_wrapper( + method="GET", + url=url, + headers=headers, + allow_redirects=False, + follow_relative_redirects=True, + proxies=proxies, + timeout=timeout, + ) + raise_for_status(r) + res = r.json() + + # Return + return AistudioBosFileMetadata( + commit_hash=res["sha"], + etag=_normalize_etag(res["last_commit_sha"]), + location=res["git_url"], + size=res["size"], + ) + + +def aistudio_hub_url( + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + endpoint: Optional[str] = None, +) -> str: + if subfolder == "": + subfolder = None + if subfolder is not None: + filename = f"{subfolder}/{filename}" + + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError("Invalid repo type") + if revision is None: + revision = DEFAULT_REVISION + + # NEW ADD + if "/" not in repo_id: + raise ValueError("repo_id must be in the format of 'namespace/name'") + user_name, repo_name = repo_id.split("/") + user_name = user_name.strip() + repo_name = repo_name.strip() + + url = AISTUDIO_URL_TEMPLATE.format( + user_name=quote(user_name, safe=""), repo_name=quote(repo_name, safe=""), filename=quote(filename) + ) + # Update endpoint if provided + if endpoint is not None and url.startswith(ENDPOINT): + url = endpoint + url[len(ENDPOINT) :] + + if revision != "master": + url += f"?ref={quote(revision, safe='')}" + return url + + +def aistudio_hub_download( + repo_id: str = None, + filename: str = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + # TODO + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + etag_timeout: float = DEFAULT_ETAG_TIMEOUT, + resume_download: bool = False, + token: Optional[str] = None, + local_files_only: bool = False, + endpoint: Optional[str] = None, + **kwargs, +): + + if cache_dir is None: + cache_dir = AISTUDIO_HUB_CACHE + if revision is None: + revision = DEFAULT_REVISION + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + if isinstance(local_dir, Path): + local_dir = str(local_dir) + locks_dir = os.path.join(cache_dir, ".locks") + + if subfolder == "": + subfolder = None + if subfolder is not None: + # This is used to create a URL, and not a local path, hence the forward slash. + filename = f"{subfolder}/{filename}" + + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}") + + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) + os.makedirs(storage_folder, exist_ok=True) + + # cross platform transcription of filename, to be used as a local file path. 
+ relative_filename = os.path.join(*filename.split("/")) + if os.name == "nt": + if relative_filename.startswith("..\\") or "\\..\\" in relative_filename: + raise ValueError( + f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository" + " owner to rename this file." + ) + + # if user provides a commit_hash and they already have the file on disk, + # shortcut everything. + # TODO, 当前不支持commit id下载,因此这个肯定跑的。 + if not force_download: # REGEX_COMMIT_HASH.match(revision) + pointer_path = _get_pointer_path(storage_folder, revision, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + url = aistudio_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint) + + headers = build_aistudio_headers( + token=token, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + ) + url_to_download = url.replace("/contents/", "/media/") + + etag = None + commit_hash = None + expected_size = None + head_call_error: Optional[Exception] = None + if not local_files_only: + try: + try: + metadata = get_aistudio_file_metadata( + url=url, + token=token, + proxies=proxies, + timeout=etag_timeout, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + ) + except EntryNotFoundError as http_error: # noqa: F841 + raise + # Commit hash must exist + # TODO,这里修改了commit hash,强迫为revision了。 + commit_hash = revision # metadata.commit_hash + if commit_hash is None: + raise FileMetadataError( + "Distant resource does not seem to be on aistudio hub. It is possible that a configuration issue" + " prevents you from downloading resources from aistudio hub. Please check your firewall" + " and proxy settings and make sure your SSL certificates are updated." + ) + + # Etag must exist + etag = metadata.etag + # We favor a custom header indicating the etag of the linked resource, and + # we fallback to the regular etag header. + # If we don't have any of those, raise an error. + if etag is None: + raise FileMetadataError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + + # Expected (uncompressed) size + expected_size = metadata.size + + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError + raise + except ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + OfflineModeIsEnabled, + ) as error: + # Otherwise, our Internet connection is down. + # etag is None + head_call_error = error + pass + except (RevisionNotFoundError, EntryNotFoundError): + # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) + raise + except requests.HTTPError as error: + # Multiple reasons for an http error: + # - Repository is private and invalid/missing token sent + # - Repository is gated and invalid/missing token sent + # - Hub is down (error 500 or 504) + # => let's switch to 'local_files_only=True' to check if the files are already cached. 
+ # (if it's not the case, the error will be re-raised) + head_call_error = error + pass + except FileMetadataError as error: + # Multiple reasons for a FileMetadataError: + # - Wrong network configuration (proxy, firewall, SSL certificates) + # - Inconsistency on the Hub + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + head_call_error = error + pass + + # etag can be None for several reasons: + # 1. we passed local_files_only. + # 2. we don't have a connection + # 3. Hub is down (HTTP 500 or 504) + # 4. repo is not found -for example private or gated- and invalid/missing token sent + # 5. Hub is blocked by a firewall or proxy is not set correctly. + # => Try to get the last downloaded one from the specified revision. + # + # If the specified revision is a commit hash, look inside "snapshots". + # If the specified revision is a branch or tag, look inside "refs". + if etag is None: + # In those cases, we cannot force download. + if force_download: + raise ValueError( + "We have no connection or you passed local_files_only, so force_download is not an accepted option." + ) + + # Try to get "commit_hash" from "revision" + commit_hash = None + if REGEX_COMMIT_HASH.match(revision): + commit_hash = revision + else: + ref_path = os.path.join(storage_folder, "refs", revision) + if os.path.isfile(ref_path): + with open(ref_path) as f: + commit_hash = f.read() + + # Return pointer file if exists + if commit_hash is not None: + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir( + pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks + ) + return pointer_path + + # If we couldn't find an appropriate file on disk, raise an error. + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise LocalEntryNotFoundError( + "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable" + " aistudio hub look-ups and downloads online, set 'local_files_only' to False." + ) + elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError): + # Repo not found => let's raise the actual error + raise head_call_error + else: + # Otherwise: most likely a connection issue or Hub downtime => let's warn the user + raise LocalEntryNotFoundError( + "An error happened while trying to locate the file on the Hub and we cannot find the requested files" + " in the local cache. Please check your connection and try again or make sure your Internet connection" + " is on." + ) from head_call_error + + # From now on, etag and commit_hash are not None. + assert etag is not None, "etag must have been retrieved from server" + assert commit_hash is not None, "commit_hash must have been retrieved from server" + blob_path = os.path.join(storage_folder, "blobs", etag) + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + + os.makedirs(os.path.dirname(blob_path), exist_ok=True) + os.makedirs(os.path.dirname(pointer_path), exist_ok=True) + # if passed revision is not identical to commit_hash + # then revision has to be a branch name or tag name. + # In that case store a ref. 
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) + + if os.path.exists(pointer_path) and not force_download: + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if os.path.exists(blob_path) and not force_download: + # we have the blob already, but not the pointer + if local_dir is not None: # to local dir + return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + else: # or in snapshot cache + _create_symlink(blob_path, pointer_path, new_blob=False) + return pointer_path + + # Prevent parallel downloads of the same file with a lock. + # etag could be duplicated across repos, + lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock") + + # Some Windows versions do not allow for paths longer than 255 characters. + # In this case, we must specify it is an extended path by using the "\\?\" prefix. + if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: + lock_path = "\\\\?\\" + os.path.abspath(lock_path) + + if os.name == "nt" and len(os.path.abspath(blob_path)) > 255: + blob_path = "\\\\?\\" + os.path.abspath(blob_path) + + Path(lock_path).parent.mkdir(parents=True, exist_ok=True) + with FileLock(lock_path): + # If the download just completed while the lock was activated. + if os.path.exists(pointer_path) and not force_download: + # Even if returning early like here, the lock will be released. + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if resume_download: + incomplete_path = blob_path + ".incomplete" + + @contextmanager + def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: + with open(incomplete_path, "ab") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial( # type: ignore + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info("downloading %s to %s", url, temp_file.name) + + if expected_size is not None: # might be None if HTTP header not set correctly + # Check tmp path + _check_disk_space(expected_size, os.path.dirname(temp_file.name)) + + # Check destination + _check_disk_space(expected_size, os.path.dirname(blob_path)) + if local_dir is not None: + _check_disk_space(expected_size, local_dir) + + http_get( + url_to_download, + temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + expected_size=expected_size, + ) + if local_dir is None: + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + _create_symlink(blob_path, pointer_path, new_blob=True) + else: + local_dir_filepath = os.path.join(local_dir, relative_filename) + os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True) + + # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk + # In both cases, blob file is cached. 
+ is_big_file = os.stat(temp_file.name).st_size > DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD + if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file): + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + logger.debug("Create symlink to local dir") + _create_symlink(blob_path, local_dir_filepath, new_blob=False) + elif local_dir_use_symlinks == "auto" and not is_big_file: + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')") + shutil.copyfile(blob_path, local_dir_filepath) + else: + logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).") + _chmod_and_replace(temp_file.name, local_dir_filepath) + pointer_path = local_dir_filepath # for return value + + return pointer_path + + +def aistudio_hub_file_exists( + repo_id: str, + filename: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Optional[str] = None, + endpoint: Optional[str] = None, +) -> bool: + """ + Checks if a file exists in a repository on the Aistudio Hub. + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + filename (`str`): + The name of the file to check, for example: + `"config.json"` + repo_type (`str`, *optional*): + Set to `"dataset"` or `"space"` if getting repository info from a dataset or a space, + `None` or `"model"` if getting repository info from a model. Default is `None`. + revision (`str`, *optional*): + The revision of the repository from which to get the information. Defaults to `"main"` branch. + token (`bool` or `str`, *optional*): + A valid authentication token (see https://huggingface.co/settings/token). + If `None` or `True` and machine is logged in (through `huggingface-cli login` + or [`~login`]), token will be retrieved from the cache. + If `False`, token is not sent in the request header. + + Returns: + True if the file exists, False otherwise. + + + + Examples: + ```py + >>> from huggingface_hub import file_exists + >>> file_exists("bigcode/starcoder", "config.json") + True + >>> file_exists("bigcode/starcoder", "not-a-file") + False + >>> file_exists("bigcode/not-a-repo", "config.json") + False + ``` + + + """ + url = aistudio_hub_url( + repo_id=repo_id, repo_type=repo_type, revision=revision, filename=filename, endpoint=endpoint + ) + try: + if token is None: + token = get_token() + get_aistudio_file_metadata(url, token=token) + return True + except GatedRepoError: # raise specifically on gated repo + raise + except (RepositoryNotFoundError, EntryNotFoundError, RevisionNotFoundError, HfHubHTTPError): + return False + + +def aistudio_hub_try_to_load_from_cache( + repo_id: str, + filename: str, + cache_dir: Union[str, Path, None] = None, + revision: Optional[str] = None, + repo_type: Optional[str] = None, +): + if revision is None: + revision = DEFAULT_REVISION + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type: {repo_type}. 
Accepted repo types are: {str(REPO_TYPES)}") + if cache_dir is None: + cache_dir = AISTUDIO_HUB_CACHE + + object_id = repo_id.replace("/", "--") + repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}") + if not os.path.isdir(repo_cache): + # No cache for this model + return None + + refs_dir = os.path.join(repo_cache, "refs") + snapshots_dir = os.path.join(repo_cache, "snapshots") + no_exist_dir = os.path.join(repo_cache, ".no_exist") + + # Resolve refs (for instance to convert main to the associated commit sha) + if os.path.isdir(refs_dir): + revision_file = os.path.join(refs_dir, revision) + if os.path.isfile(revision_file): + with open(revision_file) as f: + revision = f.read() + + # Check if file is cached as "no_exist" + if os.path.isfile(os.path.join(no_exist_dir, revision, filename)): + return _CACHED_NO_EXIST + + # Check if revision folder exists + if not os.path.exists(snapshots_dir): + return None + cached_shas = os.listdir(snapshots_dir) + if revision not in cached_shas: + # No cache for this revision and we won't try to return a random revision + return None + + # Check if file exists in cache + cached_file = os.path.join(snapshots_dir, revision, filename) + return cached_file if os.path.isfile(cached_file) else None diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/bos_download.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/bos_download.py new file mode 100644 index 0000000000000000000000000000000000000000..372784b9a0888898962f4a136e7efd74ef69cd40 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/bos_download.py @@ -0,0 +1,637 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import logging +import os +import re +import shutil +import tempfile +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Dict, Generator, Literal, Optional, Union +from urllib.parse import quote + +import requests +from filelock import FileLock +from huggingface_hub.utils import ( + EntryNotFoundError, + FileMetadataError, + GatedRepoError, + HfHubHTTPError, + LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) + +logger = logging.getLogger(__name__) + +from .common import ( + _CACHED_NO_EXIST, + DEFAULT_ETAG_TIMEOUT, + DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD, + DEFAULT_REQUEST_TIMEOUT, + REPO_ID_SEPARATOR, + AistudioBosFileMetadata, + OfflineModeIsEnabled, + _as_int, + _cache_commit_hash_for_specific_revision, + _check_disk_space, + _chmod_and_replace, + _create_symlink, + _get_pointer_path, + _normalize_etag, + _request_wrapper, + _to_local_dir, + http_get, + raise_for_status, +) + + +def repo_folder_name(*, repo_id: str, repo_type: str) -> str: + """Return a serialized version of a aistudio repo name and type, safe for disk storage + as a single non-nested folder. 
+ + Example: models--julien-c--EsperBERTo-small + """ + # remove all `/` occurrences to correctly convert repo to directory name + parts = [f"{repo_type}", *repo_id.split("/")] + return REPO_ID_SEPARATOR.join(parts) + + +ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp") +ENDPOINT_v2 = "https://paddlenlp.bj.bcebos.com" + +BOS_URL_TEMPLATE = ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}" +BOS_URL_TEMPLATE_WITHOUT_REVISION = ENDPOINT + "/{repo_type}/community/{repo_id}/{filename}" + + +default_home = os.path.join(os.path.expanduser("~"), ".cache") +BOS_HOME = os.path.expanduser( + os.getenv( + "BOS_HOME", + os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "paddle"), + ) +) +default_cache_path = os.path.join(BOS_HOME, "bos") +BOS_CACHE = os.getenv("BOS_CACHE", default_cache_path) + + +DEFAULT_REVISION = "main" +REPO_TYPE_MODEL = "models" +REPO_TYPES = [None, REPO_TYPE_MODEL] + + +REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") + + +def get_bos_file_metadata( + url: str, + token: Union[bool, str, None] = None, + proxies: Optional[Dict] = None, + timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, +): + """Fetch metadata of a file versioned on the Hub for a given url. + + Args: + url (`str`): + File url, for example returned by [`bos_url`]. + token (`str` or `bool`, *optional*): + A token to be used for the download. + - If `True`, the token is read from the BOS config + folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + proxies (`dict`, *optional*): + Dictionary mapping protocol to the URL of the proxy passed to + `requests.request`. + timeout (`float`, *optional*, defaults to 10): + How many seconds to wait for the server to send metadata before giving up. + library_name (`str`, *optional*): + The name of the library to which the object corresponds. + library_version (`str`, *optional*): + The version of the library. + user_agent (`dict`, `str`, *optional*): + The user-agent info in the form of a dictionary or a string. + + Returns: + A [`AistudioBosFileMetadata`] object containing metadata such as location, etag, size and + commit_hash. 
+ """ + headers = {} + headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file + + # Retrieve metadata + r = _request_wrapper( + method="HEAD", + url=url, + headers=headers, + allow_redirects=False, + follow_relative_redirects=True, + proxies=proxies, + timeout=timeout, + ) + raise_for_status(r) + + # Return + return AistudioBosFileMetadata( + commit_hash=None, + etag=_normalize_etag(r.headers.get("ETag")), + location=url, + size=_as_int(r.headers.get("Content-Length")), + ) + + +def bos_url( + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + endpoint: Optional[str] = None, +) -> str: + if subfolder == "": + subfolder = None + if subfolder is not None: + filename = f"{subfolder}/{filename}" + + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError("Invalid repo type") + if revision is None: + revision = DEFAULT_REVISION + + if revision == DEFAULT_REVISION: + url = BOS_URL_TEMPLATE_WITHOUT_REVISION.format( + repo_type=repo_type, + repo_id=repo_id, + filename=filename, + ) + else: + url = BOS_URL_TEMPLATE.format( + repo_type=repo_type, + repo_id=repo_id, + revision=quote(revision, safe=""), + filename=filename, + ) + # Update endpoint if provided + if endpoint is not None and url.startswith(ENDPOINT): + url = endpoint + url[len(ENDPOINT) :] + return url + + +def bos_download( + repo_id: str = None, + filename: str = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + # TODO + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + etag_timeout: float = DEFAULT_ETAG_TIMEOUT, + resume_download: bool = False, + token: Optional[str] = None, + local_files_only: bool = False, + endpoint: Optional[str] = None, + url: Optional[str] = None, + **kwargs, +): + if url is not None: + assert url.startswith(ENDPOINT) or url.startswith( + ENDPOINT_v2 + ), f"URL must start with {ENDPOINT} or {ENDPOINT_v2}" + if repo_id is None: + if url.startswith(ENDPOINT): + repo_id = "/".join(url[len(ENDPOINT) + 1 :].split("/")[:-1]) + else: + repo_id = "/".join(url[len(ENDPOINT_v2) + 1 :].split("/")[:-1]) + if filename is None: + filename = url.split("/")[-1] + subfolder = None + + if cache_dir is None: + cache_dir = BOS_CACHE + if revision is None: + revision = DEFAULT_REVISION + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + if isinstance(local_dir, Path): + local_dir = str(local_dir) + locks_dir = os.path.join(cache_dir, ".locks") + + if subfolder == "": + subfolder = None + if subfolder is not None: + # This is used to create a URL, and not a local path, hence the forward slash. + filename = f"{subfolder}/{filename}" + + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}") + + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) + os.makedirs(storage_folder, exist_ok=True) + + # cross platform transcription of filename, to be used as a local file path. 
+ relative_filename = os.path.join(*filename.split("/")) + if os.name == "nt": + if relative_filename.startswith("..\\") or "\\..\\" in relative_filename: + raise ValueError( + f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository" + " owner to rename this file." + ) + + # if user provides a commit_hash and they already have the file on disk, + # shortcut everything. + # TODO, 当前不支持commit id下载,因此这个肯定跑的。 + if not force_download: # REGEX_COMMIT_HASH.match(revision) + pointer_path = _get_pointer_path(storage_folder, revision, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if url is None: + url = bos_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint) + headers = None + url_to_download = url + + etag = None + commit_hash = None + expected_size = None + head_call_error: Optional[Exception] = None + if not local_files_only: + try: + try: + metadata = get_bos_file_metadata( + url=url, + token=token, + proxies=proxies, + timeout=etag_timeout, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + ) + except EntryNotFoundError as http_error: # noqa: F841 + raise + # Commit hash must exist + # TODO,这里修改了commit hash,强迫为revision了。 + commit_hash = revision # metadata.commit_hash + if commit_hash is None: + raise FileMetadataError( + "Distant resource does not seem to be on aistudio hub. It is possible that a configuration issue" + " prevents you from downloading resources from aistudio hub. Please check your firewall" + " and proxy settings and make sure your SSL certificates are updated." + ) + + # Etag must exist + etag = metadata.etag + # We favor a custom header indicating the etag of the linked resource, and + # we fallback to the regular etag header. + # If we don't have any of those, raise an error. + if etag is None: + raise FileMetadataError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + + # Expected (uncompressed) size + expected_size = metadata.size + + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError + raise + except ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + OfflineModeIsEnabled, + ) as error: + # Otherwise, our Internet connection is down. + # etag is None + head_call_error = error + pass + except (RevisionNotFoundError, EntryNotFoundError): + # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) + raise + except requests.HTTPError as error: + # Multiple reasons for an http error: + # - Repository is private and invalid/missing token sent + # - Repository is gated and invalid/missing token sent + # - Hub is down (error 500 or 504) + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + head_call_error = error + pass + except FileMetadataError as error: + # Multiple reasons for a FileMetadataError: + # - Wrong network configuration (proxy, firewall, SSL certificates) + # - Inconsistency on the Hub + # => let's switch to 'local_files_only=True' to check if the files are already cached. 
+ # (if it's not the case, the error will be re-raised) + head_call_error = error + pass + + # etag can be None for several reasons: + # 1. we passed local_files_only. + # 2. we don't have a connection + # 3. Hub is down (HTTP 500 or 504) + # 4. repo is not found -for example private or gated- and invalid/missing token sent + # 5. Hub is blocked by a firewall or proxy is not set correctly. + # => Try to get the last downloaded one from the specified revision. + # + # If the specified revision is a commit hash, look inside "snapshots". + # If the specified revision is a branch or tag, look inside "refs". + if etag is None: + # In those cases, we cannot force download. + if force_download: + raise ValueError( + "We have no connection or you passed local_files_only, so force_download is not an accepted option." + ) + + # Try to get "commit_hash" from "revision" + commit_hash = None + if REGEX_COMMIT_HASH.match(revision): + commit_hash = revision + else: + ref_path = os.path.join(storage_folder, "refs", revision) + if os.path.isfile(ref_path): + with open(ref_path) as f: + commit_hash = f.read() + + # Return pointer file if exists + if commit_hash is not None: + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir( + pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks + ) + return pointer_path + + # If we couldn't find an appropriate file on disk, raise an error. + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise LocalEntryNotFoundError( + "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable" + " BOS look-ups and downloads online, set 'local_files_only' to False." + ) + elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError): + # Repo not found => let's raise the actual error + raise head_call_error + else: + # Otherwise: most likely a connection issue or Hub downtime => let's warn the user + raise LocalEntryNotFoundError( + "An error happened while trying to locate the file on the Hub and we cannot find the requested files" + " in the local cache. Please check your connection and try again or make sure your Internet connection" + " is on." + ) from head_call_error + + # From now on, etag and commit_hash are not None. + assert etag is not None, "etag must have been retrieved from server" + assert commit_hash is not None, "commit_hash must have been retrieved from server" + blob_path = os.path.join(storage_folder, "blobs", etag) + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + + os.makedirs(os.path.dirname(blob_path), exist_ok=True) + os.makedirs(os.path.dirname(pointer_path), exist_ok=True) + # if passed revision is not identical to commit_hash + # then revision has to be a branch name or tag name. + # In that case store a ref. 
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) + + if os.path.exists(pointer_path) and not force_download: + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if os.path.exists(blob_path) and not force_download: + # we have the blob already, but not the pointer + if local_dir is not None: # to local dir + return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + else: # or in snapshot cache + _create_symlink(blob_path, pointer_path, new_blob=False) + return pointer_path + + # Prevent parallel downloads of the same file with a lock. + # etag could be duplicated across repos, + lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock") + + # Some Windows versions do not allow for paths longer than 255 characters. + # In this case, we must specify it is an extended path by using the "\\?\" prefix. + if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: + lock_path = "\\\\?\\" + os.path.abspath(lock_path) + + if os.name == "nt" and len(os.path.abspath(blob_path)) > 255: + blob_path = "\\\\?\\" + os.path.abspath(blob_path) + + Path(lock_path).parent.mkdir(parents=True, exist_ok=True) + with FileLock(lock_path): + # If the download just completed while the lock was activated. + if os.path.exists(pointer_path) and not force_download: + # Even if returning early like here, the lock will be released. + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if resume_download: + incomplete_path = blob_path + ".incomplete" + + @contextmanager + def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: + with open(incomplete_path, "ab") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial( # type: ignore + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info("downloading %s to %s", url, temp_file.name) + + if expected_size is not None: # might be None if HTTP header not set correctly + # Check tmp path + _check_disk_space(expected_size, os.path.dirname(temp_file.name)) + + # Check destination + _check_disk_space(expected_size, os.path.dirname(blob_path)) + if local_dir is not None: + _check_disk_space(expected_size, local_dir) + + http_get( + url_to_download, + temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + expected_size=expected_size, + ) + if local_dir is None: + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + _create_symlink(blob_path, pointer_path, new_blob=True) + else: + local_dir_filepath = os.path.join(local_dir, relative_filename) + os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True) + + # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk + # In both cases, blob file is cached. 
+ is_big_file = os.stat(temp_file.name).st_size > DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD + if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file): + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + logger.debug("Create symlink to local dir") + _create_symlink(blob_path, local_dir_filepath, new_blob=False) + elif local_dir_use_symlinks == "auto" and not is_big_file: + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')") + shutil.copyfile(blob_path, local_dir_filepath) + else: + logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).") + _chmod_and_replace(temp_file.name, local_dir_filepath) + pointer_path = local_dir_filepath # for return value + + return pointer_path + + +def bos_file_exists( + repo_id: str, + filename: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Optional[str] = None, + endpoint: Optional[str] = None, +) -> bool: + """ + Checks if a file exists in a repository on the Aistudio Hub. + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + filename (`str`): + The name of the file to check, for example: + `"config.json"` + repo_type (`str`, *optional*): + Set to `"dataset"` or `"space"` if getting repository info from a dataset or a space, + `None` or `"model"` if getting repository info from a model. Default is `None`. + revision (`str`, *optional*): + The revision of the repository from which to get the information. Defaults to `"main"` branch. + token (`bool` or `str`, *optional*): + A valid authentication token (see https://huggingface.co/settings/token). + If `None` or `True` and machine is logged in (through `huggingface-cli login` + or [`~login`]), token will be retrieved from the cache. + If `False`, token is not sent in the request header. + + Returns: + True if the file exists, False otherwise. + + + + Examples: + ```py + >>> from huggingface_hub import file_exists + >>> file_exists("bigcode/starcoder", "config.json") + True + >>> file_exists("bigcode/starcoder", "not-a-file") + False + >>> file_exists("bigcode/not-a-repo", "config.json") + False + ``` + + + """ + url = bos_url(repo_id=repo_id, repo_type=repo_type, revision=revision, filename=filename, endpoint=endpoint) + try: + get_bos_file_metadata(url, token=token) + return True + except GatedRepoError: # raise specifically on gated repo + raise + except (RepositoryNotFoundError, EntryNotFoundError, RevisionNotFoundError, HfHubHTTPError): + return False + + +def bos_try_to_load_from_cache( + repo_id: str, + filename: str, + cache_dir: Union[str, Path, None] = None, + revision: Optional[str] = None, + repo_type: Optional[str] = None, +): + if revision is None: + revision = DEFAULT_REVISION + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type: {repo_type}. 
Accepted repo types are: {str(REPO_TYPES)}") + if cache_dir is None: + cache_dir = BOS_CACHE + + object_id = repo_id.replace("/", "--") + repo_cache = os.path.join(cache_dir, f"{repo_type}--{object_id}") + if not os.path.isdir(repo_cache): + # No cache for this model + return None + + refs_dir = os.path.join(repo_cache, "refs") + snapshots_dir = os.path.join(repo_cache, "snapshots") + no_exist_dir = os.path.join(repo_cache, ".no_exist") + + # Resolve refs (for instance to convert main to the associated commit sha) + if os.path.isdir(refs_dir): + revision_file = os.path.join(refs_dir, revision) + if os.path.isfile(revision_file): + with open(revision_file) as f: + revision = f.read() + + # Check if file is cached as "no_exist" + if os.path.isfile(os.path.join(no_exist_dir, revision, filename)): + return _CACHED_NO_EXIST + + # Check if revision folder exists + if not os.path.exists(snapshots_dir): + return None + cached_shas = os.listdir(snapshots_dir) + if revision not in cached_shas: + # No cache for this revision and we won't try to return a random revision + return None + + # Check if file exists in cache + cached_file = os.path.join(snapshots_dir, revision, filename) + return cached_file if os.path.isfile(cached_file) else None diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/common.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/common.py new file mode 100644 index 0000000000000000000000000000000000000000..faaddf5c5ed272a807fceca665a29ff216c5ae63 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/common.py @@ -0,0 +1,662 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
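Both download backends rely on the Hugging Face style cache layout maintained by the helpers in this module (`repo_folder_name`, `_cache_commit_hash_for_specific_revision`, and the `*_try_to_load_from_cache` functions above). The sketch below only illustrates how a cached file is resolved from that layout; the directory and file names are hypothetical examples, not code from this patch.

```py
# Illustration only: how the blobs/refs/snapshots layout resolves a cached file.
#
#   <cache_dir>/models--org--repo/
#       blobs/<etag>                    actual payloads, keyed by the normalized ETag
#       refs/<branch-or-tag>            text file holding the resolved revision
#       snapshots/<revision>/<file>     symlink or copy pointing into blobs/
import os
from typing import Optional


def resolve_cached_file(cache_dir: str, repo_folder: str, revision: str, filename: str) -> Optional[str]:
    repo_cache = os.path.join(cache_dir, repo_folder)  # e.g. ".../models--org--repo"
    ref_file = os.path.join(repo_cache, "refs", revision)
    if os.path.isfile(ref_file):  # branch or tag -> stored revision
        with open(ref_file) as f:
            revision = f.read()
    candidate = os.path.join(repo_cache, "snapshots", revision, filename)
    return candidate if os.path.isfile(candidate) else None
```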
+ +import copy +import logging +import os +import re +import shutil +import stat +import tempfile +import threading +import time +import uuid +import warnings +from contextlib import contextmanager +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import BinaryIO, Callable, Dict, Generator, Literal, Optional, Union +from urllib.parse import urlparse + +import requests +from huggingface_hub.utils import ( + BadRequestError, + EntryNotFoundError, + HfHubHTTPError, + tqdm, +) +from requests import HTTPError, Response +from requests.adapters import HTTPAdapter +from requests.models import PreparedRequest + +logger = logging.getLogger(__name__) + +ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} + + +def _is_true(value: Optional[str]) -> bool: + if value is None: + return False + return value.upper() in ENV_VARS_TRUE_VALUES + + +def _as_int(value: Optional[str]) -> Optional[int]: + if value is None: + return None + return int(value) + + +DISABLE_SYMLINKS_WARNING = False +# Regex to get filename from a "Content-Disposition" header for CDN-served files +HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P.*?)"') +DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024 +REPO_ID_SEPARATOR = "--" + +DEFAULT_DOWNLOAD_TIMEOUT = 10 +DEFAULT_REQUEST_TIMEOUT = 10 +DEFAULT_ETAG_TIMEOUT = 10 +DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = 5 * 1024 * 1024 + +OFFLINE = _is_true(os.environ.get("AISTUDIO_BOS_OFFLINE")) +_CACHED_NO_EXIST = object() + + +def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None: + """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash. + + Does nothing if `revision` is already a proper `commit_hash` or reference is already cached. + """ + # if revision != commit_hash: + ref_path = Path(storage_folder) / "refs" / revision + ref_path.parent.mkdir(parents=True, exist_ok=True) + if not ref_path.exists() or commit_hash != ref_path.read_text(): + # Update ref only if has been updated. Could cause useless error in case + # repo is already cached and user doesn't have write access to cache folder. + # See https://github.com/huggingface/huggingface_hub/issues/1216. + ref_path.write_text(commit_hash) + + +def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None: + """Check disk usage and log a warning if there is not enough disk space to download the file. + + Args: + expected_size (`int`): + The expected size of the file in bytes. + target_dir (`str`): + The directory where the file will be stored after downloading. + """ + + target_dir = Path(target_dir) # format as `Path` + for path in [target_dir] + list(target_dir.parents): # first check target_dir, then each parents one by one + try: + target_dir_free = shutil.disk_usage(path).free + if target_dir_free < expected_size: + warnings.warn( + "Not enough free disk space to download the file. " + f"The expected file size is: {expected_size / 1e6:.2f} MB. " + f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space." + ) + return + except OSError: # raise on anything: file does not exist or space disk cannot be checked + pass + + +def http_get( + url: str, + temp_file: BinaryIO, + *, + proxies=None, + resume_size: float = 0, + headers: Optional[Dict[str, str]] = None, + expected_size: Optional[int] = None, + _nb_retries: int = 5, +): + """ + Download a remote file. 
Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
+
+    If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
+    transient error (network outage?). We log a warning message and try to resume the download a few times before
+    giving up. The method gives up after 5 attempts if no new data has been received from the server.
+    """
+    initial_headers = headers
+    headers = copy.deepcopy(headers) or {}
+    if resume_size > 0:
+        headers["Range"] = "bytes=%d-" % (resume_size,)
+
+    r = _request_wrapper(
+        method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=DEFAULT_DOWNLOAD_TIMEOUT
+    )
+    raise_for_status(r)
+    content_length = r.headers.get("Content-Length")
+
+    # NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
+    # If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
+    total = resume_size + int(content_length) if content_length is not None else None
+
+    displayed_name = url
+    content_disposition = r.headers.get("Content-Disposition")
+    if content_disposition is not None:
+        match = HEADER_FILENAME_PATTERN.search(content_disposition)
+        if match is not None:
+            # Means file is on CDN
+            displayed_name = match.groupdict()["filename"]
+
+    # Truncate filename if too long to display
+    if len(displayed_name) > 40:
+        displayed_name = f"(…){displayed_name[-40:]}"
+
+    consistency_error_message = (
+        f"Consistency check failed: file should be of size {expected_size} but has size"
+        f" {{actual_size}} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
+        " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
+        " know by opening an issue on https://github.com/huggingface/huggingface_hub."
+    )
+
+    # Stream file to buffer
+    with tqdm(
+        unit="B",
+        unit_scale=True,
+        total=total,
+        initial=resume_size,
+        desc=displayed_name,
+        disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
+    ) as progress:
+        new_resume_size = resume_size
+        try:
+            for chunk in r.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
+                if chunk:  # filter out keep-alive new chunks
+                    progress.update(len(chunk))
+                    temp_file.write(chunk)
+                    new_resume_size += len(chunk)
+                    # Some data has been downloaded from the server so we reset the number of retries.
+                    _nb_retries = 5
+        except (requests.ConnectionError, requests.ReadTimeout) as e:
+            # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+            # a transient error (network outage?). We log a warning message and try to resume the download a few times
+            # before giving up. The retry mechanism is basic but should be enough in most cases.
+ if _nb_retries <= 0: + logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e)) + raise + logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e)) + time.sleep(1) + reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects + return http_get( + url=url, + temp_file=temp_file, + proxies=proxies, + resume_size=new_resume_size, + headers=initial_headers, + expected_size=expected_size, + _nb_retries=_nb_retries - 1, + ) + + if expected_size is not None and expected_size != temp_file.tell(): + raise EnvironmentError( + consistency_error_message.format( + actual_size=temp_file.tell(), + ) + ) + + +def _chmod_and_replace(src: str, dst: str) -> None: + """Set correct permission before moving a blob from tmp directory to cache dir. + + Do not take into account the `umask` from the process as there is no convenient way + to get it that is thread-safe. + + See: + - About umask: https://docs.python.org/3/library/os.html#os.umask + - Thread-safety: https://stackoverflow.com/a/70343066 + - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591 + - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141 + - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215 + """ + # Get umask by creating a temporary file in the cached repo folder. + tmp_file = Path(dst).parent.parent / f"tmp_{uuid.uuid4()}" + try: + tmp_file.touch() + cache_dir_mode = Path(tmp_file).stat().st_mode + os.chmod(src, stat.S_IMODE(cache_dir_mode)) + finally: + tmp_file.unlink() + + shutil.move(src, dst) + + +def repo_folder_name(*, repo_id: str, repo_type: str) -> str: + """Return a serialized version of a aistudio repo name and type, safe for disk storage + as a single non-nested folder. + + Example: models--julien-c--EsperBERTo-small + """ + # remove all `/` occurrences to correctly convert repo to directory name + parts = [f"{repo_type}s", *repo_id.split("/")] + return REPO_ID_SEPARATOR.join(parts) + + +class OfflineModeIsEnabled(ConnectionError): + """Raised when a request is made but `AISTUDIO_HUB_OFFLINE=1` is set as environment variable.""" + + +class OfflineAdapter(HTTPAdapter): + def send(self, request: PreparedRequest, *args, **kwargs) -> Response: + raise OfflineModeIsEnabled( + f"Cannot reach {request.url}: offline mode is enabled. To disable it, please unset the `AISTUDIO_HUB_OFFLINE` environment variable." + ) + + +BACKEND_FACTORY_T = Callable[[], requests.Session] + + +def _default_backend_factory() -> requests.Session: + session = requests.Session() + if OFFLINE: + session.mount("http://", OfflineAdapter()) + session.mount("https://", OfflineAdapter()) + + return session + + +_GLOBAL_BACKEND_FACTORY: BACKEND_FACTORY_T = _default_backend_factory +HTTP_METHOD_T = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] + + +@lru_cache +def _get_session_from_cache(process_id: int, thread_id: int) -> requests.Session: + """ + Create a new session per thread using global factory. Using LRU cache (maxsize 128) to avoid memory leaks when + using thousands of threads. Cache is cleared when `configure_http_backend` is called. + """ + return _GLOBAL_BACKEND_FACTORY() + + +def reset_sessions() -> None: + """Reset the cache of sessions. + + Mostly used internally when sessions are reconfigured or an SSLError is raised. + See [`configure_http_backend`] for more details. 
+ """ + _get_session_from_cache.cache_clear() + + +def get_session() -> requests.Session: + """ + Get a `requests.Session` object, using the session factory from the user. + + Use [`get_session`] to get a configured Session. Since `requests.Session` is not guaranteed to be thread-safe, + `huggingface_hub` creates 1 Session instance per thread. They are all instantiated using the same `backend_factory` + set in [`configure_http_backend`]. A LRU cache is used to cache the created sessions (and connections) between + calls. Max size is 128 to avoid memory leaks if thousands of threads are spawned. + + See [this issue](https://github.com/psf/requests/issues/2766) to know more about thread-safety in `requests`. + + Example: + ```py + import requests + from huggingface_hub import configure_http_backend, get_session + + # Create a factory function that returns a Session with configured proxies + def backend_factory() -> requests.Session: + session = requests.Session() + session.proxies = {"http": "http://10.10.1.10:3128", "https": "https://10.10.1.11:1080"} + return session + + # Set it as the default session factory + configure_http_backend(backend_factory=backend_factory) + + # In practice, this is mostly done internally in `huggingface_hub` + session = get_session() + ``` + """ + return _get_session_from_cache(process_id=os.getpid(), thread_id=threading.get_ident()) + + +def _request_wrapper( + method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params +) -> requests.Response: + """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when + `allow_redirection=False`. + + Args: + method (`str`): + HTTP method, such as 'GET' or 'HEAD'. + url (`str`): + The URL of the resource to fetch. + follow_relative_redirects (`bool`, *optional*, defaults to `False`) + If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection` + kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without + following redirection to a CDN. + **params (`dict`, *optional*): + Params to pass to `requests.request`. + """ + # Recursively follow relative redirects + if follow_relative_redirects: + response = _request_wrapper( + method=method, + url=url, + follow_relative_redirects=False, + **params, + ) + + # If redirection, we redirect only relative paths. + # This is useful in case of a renamed repository. + if 300 <= response.status_code <= 399: + parsed_target = urlparse(response.headers["Location"]) + if parsed_target.netloc == "": + # This means it is a relative 'location' headers, as allowed by RFC 7231. + # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource') + # We want to follow this relative redirect ! + # + # Highly inspired by `resolve_redirects` from requests library. + # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159 + next_url = urlparse(url)._replace(path=parsed_target.path).geturl() + return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params) + return response + # Perform request and return if status_code is not in the retry list. 
+ response = get_session().request(method=method, url=url, **params) + raise_for_status(response) + return response + + +def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str: + # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks + snapshot_path = os.path.join(storage_folder, "snapshots") + pointer_path = os.path.join(snapshot_path, revision, relative_filename) + if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents: + raise ValueError( + "Invalid pointer path: cannot create pointer path in snapshot folder if" + f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and" + f" `relative_filename='{relative_filename}'`." + ) + return pointer_path + + +def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None: + """Create a symbolic link named dst pointing to src. + + By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages: + - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will + not brake. + - Relative paths seems to be better handled on Windows. Issue was reported 3 times in less than a week when + changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398, + https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228. + NOTE: The issue with absolute paths doesn't happen on admin mode. + When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created. + This happens when paths are not on the same volume. In that case, we use absolute paths. + + + The result layout looks something like + └── [ 128] snapshots + ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f + │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812 + │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd + + If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by + having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file + (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing + cache, the file is duplicated on the disk. + + In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`. + The warning message can be disable with the `DISABLE_SYMLINKS_WARNING` environment variable. + """ + try: + os.remove(dst) + except OSError: + pass + + abs_src = os.path.abspath(os.path.expanduser(src)) + abs_dst = os.path.abspath(os.path.expanduser(dst)) + abs_dst_folder = os.path.dirname(abs_dst) + + # Use relative_dst in priority + try: + relative_src = os.path.relpath(abs_src, abs_dst_folder) + except ValueError: + # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a + # local_dir instead of within the cache directory. + # See https://docs.python.org/3/library/os.path.html#os.path.relpath + relative_src = None + + try: + commonpath = os.path.commonpath([abs_src, abs_dst]) + _support_symlinks = are_symlinks_supported(commonpath) + except ValueError: + # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/Macos. 
+ # See https://docs.python.org/3/library/os.path.html#os.path.commonpath + _support_symlinks = os.name != "nt" + except PermissionError: + # Permission error means src and dst are not in the same volume (e.g. destination path has been provided + # by the user via `local_dir`. Let's test symlink support there) + _support_symlinks = are_symlinks_supported(abs_dst_folder) + + # Symlinks are supported => let's create a symlink. + if _support_symlinks: + src_rel_or_abs = relative_src or abs_src + logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}") + try: + os.symlink(src_rel_or_abs, abs_dst) + return + except FileExistsError: + if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src): + # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has + # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing. + return + else: + # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and + # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception. + raise + except PermissionError: + # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink + # is supported on both volumes but not between them. Let's just make a hard copy in that case. + pass + + # Symlinks are not supported => let's move or copy the file. + if new_blob: + logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}") + shutil.move(abs_src, abs_dst) + else: + logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}") + shutil.copyfile(abs_src, abs_dst) + + +_are_symlinks_supported_in_dir: Dict[str, bool] = {} + + +def _set_write_permission_and_retry(func, path, excinfo): + os.chmod(path, stat.S_IWRITE) + func(path) + + +@contextmanager +def SoftTemporaryDirectory( + suffix: Optional[str] = None, + prefix: Optional[str] = None, + dir: Optional[Union[Path, str]] = None, + **kwargs, +) -> Generator[str, None, None]: + """ + Context manager to create a temporary directory and safely delete it. + + If tmp directory cannot be deleted normally, we set the WRITE permission and retry. + If cleanup still fails, we give up but don't raise an exception. This is equivalent + to `tempfile.TemporaryDirectory(..., ignore_cleanup_errors=True)` introduced in + Python 3.10. + + See https://www.scivision.dev/python-tempfile-permission-error-windows/. + """ + tmpdir = tempfile.TemporaryDirectory(prefix=prefix, suffix=suffix, dir=dir, **kwargs) + yield tmpdir.name + + try: + # First once with normal cleanup + shutil.rmtree(tmpdir.name) + except Exception: + # If failed, try to set write permission and retry + try: + shutil.rmtree(tmpdir.name, onerror=_set_write_permission_and_retry) + except Exception: + pass + + # And finally, cleanup the tmpdir. + # If it fails again, give up but do not throw error + try: + tmpdir.cleanup() + except Exception: + pass + + +def _to_local_dir( + path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]] +) -> str: + """Place a file in a local dir (different than cache_dir). + + Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size. 
+    """
+    # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
+    local_dir_filepath = os.path.join(local_dir, relative_filename)
+    if Path(os.path.abspath(local_dir)) not in Path(os.path.abspath(local_dir_filepath)).parents:
+        raise ValueError(
+            f"Cannot copy file '{relative_filename}' to local dir '{local_dir}': file would not be in the local"
+            " directory."
+        )
+
+    os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
+    real_blob_path = os.path.realpath(path)
+
+    # If "auto" (default), copy-paste small files to ease manual editing but symlink big files to save disk space
+    if use_symlinks == "auto":
+        use_symlinks = os.stat(real_blob_path).st_size > DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
+
+    if use_symlinks:
+        _create_symlink(real_blob_path, local_dir_filepath, new_blob=False)
+    else:
+        shutil.copyfile(real_blob_path, local_dir_filepath)
+    return local_dir_filepath
+
+
+def _normalize_etag(etag: Optional[str]) -> Optional[str]:
+    """Normalize an ETag HTTP header, so it can be used to create nice filepaths.
+
+    The HTTP spec allows two forms of ETag:
+      ETag: W/"<etag_value>"
+      ETag: "<etag_value>"
+
+    For now, we only expect the second form from the server, but we want to be future-proof so we support both. For
+    more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428.
+
+    Args:
+        etag (`str`, *optional*): HTTP header
+
+    Returns:
+        `str` or `None`: string that can be used as a nice directory name.
+        Returns `None` if input is None.
+    """
+    if etag is None:
+        return None
+    return etag.lstrip("W/").strip('"')
+
+
+@dataclass(frozen=True)
+class AistudioBosFileMetadata:
+    """Data structure containing information about a file versioned on the Aistudio Hub.
+
+    Returned by [`get_aistudio_file_metadata`] based on a URL.
+
+    Args:
+        commit_hash (`str`, *optional*):
+            The commit_hash related to the file.
+        etag (`str`, *optional*):
+            Etag of the file on the server.
+        location (`str`):
+            Location from which to download the file. Can be a Hub URL or a CDN URL.
+        size (`int`, *optional*):
+            Size of the file. In case of an LFS file, contains the size of the actual
+            LFS file, not the pointer.
+    """
+
+    commit_hash: Optional[str]
+    etag: Optional[str]
+    location: str
+    size: Optional[int]
+
+
+def raise_for_status(response: Response, endpoint_name: Optional[str] = None) -> None:
+    try:
+        response.raise_for_status()
+    except HTTPError as e:
+        if response.status_code == 404:
+            message = f"{response.status_code} Client Error." + "\n\n" + f"Entry Not Found for url: {response.url}."
+            raise EntryNotFoundError(message, None) from e
+        elif response.status_code == 400:
+            message = (
+                f"\n\nBad request for {endpoint_name} endpoint:" if endpoint_name is not None else "\n\nBad request:"
+            )
+            raise BadRequestError(message, response=None) from e
+        raise HfHubHTTPError(str(e), response=None) from e
+
+
+def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
+    """Return whether symlinks are supported on the machine.
+
+    Since symlink support can change depending on the mounted disk, we need to check
+    on the precise cache folder.
+
+    Args:
+        cache_dir (`str`, `Path`, *optional*):
+            Path to the folder where cached files are stored.
+
+    Returns: [bool] Whether symlinks are supported in the directory.
+ """ + assert cache_dir is not None + cache_dir = str(Path(cache_dir).expanduser().resolve()) # make it unique + + # Check symlink compatibility only once (per cache directory) at first time use + if cache_dir not in _are_symlinks_supported_in_dir: + _are_symlinks_supported_in_dir[cache_dir] = True + + os.makedirs(cache_dir, exist_ok=True) + with SoftTemporaryDirectory(dir=cache_dir) as tmpdir: + src_path = Path(tmpdir) / "dummy_file_src" + src_path.touch() + dst_path = Path(tmpdir) / "dummy_file_dst" + + # Relative source path as in `_create_symlink`` + relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path)) + try: + os.symlink(relative_src, dst_path) + except OSError: + # Likely running on Windows + _are_symlinks_supported_in_dir[cache_dir] = False + + if not DISABLE_SYMLINKS_WARNING: + message = ( + "cache-system uses symlinks by default to" + " efficiently store duplicated files but your machine does not" + f" support them in {cache_dir}. Caching files will still work" + " but in a degraded version that might require more space on" + " your disk. This warning can be disabled by setting the" + " `DISABLE_SYMLINKS_WARNING` environment variable." + ) + if os.name == "nt": + message += ( + "\nTo support symlinks on Windows, you either need to" + " activate Developer Mode or to run Python as an" + " administrator. In order to see activate developer mode," + " see this article:" + " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development" + ) + warnings.warn(message) + + return _are_symlinks_supported_in_dir[cache_dir] diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/hf_hub_download.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/hf_hub_download.py new file mode 100644 index 0000000000000000000000000000000000000000..bd030852567dd028f4703005cb837100747da80d --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/hf_hub_download.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
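The new `hf_hub_download.py` module below is a thin re-export stub around `huggingface_hub`. As a hedged illustration only (the repo id and filename are placeholders, and the cache-then-download pattern is an assumption about typical downstream usage, not something this diff prescribes), the re-exported aliases are usually combined like this:

```python
# Illustrative sketch only -- repo_id/filename are hypothetical placeholders.
from huggingface_hub import file_exists as hf_hub_file_exists
from huggingface_hub import hf_hub_download
from huggingface_hub import try_to_load_from_cache as hf_hub_try_to_load_from_cache

repo_id, filename = "org/some-diffusion-model", "model_index.json"

if hf_hub_file_exists(repo_id, filename):
    # Prefer a cache hit; `try_to_load_from_cache` returns a path string on success,
    # or None / a sentinel object when the file is not (or is known not to be) cached.
    cached = hf_hub_try_to_load_from_cache(repo_id, filename)
    local_path = cached if isinstance(cached, str) else hf_hub_download(repo_id=repo_id, filename=filename)
```

Presumably, funneling these imports through one module makes it easier to swap the backing hub implementation later without touching call sites.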
+ +from huggingface_hub import file_exists as hf_hub_file_exists # noqa: F401 +from huggingface_hub import hf_hub_download # noqa: F401 +from huggingface_hub import ( # noqa: F401 + try_to_load_from_cache as hf_hub_try_to_load_from_cache, +) diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/requirements.txt b/VLMEvalKit_old/PaddleMIX/ppdiffusers/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b7fa12a3c248df7f991de3e61eb3fdc2066ac75 --- /dev/null +++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/requirements.txt @@ -0,0 +1,18 @@ +paddlenlp>=3.0.0b2 +safetensors>=0.3.1 +ftfy +regex +Pillow +opencv-python +av +# for test +parameterized +requests_mock +omegaconf +note_seq +urllib3<=2.0.0 +einops>=0.6.1 +paddlesde +ligo-segments +huggingface_hub==0.23.0 +hf_transfer diff --git a/VLMEvalKit_old/docs/en/_static/image/logo.svg b/VLMEvalKit_old/docs/en/_static/image/logo.svg new file mode 100644 index 0000000000000000000000000000000000000000..043530572afb48d0eac26b4b53d448aae6e9a9af --- /dev/null +++ b/VLMEvalKit_old/docs/en/_static/image/logo.svg @@ -0,0 +1,24 @@ + + + +Created with Fabric.js 5.3.0 + + + + + + + + + + + + + VLMEvalKit + diff --git a/lightning-hydra-template/src/models/components/__init__.py b/lightning-hydra-template/src/models/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lightning-hydra-template/tests/helpers/__init__.py b/lightning-hydra-template/tests/helpers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lightning-hydra-template/tests/helpers/run_if.py b/lightning-hydra-template/tests/helpers/run_if.py new file mode 100644 index 0000000000000000000000000000000000000000..9703af425129d0225d0aeed20dedc3ed35bc7548 --- /dev/null +++ b/lightning-hydra-template/tests/helpers/run_if.py @@ -0,0 +1,142 @@ +"""Adapted from: + +https://github.com/PyTorchLightning/pytorch-lightning/blob/master/tests/helpers/runif.py +""" + +import sys +from typing import Any, Dict, Optional + +import pytest +import torch +from packaging.version import Version +from pkg_resources import get_distribution +from pytest import MarkDecorator + +from tests.helpers.package_available import ( + _COMET_AVAILABLE, + _DEEPSPEED_AVAILABLE, + _FAIRSCALE_AVAILABLE, + _IS_WINDOWS, + _MLFLOW_AVAILABLE, + _NEPTUNE_AVAILABLE, + _SH_AVAILABLE, + _TPU_AVAILABLE, + _WANDB_AVAILABLE, +) + + +class RunIf: + """RunIf wrapper for conditional skipping of tests. + + Fully compatible with `@pytest.mark`. + + Example: + + ```python + @RunIf(min_torch="1.8") + @pytest.mark.parametrize("arg1", [1.0, 2.0]) + def test_wrapper(arg1): + assert arg1 > 0 + ``` + """ + + def __new__( + cls, + min_gpus: int = 0, + min_torch: Optional[str] = None, + max_torch: Optional[str] = None, + min_python: Optional[str] = None, + skip_windows: bool = False, + sh: bool = False, + tpu: bool = False, + fairscale: bool = False, + deepspeed: bool = False, + wandb: bool = False, + neptune: bool = False, + comet: bool = False, + mlflow: bool = False, + **kwargs: Dict[Any, Any], + ) -> MarkDecorator: + """Creates a new `@RunIf` `MarkDecorator` decorator. + + :param min_gpus: Min number of GPUs required to run test. + :param min_torch: Minimum pytorch version to run test. + :param max_torch: Maximum pytorch version to run test. + :param min_python: Minimum python version required to run test. + :param skip_windows: Skip test for Windows platform. 
+ :param tpu: If TPU is available. + :param sh: If `sh` module is required to run the test. + :param fairscale: If `fairscale` module is required to run the test. + :param deepspeed: If `deepspeed` module is required to run the test. + :param wandb: If `wandb` module is required to run the test. + :param neptune: If `neptune` module is required to run the test. + :param comet: If `comet` module is required to run the test. + :param mlflow: If `mlflow` module is required to run the test. + :param kwargs: Native `pytest.mark.skipif` keyword arguments. + """ + conditions = [] + reasons = [] + + if min_gpus: + conditions.append(torch.cuda.device_count() < min_gpus) + reasons.append(f"GPUs>={min_gpus}") + + if min_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) < Version(min_torch)) + reasons.append(f"torch>={min_torch}") + + if max_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) >= Version(max_torch)) + reasons.append(f"torch<{max_torch}") + + if min_python: + py_version = ( + f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ) + conditions.append(Version(py_version) < Version(min_python)) + reasons.append(f"python>={min_python}") + + if skip_windows: + conditions.append(_IS_WINDOWS) + reasons.append("does not run on Windows") + + if tpu: + conditions.append(not _TPU_AVAILABLE) + reasons.append("TPU") + + if sh: + conditions.append(not _SH_AVAILABLE) + reasons.append("sh") + + if fairscale: + conditions.append(not _FAIRSCALE_AVAILABLE) + reasons.append("fairscale") + + if deepspeed: + conditions.append(not _DEEPSPEED_AVAILABLE) + reasons.append("deepspeed") + + if wandb: + conditions.append(not _WANDB_AVAILABLE) + reasons.append("wandb") + + if neptune: + conditions.append(not _NEPTUNE_AVAILABLE) + reasons.append("neptune") + + if comet: + conditions.append(not _COMET_AVAILABLE) + reasons.append("comet") + + if mlflow: + conditions.append(not _MLFLOW_AVAILABLE) + reasons.append("mlflow") + + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] + return pytest.mark.skipif( + condition=any(conditions), + reason=f"Requires: [{' + '.join(reasons)}]", + **kwargs, + ) diff --git a/lightning-hydra-template/tests/helpers/run_sh_command.py b/lightning-hydra-template/tests/helpers/run_sh_command.py new file mode 100644 index 0000000000000000000000000000000000000000..fdd2ed633f1185dd7936924616be6a6359a7bca7 --- /dev/null +++ b/lightning-hydra-template/tests/helpers/run_sh_command.py @@ -0,0 +1,22 @@ +from typing import List + +import pytest + +from tests.helpers.package_available import _SH_AVAILABLE + +if _SH_AVAILABLE: + import sh + + +def run_sh_command(command: List[str]) -> None: + """Default method for executing shell commands with `pytest` and `sh` package. + + :param command: A list of shell commands as strings. + """ + msg = None + try: + sh.python(command) + except sh.ErrorReturnCode as e: + msg = e.stderr.decode() + if msg: + pytest.fail(msg=msg)
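As a usage sketch (not part of the diff): the two helpers above are meant to be combined in tests, with `RunIf` gating execution on available packages and `run_sh_command` driving a script through the `sh` package. The entry-point path `src/train.py` and the Hydra-style overrides below are assumptions about the template's layout, not taken from this diff.

```python
# Hypothetical test combining RunIf and run_sh_command; the script path and
# overrides are assumed, not prescribed by the diff above.
import pytest

from tests.helpers.run_if import RunIf
from tests.helpers.run_sh_command import run_sh_command


@RunIf(sh=True)  # skip when the `sh` package is unavailable (e.g. on Windows)
@pytest.mark.slow
def test_train_fast_dev_run_via_sh():
    """Smoke-test the training script through the shell helper with a quick dry run."""
    command = ["src/train.py", "trainer.fast_dev_run=true", "logger=[]"]
    run_sh_command(command)
```

Because `run_sh_command` executes the script via `sh.python` in a separate interpreter process, failures surface through `pytest.fail` with the captured stderr rather than as an in-process exception.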