tuandunghcmut
/

vlm_clone_2

Model card Files Files and versions Community

tuandunghcmut commited on Apr 10

Commit

727399d

verified ·

1 Parent(s): 8ec1a3c

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
DeepSeek-VL2/vg.jpg +0 -0
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/README.md +110 -0
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/requirement.txt +3 -0
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/run_train.sh +78 -0
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/train_GOT.py +243 -0
VLMEvalKit_old/PaddleMIX/paddlemix/examples/ppdocbee/app.py +350 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/GOT/utils/conversation.py +400 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py +13 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py +103 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py +28 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py +62 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/unet/attention.py +199 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_augmentation.py +46 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_binarizer.py +330 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_exporter.py +72 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_svs_infer.py +149 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_vocoder.py +37 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/aux_decoder/convnext.py +103 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/__init__.py +26 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/lynxnet.py +188 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/wavenet.py +120 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/common_layers.py +187 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/espnet_positional_embedding.py +129 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/compat.py +35 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/__init__.py +16 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/ddpm.py +521 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/reflow.py +311 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/acoustic_encoder.py +110 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/param_adaptor.py +88 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/tts_modules.py +473 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/variance_encoder.py +151 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/__init__.py +42 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/layers.py +140 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/nets.py +185 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/env.py +46 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/models.py +380 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/nvSTFT.py +104 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/utils.py +27 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/pm.py +30 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/__init__.py +19 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/constants.py +21 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/deepunet.py +194 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/inference.py +80 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/model.py +54 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/seq.py +30 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/spec.py +65 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/utils.py +54 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/toplevel.py +323 -0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/utils/__init__.py +342 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+DeepSeek-VL2/vg.jpg filter=lfs diff=lfs merge=lfs -text

DeepSeek-VL2/vg.jpg ADDED Viewed

VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/README.md ADDED Viewed

	@@ -0,0 +1,110 @@

+# GOT-OCR2.0
+## 1. 模型介绍
+[GOT-OCR2.0](https://arxiv.org/abs/2409.01704)是由 StepFun 和中国科学院大学推出的专用于通用 OCR 任务的多模态大模型，参数量 0.6B，是一款极具突破性的通用OCR多模态模型，旨在解决传统OCR系统（OCR-1.0）和当前大规模视觉语言模型（LVLMs）在OCR任务中的局限性。
+**本仓库支持的模型权重:**
+| Model              |
+|--------------------|
+| stepfun-ai/GOT-OCR2_0  |
+注意：与huggingface权重同名，但权重为paddle框架的Tensor，使用`xxx.from_pretrained("stepfun-ai/GOT-OCR2_0")`即可自动下载该权重文件夹到缓存目录。
+## 2. 环境要求
+- **python >= 3.10**
+- **paddlepaddle-gpu 要求3.0.0b2版本或develop版本**
+```
+# 安装示例
+python -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+```
+- **paddlenlp == 3.0.0b3**
+- **paddlenlp要求是3.0.0b3版本**
+```
+# 安装示例
+python -m pip install paddlenlp==3.0.0b3
+```
+- **其他环境要求**
+```
+pip install -r requirements.txt
+```
+## 3. 推理预测
+注意：GOT-OCR2.0 模型推理显存约需4G，不支持数据类型为"float16"进行推理。
+### 3.1. plain texts OCR:
+```bash
+python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
+  --model_name_or_path stepfun-ai/GOT-OCR2_0 \
+  --image_file paddlemix/demo_images/hospital.jpeg \
+  --ocr_type ocr \
+  --dtype "bfloat16"
+```
+### 3.2. format texts OCR:
+```bash
+python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
+  --model_name_or_path stepfun-ai/GOT-OCR2_0 \
+  --image_file paddlemix/demo_images/hospital.jpeg \
+  --ocr_type format \
+  --dtype "bfloat16"
+```
+### 3.3. multi_crop plain texts OCR:
+```bash
+python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
+  --model_name_or_path stepfun-ai/GOT-OCR2_0 \
+  --image_file paddlemix/demo_images/hospital.jpeg \
+  --ocr_type ocr \
+  --multi_crop \
+  --dtype "bfloat16"
+```
+## 4. 训练
+与[官方github代码库](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/?tab=readme-ov-file#train)一样，目前仅支持基于GOT权重的post-training(stage-2/stage-3)，其中stage2是全参数微调，stage3是冻结vision encoder后微调，默认训练方式是stage2全参数微调，训练显存约10GB每卡。
+### 数据集下载
+PaddleMIX团队提供了一个改版的SynthDoG-EN数据集，统一修改了其原先的question为```<image>\nOCR:```，下载链接为：
+```
+wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/synthdog_en.tar # 2.4G
+```
+synthdog_en.tar包括了图片images文件夹和标注json文件，需下载解压或软链接在PaddleMIX/目录下。
+### 数据集格式
+同[官方例子](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/blob/main/assets/train_sample.jpg)，其中question统一为```<image>\nOCR:```，answer是其OCR结果。
+### 训练命令
+```bash
+sh paddlemix/examples/GOT_OCR_2_0/run_train.sh
+```
+注意：默认训练方式是stage2全参数微调，训练显存约10GB每卡。也可通过设置```--freeze_vision_tower True```冻结vision encoder后微调。
+### 训完后推理
+```bash
+python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
+  --model_name_or_path work_dirs/got_ocr_20/ \
+  --image_file paddlemix/demo_images/hospital.jpeg \
+  --ocr_type ocr
+```
+## 参考文献
+```BibTeX
+@article{wei2024general,
+  title={General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model},
+  author={Wei, Haoran and Liu, Chenglong and Chen, Jinyue and Wang, Jia and Kong, Lingyu and Xu, Yanming and Ge, Zheng and Zhao, Liang and Sun, Jianjian and Peng, Yuang and others},
+  journal={arXiv preprint arXiv:2409.01704},
+  year={2024}
+}
+```

VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/requirement.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+megfile
+natsort
+paddlenlp==3.0.0b3

VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/run_train.sh ADDED Viewed

	@@ -0,0 +1,78 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Launch script for GOT-OCR2.0 post-training via paddle.distributed.launch.
+# Echo every command before it is executed.
+set -x
+# Number of GPUs to launch on this node (override with the GPUS environment variable).
+GPUS=${GPUS:-8}
+# Global batch size and per-device micro-batch size (both overridable from the environment).
+BATCH_SIZE=${BATCH_SIZE:-32}
+PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
+# Gradient-accumulation steps. Shell arithmetic is integer-only, so the result is truncated
+# and becomes 0 when BATCH_SIZE < PER_DEVICE_BATCH_SIZE * GPUS.
+GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
+# Parallelism layout: the sharding degree is derived from GPUS / tensor_parallel_degree.
+tensor_parallel_degree=${tensor_parallel_degree:-1}
+sharding_parallel_degree=$((GPUS / tensor_parallel_degree))
+# Make the current working directory importable (the script path below is relative to it).
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+# NOTE(review): MASTER_PORT is exported as 34229, while the launcher below is given
+# --master 127.0.0.1:8080 through ${MASTER}; confirm which port is actually intended.
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+# Checkpoints, launcher logs and training_log.txt are all written under OUTPUT_DIR.
+OUTPUT_DIR='work_dirs/got_ocr_20'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+# NOTE(review): TRAINING_MODEL_RESUME is assigned but not referenced anywhere else in this script.
+TRAINING_MODEL_RESUME="None"
+TRAINER_INSTANCES='127.0.0.1'
+MASTER='127.0.0.1:8080'
+# --freeze_vision_tower False \ # True for stage3
+# Single-node collective launch with ${GPUS} processes.
+TRAINING_PYTHON="python -m paddle.distributed.launch --master ${MASTER} --nnodes 1 --nproc_per_node ${GPUS} --rank 0 --ips ${TRAINER_INSTANCES} --run_mode=collective"
+# Run the training entry point; stdout and stderr are appended to ${OUTPUT_DIR}/training_log.txt.
+${TRAINING_PYTHON} --log_dir ${OUTPUT_DIR}/paddle_distributed_logs \
+  paddlemix/examples/GOT_OCR_2_0/train_GOT.py \
+  --do_train \
+  --model_name_or_path "stepfun-ai/GOT-OCR2_0" \
+  --output_dir ${OUTPUT_DIR} \
+  --logging_dir ${OUTPUT_DIR}/logs \
+  --meta_path paddlemix/examples/GOT_OCR_2_0/configs/demo_dataset.json \
+  --overwrite_output_dir True \
+  --dataloader_num_workers 8 \
+  --bf16 True \
+  --fp16 False \
+  --fp16_opt_level "O2" \
+  --num_train_epochs 1 \
+  --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
+  --gradient_accumulation_steps ${GRADIENT_ACC} \
+  --freeze_vision_tower False \
+  --use_im_start_end True   \
+  --max_seq_length 8192 \
+  --recompute False \
+  --max_grad_norm 1.0 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 200 \
+  --save_total_limit 1 \
+  --learning_rate 2e-5 \
+  --weight_decay 0. \
+  --warmup_ratio 0.001 \
+  --optim "adamw" \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --report_to "visualdl" \
+  --tensor_parallel_degree=${tensor_parallel_degree} \
+  --sharding_parallel_degree=${sharding_parallel_degree} \
+  --pipeline_parallel_degree=1 \
+  --sep_parallel_degree=1 \
+  --sharding="stage1" \
+  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"

VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/train_GOT.py ADDED Viewed

	@@ -0,0 +1,243 @@

+# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
+# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+import paddle
+import paddle.distributed as dist
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments, set_seed
+from paddlenlp.trainer.trainer import Trainer
+from paddlenlp.trainer.trainer_utils import get_last_checkpoint
+from paddlenlp.transformers import QWenTokenizer
+from paddlemix.datasets.got_dataset import make_supervised_data_module
+from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM
+from paddlemix.models.GOT.utils.utils import smart_tokenizer_and_embedding_resize
+logger = logging.getLogger(__name__)
+def print_trainable_params(model: paddle.nn.Layer) -> None:
+    trainable_params, all_param = 0, 0
+    for k, param in model.named_parameters():
+        num_params = param.size
+        if num_params == 0 and hasattr(param, "ds_numel"):
+            num_params = param.ds_numel
+        all_param += num_params
+        if not param.stop_gradient:
+            # print('{}, shape: {}, requires grad: {}'.format(k, param.shape, not param.stop_gradient))
+            trainable_params += num_params
+    print(
+        "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
+            trainable_params, all_param, 100 * trainable_params / all_param
+        )
+    )
+@dataclass
+class ModelArguments:
+    """Command-line arguments that select the model and configure its vision modules."""
+    # Name or path used to load both the tokenizer and the model weights in train().
+    model_name_or_path: Optional[str] = field(default="stepfun-ai/GOT-OCR2_0")
+    # NOTE(review): not referenced by the train() function in this file.
+    use_cache: bool = field(default=False)
+    # Forwarded to initialize_vision_modules() as ``vision_tower``.
+    vision_tower: Optional[str] = field(default="openai/clip-vit-large-patch14")
+    # When True, train() sets stop_gradient on the parameters of ``vision_tower_high``.
+    freeze_vision_tower: bool = field(default=False)
+    # Forwarded to initialize_vision_tokenizer(); when True, train() also freezes the two
+    # mm projectors and the input embeddings.
+    freeze_lm_model: bool = field(default=False)
+    pretrained_stage1_model: Optional[str] = field(default=None)  # mlp &/ vision tower
+    vision_select_layer: Optional[int] = field(default=-1)  # default to the last layer
+    # Forwarded to initialize_vision_modules() and copied onto the data arguments in train().
+    use_im_start_end: bool = field(default=False)
+@dataclass
+class DataArguments:
+    """Command-line arguments describing the training data; passed to make_supervised_data_module()."""
+    datasets: str = field(default=None, metadata={"help": "combinations of the training data."})
+    meta_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "The path of the meta file of datasets."},
+    )
+    sep_image_conv_front: bool = False
+    # NOTE: train() unconditionally overwrites this value with 256 before building the data module.
+    image_token_len: int = 256
+    image_aspect_ratio: str = "square"
+    conversation_version: str = "mpt"
+    box_limit: int = 0
+    max_seq_length: int = 8192
+@dataclass
+class GOTTrainingArguments(TrainingArguments):
+    """PaddleNLP ``TrainingArguments`` extended with GOT-specific options."""
+    cache_dir: Optional[str] = field(default=None)
+    # NOTE(review): the default is "adamw_torch", while run_train.sh passes --optim "adamw";
+    # confirm the default is a valid optimizer name for the PaddleNLP trainer.
+    optim: str = field(default="adamw_torch")
+    remove_unused_columns: bool = field(default=False)
+    force_fsdp: bool = field(default=False)
+    # ``interleave`` and ``with_box`` are forwarded to make_supervised_data_module() in train().
+    interleave: bool = field(default=False)
+    with_box: bool = field(default=False)
+    # Passed to the tokenizer as ``model_max_length`` in train().
+    model_max_length: int = field(
+        default=512,
+        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
+    )
+    # NOTE(review): the lora_* options below are not referenced by train() in this file.
+    lora_enable: bool = False
+    lora_r: int = 8
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+    lora_weight_path: str = ""
+    lora_bias: str = "none"
+def train():
+    parser = PdArgumentParser((ModelArguments, DataArguments, GOTTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script, and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    training_args.print_config(model_args, "Model")
+    training_args.print_config(data_args, "Data")
+    # Detecting last checkpoint and eventually continue from last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+    # Load model
+    if training_args.fp16_opt_level == "O2":
+        if training_args.fp16:
+            dtype = "float16"
+        elif training_args.bf16 and paddle.amp.is_bfloat16_supported():
+            dtype = "bfloat16"
+        else:
+            raise ValueError("Please specific dtype: --fp16 or --bf16")
+    else:
+        dtype = "float32"
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Load pretrained model, tokenizer, and image processor
+    tokenizer_path = model_args.model_name_or_path
+    print(f"Loading Tokenizer: {tokenizer_path}")
+    tokenizer = QWenTokenizer.from_pretrained(
+        model_args.model_name_or_path, padding_side="right", model_max_length=training_args.model_max_length
+    )
+    print("tokenizer", tokenizer)
+    # print("len(tokenizer)", len(tokenizer))
+    # print("tokenizer.added_tokens_encoder", tokenizer.added_tokens_encoder)
+    # print("tokenizer.added_tokens_decoder", tokenizer.added_tokens_decoder)
+    model = GOTQwenForCausalLM.from_pretrained(model_args.model_name_or_path, dtype=dtype)
+    smart_tokenizer_and_embedding_resize(
+        special_tokens_dict=dict(pad_token="<|endoftext|>"),
+        tokenizer=tokenizer,
+        model=model,
+    )
+    vision_tower_dict = model.get_model().initialize_vision_modules(
+        vision_tower=model_args.vision_tower,
+        pretrained_stage1_model=model_args.pretrained_stage1_model,
+        freeze_vision_tower=model_args.freeze_vision_tower,
+        use_im_start_end=model_args.use_im_start_end,
+        vision_select_layer=model_args.vision_select_layer,
+        dtype=dtype,
+    )
+    model.initialize_vision_tokenizer(
+        tokenizer=tokenizer,
+        freeze_lm_model=model_args.freeze_lm_model,
+        pretrained_stage1_model=model_args.pretrained_stage1_model,
+    )
+    # 'image_processor_high
+    data_args.image_token_len = 256
+    data_args.image_processor = vision_tower_dict["image_processor"]
+    data_args.image_processor_high = vision_tower_dict["image_processor_high"]
+    data_args.use_im_start_end = model_args.use_im_start_end
+    def _freeze_params(module):
+        for param in module.parameters():
+            param.stop_gradient = not False
+    # mixed relation, to be fixed
+    if model_args.freeze_lm_model:
+        _freeze_params(model.get_model().mm_projector)
+        _freeze_params(model.get_model().mm_projector_vary)
+        _freeze_params(model.get_input_embeddings())
+    if model_args.freeze_vision_tower:
+        _freeze_params(model.qwen2.vision_tower_high)
+    print_trainable_params(model)
+    # trainable params: 464959488 || all params: 560528640 || trainable%: 82.9502 # stage3
+    # trainable params: 560528640 || all params: 560528640 || trainable%: 100 # stage2
+    params_grad = [p.numel() for n, p in model.named_parameters() if not p.stop_gradient]
+    print(f"Number of Mapping Trainable Parameters: {int(sum(params_grad)) / (1 << 20):.2f} M")
+    # print trainable parameters
+    if dist.get_rank() == 0:
+        for name, param in model.named_parameters():
+            if not param.stop_gradient:
+                logger.info(name)
+    # set seed for paddle dataloaders
+    set_seed(training_args.seed)
+    data_module = make_supervised_data_module(
+        interleave=training_args.interleave, with_box=training_args.with_box, tokenizer=tokenizer, data_args=data_args
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        tokenizer=tokenizer,
+        **data_module,
+    )
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+        metrics = train_result.metrics
+        try:
+            metrics["train_samples"] = len(data_module["train_dataset"])
+        except:
+            metrics["train_samples"] = -1
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+if __name__ == "__main__":
+    train()

VLMEvalKit_old/PaddleMIX/paddlemix/examples/ppdocbee/app.py ADDED Viewed

	@@ -0,0 +1,350 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import hashlib
+import os
+import os.path
+import sys
+import tempfile
+import time
+from datetime import datetime
+import gradio as gr
+import numpy as np
+import paddle
+from PIL import Image
+# Select the GPU device to use
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# Model configuration
+model_path = "PaddleMIX/PPDocBee-2B-1129"
+dtype = "bfloat16"  # change to float16 on V100
+# Global variable definitions
+model = None
+processor = None
+min_pixels = 256 * 28 * 28  # minimum number of pixels
+max_pixels = 48 * 48 * 28 * 28  # maximum number of pixels
+SERVER_NAME = "localhost"
+# NOTE(review): "PORR" looks like a typo for "PORT"; the name is left unchanged because
+# demo.launch() at the bottom of the file references it.
+SERVER_PORR = 8080
+def check_and_install_paddlemix():
+    try:
+        from paddlemix.models.qwen2_vl.modeling_qwen2_vl import (
+            Qwen2VLForConditionalGeneration,
+        )
+        print("Required Qwen2VL model successfully installed")
+    except ImportError:
+        print("Failed to install required Qwen2VL model even after running the script")
+        sys.exit(1)
+# 在继续之前检查所需模型
+check_and_install_paddlemix()
+from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer
+from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
+from paddlemix.processors.qwen2_vl_processing import (
+    Qwen2VLImageProcessor,
+    Qwen2VLProcessor,
+    process_vision_info,
+)
+# Example inputs, each a [question, image path] pair.
+# NOTE(review): the original comment said the examples use HTTP links, but the entries below
+# are relative file paths.
+EXAMPLES = [
+    [
+        "维修保养、其他注意事项的注意点中，电池需为什么型号的？",
+        "paddlemix/demo_images/shuomingshu_20.png",
+    ],
+    [
+        "产品期限是多久？",
+        "paddlemix/demo_images/shuomingshu_39.png",
+    ],
+]
+class ImageCache:
+    """Manage cached copies of the images used by the demo."""
+    def __init__(self):
+        """Create the temporary cache directory and reset the tracking state."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.current_image = None
+        self.is_example = False  # whether the current image is one of the example images
+        print(f"Created temporary directory for image cache: {self.temp_dir}")
+    def cleanup_previous(self):
+        """Delete the previously cached image file, unless it is an example image."""
+        if self.current_image and os.path.exists(self.current_image) and not self.is_example:
+            try:
+                os.unlink(self.current_image)
+                print(f"Cleaned up previous image: {self.current_image}")
+            except Exception as e:
+                print(f"Error cleaning up previous image: {e}")
+    def cache_image(self, image_path, is_example=False):
+        """
+        Cache an image and return the cached path.
+        Args:
+            image_path: path of the image file
+            is_example: whether the image is an example image
+        Returns:
+            The path of the cached image; ``None`` when ``image_path`` is empty, and the
+            original ``image_path`` when caching raises an exception.
+        """
+        if not image_path:
+            return None
+        try:
+            # If this example image is already the current one, return it directly
+            if is_example and self.current_image == image_path and self.is_example:
+                return self.current_image
+            # Build a safe file name
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            file_hash = hashlib.md5(str(time.time()).encode()).hexdigest()[:8]
+            _, ext = os.path.splitext(image_path)
+            if not ext:
+                ext = ".jpg"  # default extension
+            new_filename = f"image_{timestamp}_{file_hash}{ext}"
+            # Uploaded images get a new path in the temporary directory; example images keep their own path
+            new_path = os.path.join(self.temp_dir, new_filename) if not is_example else image_path
+            if not is_example:
+                # Handle an uploaded image file
+                with Image.open(image_path) as img:
+                    # Convert to RGB if needed
+                    if img.mode != "RGB":
+                        img = img.convert("RGB")
+                    img.save(new_path)
+                # Remove the previous cached image before updating the current one; this runs after
+                # the new copy has been saved
+                self.cleanup_previous()
+            self.current_image = new_path
+            self.is_example = is_example
+            return new_path
+        except Exception as e:
+            print(f"Error caching image: {e}")
+            return image_path
+# 创建全局图片缓存管理器
+image_cache = ImageCache()
+def load_model():
+    """加载模型并进行内存优化"""
+    global model, processor
+    if model is None:
+        # 加载模型和处理器
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_path,
+            dtype=dtype,
+        )
+        image_processor = Qwen2VLImageProcessor()
+        tokenizer = MIXQwen2Tokenizer.from_pretrained(model_path)
+        processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)
+        # 设置为评估模式
+        model.eval()
+    del tokenizer
+    return model, processor
+def clear_cache():
+    """清理GPU缓存"""
+    if paddle.device.cuda.memory_allocated() > 0:
+        paddle.device.cuda.empty_cache()
+        import gc
+        gc.collect()
+def multimodal_understanding(image, question, seed=42, top_p=0.95, temperature=0.1):
+    """
+    多模态理解主函数
+    Args:
+        image: 输入图片
+        question: 问题文本
+        seed: 随机种子
+        top_p: 采样参数
+        temperature: 温度参数
+    Yields:
+        处理状态和结果
+    """
+    # 输入验证
+    if not image:
+        yield "⚠️ 请上传图片后再开始对话。"
+        return
+    if not question or question.strip() == "":
+        yield "⚠️ 请输入您的问题后再开始对话。"
+        return
+    try:
+        start_time = time.time()
+        yield "🔄 正在处理您的请求，请稍候..."
+        # 检查超时
+        if time.time() - start_time > 200:
+            yield "⏳ 系统当前用户繁多，请等待10分钟后再次尝试。感谢您的理解！"
+            return
+        clear_cache()
+        # 设置随机种子
+        paddle.seed(seed)
+        np.random.seed(seed)
+        # 处理图片缓存
+        is_example = any(image == example[1] for example in EXAMPLES)
+        cached_image = image_cache.cache_image(image, is_example=is_example)
+        if not cached_image:
+            return "图片处理失败，请检查图片格式是否正确。"
+        # 构建提示文本
+        prompts = question + "\n请用图片中完整出现的内容回答，可以是单词、短语或句子，针对问题回答尽可能详细和完整，并保持格式、单位、符号和标点都与图片中的文字内容完全一致。"
+        # 构建消息
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": cached_image,
+                    },
+                    {"type": "text", "text": prompts},
+                ],
+            }
+        ]
+        yield "模型正在分析图片内容..."
+        # 处理视觉信息
+        image_inputs, video_inputs = process_vision_info(messages)
+        image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>"
+        text = f"<|im_start|>system\n你是一个非常棒的多模态理解的AI助手。<|im_end|>\n<|im_start|>user\n{image_pad_token}{prompts}<|im_end|>\n<|im_start|>assistant\n"
+        # 生成回答
+        with paddle.no_grad():
+            inputs = processor(
+                text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pd"
+            )
+            yield "正在生成回答..."
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                top_p=top_p,
+                temperature=temperature,
+                num_beams=1,
+                do_sample=True,
+                use_cache=True,
+            )
+            output_text = processor.batch_decode(
+                generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )[0]
+        # 清理内存
+        del inputs, generated_ids
+        clear_cache()
+        yield output_text
+    except Exception as e:
+        error_message = f"处理过程中出现错误: {str(e)}\n请重试或在评论区留下你的问题。"
+        return error_message
+def process_example(question, image):
+    """处理示例图片的包装函数"""
+    cached_path = image_cache.cache_image(image, is_example=True)
+    return multimodal_understanding(cached_path, question)
+def handle_image_upload(image):
+    """处理图片上传"""
+    if image is None:
+        return None
+    try:
+        cached_path = image_cache.cache_image(image, is_example=False)
+        return cached_path
+    except Exception as e:
+        print(f"Error handling image upload: {e}")
+        return None
+# model, processor = load_model()
+# # image = "/home/aistudio/work/doc-lark/PaddleMIX/paddlemix/demo_images/examples_image1.jpg"
+# print(multimodal_understanding(EXAMPLES[1][1],EXAMPLES[1][0]))
+# Gradio UI definition
+with gr.Blocks() as demo:
+    gr.Markdown(
+        value="""
+    # 🤖 PP-DocBee(2B): Multimodal Document Understanding Demo
+    📚 原始模型来自 [PaddleMIX](https://github.com/PaddlePaddle/PaddleMIX)  （🌟 一个基于飞桨PaddlePaddle框架构建的多模态大模型套件）
+    """
+    )
+    with gr.Row():
+        image_input = gr.Image(type="filepath", label="📷 Upload Image or Input URL")
+        with gr.Column():
+            question_input = gr.Textbox(label="💭 Question", placeholder="Enter your question here...")
+            und_seed_input = gr.Number(label="🎲 Seed", precision=0, value=42)
+            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="📊 Top P")
+            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="🌡️ Temperature")
+    # On upload, replace the image component's value with the cached copy's path.
+    image_input.upload(fn=handle_image_upload, inputs=[image_input], outputs=[image_input])
+    understanding_button = gr.Button("💬 Chat", variant="primary")
+    understanding_output = gr.Textbox(label="🤖 Response", interactive=False)
+    # NOTE(review): gr.Examples is created with cache_examples=True before the model is loaded
+    # just below; confirm when Gradio runs process_example to build that cache.
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=[question_input, image_input],
+        outputs=understanding_output,
+        fn=process_example,
+        cache_examples=True,
+        run_on_click=True,
+    )
+    # Load the model (this happens at import time, while the UI is being built)
+    clear_cache()
+    model, processor = load_model()
+    clear_cache()
+    understanding_button.click(
+        fn=multimodal_understanding,
+        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
+        outputs=understanding_output,
+        api_name="chat",
+    )
+if __name__ == "__main__":
+    # Enable the request queue
+    demo.queue()
+    demo.launch(server_name=SERVER_NAME, server_port=SERVER_PORR, share=True, ssr_mode=False, max_threads=1)  # limit the number of concurrent requests

VLMEvalKit_old/PaddleMIX/paddlemix/models/GOT/utils/conversation.py ADDED Viewed

	@@ -0,0 +1,400 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import dataclasses
+from enum import Enum, auto
+from typing import List
+class SeparatorStyle(Enum):
+    """Prompt layouts understood by ``Conversation.get_prompt``."""
+    # System text + separator + newline, then "role: message" + separator per turn.
+    SINGLE = 1
+    # Like SINGLE without the newline; turns alternate between two separators by index.
+    TWO = 2
+    # Roles are emitted verbatim (no ": "); an empty system text is omitted entirely.
+    MPT = 3
+@dataclasses.dataclass
+class Conversation:
+    """Prompt template plus running message history for one chat.
+    ``messages`` holds ``[role, message]`` pairs. A message is a string, a
+    falsy placeholder (``None`` or ``""``) for a turn that has no text yet, or
+    a 3-tuple ``(text, image, image_process_mode)`` carrying a PIL image.
+    The first ``offset`` messages are skipped by ``get_images`` and
+    ``to_gradio_chatbot``; ``get_prompt`` always renders every message.
+    """
+    # Text placed at the start of every prompt.
+    system: str
+    # Role labels used when building the prompt.
+    roles: List[str]
+    # Conversation turns as [role, message] pairs.
+    messages: List[List[str]]
+    # Number of leading messages ignored by get_images/to_gradio_chatbot.
+    offset: int
+    # Selects the layout used by get_prompt.
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    # Primary separator appended after the system text and after messages.
+    sep: str = "<|im_end|>"
+    # Second separator; only read by the SeparatorStyle.TWO layout.
+    sep2: str = None
+    version: str = "Unknown"
+    # Not read anywhere in this class; presumably a flag for UI callers - TODO confirm.
+    skip_next: bool = False
+    @staticmethod
+    def _text_of(message):
+        """Return the text of a message, dropping an attached image if present."""
+        if isinstance(message, tuple):
+            message, _, _ = message
+        return message
+    @staticmethod
+    def _pad_to_square(pil_img, background_color=(122, 116, 104)):
+        """Return ``pil_img`` on a square canvas whose side is its longer edge."""
+        from PIL import Image
+        width, height = pil_img.size
+        if width == height:
+            return pil_img
+        side = max(width, height)
+        result = Image.new(pil_img.mode, (side, side), background_color)
+        # The image is pasted at the origin (top-left), not centered.
+        result.paste(pil_img)
+        return result
+    @staticmethod
+    def _resize_for_display(image, max_len=800, min_len=400):
+        """Shrink ``image`` keeping its aspect ratio; never enlarges it.
+        The short side ends up at most ``min_len`` and the long side at most
+        ``max_len`` pixels.
+        """
+        max_hw, min_hw = max(image.size), min(image.size)
+        aspect_ratio = max_hw / min_hw
+        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+        longest_edge = int(shortest_edge * aspect_ratio)
+        width, height = image.size
+        if height > width:
+            new_size = (shortest_edge, longest_edge)
+        else:
+            new_size = (longest_edge, shortest_edge)
+        return image.resize(new_size)
+    def get_prompt(self):
+        """Render the system text and all messages into one prompt string.
+        Returns:
+            The prompt laid out according to ``sep_style``. A falsy message
+            contributes only its role prefix, leaving the turn open.
+        Raises:
+            ValueError: if ``sep_style`` is not a known ``SeparatorStyle``.
+        """
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep + "\n"
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + self._text_of(message) + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        if self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    # Even-indexed turns end with sep, odd-indexed turns with sep2.
+                    ret += role + ": " + self._text_of(message) + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        if self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep if self.system else ""
+            for role, message in self.messages:
+                if message:
+                    ret += role + self._text_of(message) + self.sep
+                else:
+                    ret += role
+            return ret
+        raise ValueError(f"Invalid style: {self.sep_style}")
+    def append_message(self, role, message):
+        """Append one ``[role, message]`` turn; ``messages`` must be a list."""
+        self.messages.append([role, message])
+    def get_images(self, return_pil=False):
+        """Collect the images attached to every other message after ``offset``.
+        Args:
+            return_pil: if true, return the processed PIL images; otherwise
+                return each image as a base64-encoded JPEG string.
+        Returns:
+            A list with one entry per image-bearing message.
+        Raises:
+            ValueError: if a message carries an unknown ``image_process_mode``.
+        """
+        import base64
+        from io import BytesIO
+        images = []
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            # Only positions 0, 2, 4, ... after the offset are inspected.
+            if i % 2 != 0 or not isinstance(msg, tuple):
+                continue
+            _, image, image_process_mode = msg
+            if image_process_mode == "Pad":
+                image = self._pad_to_square(image)
+            elif image_process_mode == "Crop":
+                image = self._resize_for_display(image)
+            elif image_process_mode == "Resize":
+                image = image.resize((224, 224))
+            else:
+                raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+            if return_pil:
+                images.append(image)
+            else:
+                buffered = BytesIO()
+                image.convert("RGB").save(buffered, format="JPEG")
+                images.append(base64.b64encode(buffered.getvalue()).decode())
+        return images
+    def to_gradio_chatbot(self):
+        """Convert the messages after ``offset`` into ``[user, reply]`` pairs.
+        Messages at even positions open a new pair (reply ``None``); the
+        following message fills in the reply. An attached image is resized,
+        encoded as JPEG and inlined in place of the ``<image>`` placeholder.
+        """
+        import base64
+        from io import BytesIO
+        ret = []
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                if isinstance(msg, tuple):
+                    msg, image, _ = msg
+                    image = self._resize_for_display(image)
+                    buffered = BytesIO()
+                    # JPEG cannot store alpha or palette modes, so convert first.
+                    image.convert("RGB").save(buffered, format="JPEG")
+                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                    # The payload is JPEG, so the data URI declares image/jpeg.
+                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = msg.replace("<image>", img_str)
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        """Return a new conversation with its own list-of-lists ``messages``.
+        ``version`` is carried over; ``skip_next`` is not and resets to its
+        default. ``roles`` is shared with the source object.
+        """
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version,
+        )
+    def dict(self):
+        """Return the template fields as a plain dict.
+        When ``get_images`` finds at least one image, tuple messages are
+        reduced to their text; otherwise ``messages`` is returned as stored.
+        ``sep_style``, ``version`` and ``skip_next`` are not included.
+        """
+        messages = self.messages
+        if self.get_images():
+            messages = [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages]
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+# Human/Assistant template seeded with one example exchange ("###" separator).
+# offset=2 makes get_images/to_gradio_chatbot skip the two seeded messages.
+# messages is a tuple here, so append_message only works on a copy() of it.
+conv_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Give three tips for staying healthy."),
+        (
+            "Assistant",
+            "Sure, here are three tips for staying healthy:\n"
+            "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
+            "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
+            "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
+            "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
+            "activities at least two days per week.\n"
+            "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
+            "vegetables, whole grains, lean proteins, and healthy fats can help support "
+            "your overall health. Try to limit your intake of processed and high-sugar foods, "
+            "and aim to drink plenty of water throughout the day.\n"
+            "3. Get enough sleep: Getting enough quality sleep is essential for your physical "
+            "and mental health. Adults should aim for seven to nine hours of sleep per night. "
+            "Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
+            "help improve the quality of your sleep.",
+        ),
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# Same layout as conv_v1 with a different seeded example exchange.
+conv_v1_2 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+        (
+            "Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
+        ),
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# USER/ASSISTANT template with two separators: " " and "</s>" alternate per turn.
+conv_vicuna_v1_1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# MPT-layout template: roles already contain the "<|im_start|>" marker and a
+# trailing newline, and every turn ends with "<|im_end|>".
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+You should follow the instructions carefully and explain your answers in detail.""",
+    # system = None,
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# MPT-layout template with an empty system text, so get_prompt emits no system block.
+conv_mpt_eval = Conversation(
+    system="",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# MPT-layout template with a text-only chatbot system prompt.
+conv_mpt_text = Conversation(
+    system="""<|im_start|>system
+- You are a helpful assistant chatbot trained by MosaicML.
+- You answer questions.
+- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# USER/GPT template using the two-separator layout.
+conv_bair_v1 = Conversation(
+    system="BEGINNING OF CONVERSATION:",
+    roles=("USER", "GPT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Minimal Human/Assistant template: empty system text, "###" separator.
+simple_conv = Conversation(
+    system="",
+    roles=("Human", "Assistant"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# Human/Assistant template with the GOT vision-assistant system prompt and a
+# seeded greeting exchange (skipped by offset=2 in get_images/to_gradio_chatbot).
+# NOTE(review): the adjacent string literals are concatenated without a space
+# between sentences; the text is runtime prompt data, so it is left as is.
+simple_conv_multimodal = Conversation(
+    system="You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
+    "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+    "Follow the instructions carefully and explain your answers in detail.",
+    # system="",
+    roles=("Human", "Assistant"),
+    messages=(("Human", "Hi!"), ("Assistant", "Hi there!  How can I help you today?\n")),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# MPT-layout variant of the GOT vision-assistant template.
+simple_conv_mpt_multimodal = Conversation(
+    system="""<|im_start|>system
+- You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology.
+- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
+- You should follow the instructions carefully and explain your answers in detail.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Text-only GOT template whose seeded user turn ends with "### Response:".
+simple_conv_legacy = Conversation(
+    system="You are GOT, a large language model trained by Foundation Model Group, Megvii Technology."
+    "You are designed to assist human with a variety of tasks using natural language."
+    "Follow the instructions carefully.",
+    roles=("Human", "Assistant"),
+    messages=(("Human", "Hi!\n\n### Response:"), ("Assistant", "Hi there!  How can I help you today?\n")),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# USER/ASSISTANT two-separator template with the GOT vision-assistant system prompt.
+conv_llava_v1 = Conversation(
+    system="You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
+    "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+    "Follow the instructions carefully and explain your answers in detail.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Template bound to the module-level default name.
+default_conversation = conv_mpt
+# Registry mapping template names to the shared template instances above.
+# The values are the module-level objects themselves, not copies.
+# NOTE(review): "multimodal" maps to simple_conv rather than
+# simple_conv_multimodal, and the "default" key differs from
+# default_conversation - confirm both are intended.
+conv_templates = {
+    "default": simple_conv_multimodal,
+    "simple": simple_conv,
+    "simple_legacy": simple_conv_legacy,
+    "multimodal": simple_conv,
+    "mpt_multimodal": simple_conv_mpt_multimodal,
+    "llava_v1": conv_llava_v1,
+    "mpt_eval": conv_mpt_eval,
+    # fastchat
+    "v1": conv_vicuna_v1_1,
+    "bair_v1": conv_bair_v1,
+    "vicuna_v1_1": conv_vicuna_v1_1,
+    "mpt": conv_mpt,
+    "mpt_text": conv_mpt_text,
+}

VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" from https://github.com/keithito/tacotron """
+import re
+from unidecode import unidecode
+from phonemizer import phonemize
+# Public cleaner pipelines; the helper functions and regex tables stay private.
+__all__ = [
+  "basic_cleaners",
+  "transliteration_cleaners",
+  "english_cleaners",
+  "english_cleaners2"
+]
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+# List of (regular expression, replacement) pairs for abbreviations:
+# Each pattern matches the abbreviation at a word boundary followed by a
+# literal period, case-insensitively; the replacement drops the period.
+# NOTE(review): 'misess' looks like a misspelling of 'misses'; it affects the
+# phonemizer input, so confirm against trained models before changing it.
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+  ('mrs', 'misess'),
+  ('mr', 'mister'),
+  ('dr', 'doctor'),
+  ('st', 'saint'),
+  ('co', 'company'),
+  ('jr', 'junior'),
+  ('maj', 'major'),
+  ('gen', 'general'),
+  ('drs', 'doctors'),
+  ('rev', 'reverend'),
+  ('lt', 'lieutenant'),
+  ('hon', 'honorable'),
+  ('sgt', 'sergeant'),
+  ('capt', 'captain'),
+  ('esq', 'esquire'),
+  ('ltd', 'limited'),
+  ('col', 'colonel'),
+  ('ft', 'fort'),
+]]
+def expand_abbreviations(text):
+  '''Replace every abbreviation listed in _abbreviations with its expansion.'''
+  for pattern, expansion in _abbreviations:
+    text = pattern.sub(expansion, text)
+  return text
+def lowercase(text):
+  '''Return text converted to lower case.'''
+  lowered = text.lower()
+  return lowered
+def collapse_whitespace(text):
+  '''Replace each run of whitespace in text with a single space.'''
+  return _whitespace_re.sub(' ', text)
+def convert_to_ascii(text):
+  '''Transliterate text with unidecode and return the result.'''
+  transliterated = unidecode(text)
+  return transliterated
+def basic_cleaners(text):
+  '''Lowercase text, then collapse whitespace runs; no transliteration is applied.'''
+  return collapse_whitespace(lowercase(text))
+def transliteration_cleaners(text):
+  '''Transliterate text to ASCII, lowercase it, then collapse whitespace runs.'''
+  ascii_text = convert_to_ascii(text)
+  return collapse_whitespace(lowercase(ascii_text))
+def english_cleaners(text):
+  '''Convert English text to espeak (en-us) phonemes.
+    The text is transliterated to ASCII, lowercased and has its abbreviations
+    expanded before phonemization; whitespace runs in the result are collapsed.
+  '''
+  normalized = expand_abbreviations(lowercase(convert_to_ascii(text)))
+  phonemes = phonemize(normalized, language='en-us', backend='espeak', strip=True)
+  return collapse_whitespace(phonemes)
+def english_cleaners2(text):
+  '''Convert English text to espeak (en-us) phonemes, keeping punctuation and stress.
+    Same normalization as english_cleaners, but phonemize is called with
+    preserve_punctuation=True and with_stress=True.
+  '''
+  normalized = expand_abbreviations(lowercase(convert_to_ascii(text)))
+  phonemes = phonemize(
+    normalized,
+    language='en-us',
+    backend='espeak',
+    strip=True,
+    preserve_punctuation=True,
+    with_stress=True,
+  )
+  return collapse_whitespace(phonemes)

VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+Defines the set of symbols used in text input to the model.
+'''
+_pad        = '_'
+_punctuation = ';:,.!?¡¿—…"«»“” '
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+# Export all symbols:
+# Order is pad, punctuation, ASCII letters, IPA letters; a symbol's position
+# in this list is its numeric ID, so the order must not change.
+# NOTE(review): the apostrophe occurs twice in _letters_ipa, so this list
+# contains a duplicate entry.
+symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+# Special symbol ids
+# Index of the space character (the last character of _punctuation).
+SPACE_ID = symbols.index(" ")

VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" from https://github.com/keithito/tacotron """
+from .cleaners import *
+from .symbols import symbols
+# Mappings from symbol to numeric ID and vice versa:
+# If a symbol appears more than once in `symbols`, _symbol_to_id keeps the
+# index of its last occurrence.
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+# The one cleaner applied by _clean_text, regardless of requested cleaner names.
+cleaner = english_cleaners2
+def text_to_sequence(text, cleaner_names):
+  '''Clean text and convert it to the list of symbol IDs.
+    Args:
+      text: string to convert to a sequence
+      cleaner_names: forwarded to _clean_text, which currently ignores it and
+        always applies the module-level `cleaner`
+    Returns:
+      List of integers corresponding to the symbols in the cleaned text
+    Raises:
+      KeyError: if the cleaned text contains a symbol missing from the table
+  '''
+  return [_symbol_to_id[ch] for ch in _clean_text(text, cleaner_names)]
+def cleaned_text_to_sequence(cleaned_text):
+  '''Convert already-cleaned text to the list of symbol IDs.
+    Args:
+      cleaned_text: string whose characters are all known symbols
+    Returns:
+      List of integers corresponding to the symbols in the text
+    Raises:
+      KeyError: if a character is missing from the symbol table
+  '''
+  return list(map(_symbol_to_id.__getitem__, cleaned_text))
+def sequence_to_text(sequence):
+  '''Convert a sequence of symbol IDs back to the string they spell.'''
+  return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
+def _clean_text(text, cleaner_names):
+  '''Run text through the module-level `cleaner`; cleaner_names is accepted but unused.'''
+  return cleaner(text)

VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/unet/attention.py ADDED Viewed

	@@ -0,0 +1,199 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle import nn
+from ppdiffusers.models.attention import GEGLU
+from einops import rearrange, repeat
+from ..diffusionwrapper import default
+def Normalize(in_channels):
+    """Build a 32-group ``GroupNorm`` (epsilon 1e-6) over ``in_channels`` channels."""
+    group_norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, epsilon=1e-6)
+    return group_norm
+class FeedForward(nn.Layer):
+    """Feed-forward block: input projection, dropout, then output projection.
+    The input projection is ``GEGLU`` when ``glu`` is true, otherwise a
+    ``Linear`` followed by ``GELU``. The hidden width is ``int(dim * mult)``.
+    ``dim_out`` is resolved through ``default(dim_out, dim)`` - presumably
+    falling back to ``dim`` when it is ``None``; confirm in diffusionwrapper.
+    """
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        if glu:
+            project_in = GEGLU(dim, hidden_dim)
+        else:
+            project_in = nn.Sequential(nn.Linear(dim, hidden_dim), nn.GELU())
+        self.net = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(hidden_dim, dim_out))
+    def forward(self, x):
+        """Apply the feed-forward stack to ``x``."""
+        return self.net(x)
+class CrossAttention(nn.Layer):
+    """Multi-head scaled dot-product attention of ``x`` over ``context``.
+    Queries come from ``x``; keys and values come from
+    ``default(context, x)`` - presumably ``x`` itself when no context is
+    given; confirm in diffusionwrapper.
+    """
+    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        # Bias-free projections for queries, keys and values.
+        self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias_attr=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias_attr=False)
+        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
+    def forward(self, x, context=None, mask=None):
+        """Attend from ``x`` to ``context``; positions whose mask is not 1 are suppressed."""
+        num_heads = self.heads
+        context = default(context, x)
+        # Project, then fold the head axis into the batch axis.
+        q, k, v = (
+            rearrange(proj, "b n (h d) -> (b h) n d", h=num_heads)
+            for proj in (self.to_q(x), self.to_k(context), self.to_v(context))
+        )
+        sim = paddle.einsum("b i d, b j d -> b i j", q, k) * self.scale
+        if mask is not None:
+            mask = rearrange(mask, "b ... -> b (...)")
+            mask = repeat(mask, "b j -> (b h) () j", h=num_heads)
+            # Masked-out scores become the most negative finite value of the dtype.
+            fill = paddle.full(sim.shape, -paddle.finfo(sim.dtype).max, sim.dtype)
+            sim = paddle.where(~(mask == 1), fill, sim)
+        attn = nn.functional.softmax(sim, axis=-1)
+        out = paddle.einsum("b i j, b j d -> b i d", attn, v)
+        # Unfold the heads back into the feature axis before the output projection.
+        return self.to_out(rearrange(out, "(b h) n d -> b n (h d)", h=num_heads))
+class LinearAttention(nn.Layer):
+    """Linear attention over the flattened spatial positions of a 4-D input.
+    A 1x1 convolution produces queries, keys and values; keys are
+    softmax-normalized over positions and combined with values into a
+    per-head context that is then applied to the queries.
+    """
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2D(dim, hidden_dim * 3, 1, bias_attr=False)
+        self.to_out = nn.Conv2D(hidden_dim, dim, 1)
+    def forward(self, x):
+        """Apply linear attention to ``x`` (unpacked as b, c, h, w)."""
+        _, _, height, width = x.shape
+        q, k, v = rearrange(
+            self.to_qkv(x), "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
+        )
+        k = nn.functional.softmax(k, axis=-1)
+        context = paddle.einsum("bhdn,bhen->bhde", k, v)
+        out = paddle.einsum("bhde,bhdn->bhen", context, q)
+        out = rearrange(out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=height, w=width)
+        return self.to_out(out)
+class BasicTransformerBlock(nn.Layer):
+    """Pre-norm transformer block: self-attention, cross-attention, feed-forward.
+    Each sub-layer is applied to a ``LayerNorm`` of its input and added back
+    residually. ``checkpoint`` is stored on the instance but not used by
+    ``forward``.
+    """
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+    ):
+        super().__init__()
+        # attn1 gets no context_dim, so it always attends over its own input.
+        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        # attn2 attends over the external context when one is passed to forward.
+        self.attn2 = CrossAttention(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+        )
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+    def forward(self, x, context=None, mask=None):
+        """Run the three residual sub-layers; ``mask`` applies to the cross-attention only."""
+        x = x + self.attn1(self.norm1(x))
+        x = x + self.attn2(self.norm2(x), context=context, mask=mask)
+        return x + self.ff(self.norm3(x))
+class SpatialTransformer(nn.Layer):
+    """
+    Transformer block for image-like data.
+    The input is group-normalized, projected with a 1x1 convolution and
+    flattened to (b, h*w, c). ``depth`` transformer blocks are applied, the
+    result is reshaped back to (b, c, h, w), projected by a 1x1 convolution
+    whose weight starts at zero, and added to the original input.
+    """
+    def __init__(
+        self,
+        in_channels,
+        n_heads,
+        d_head,
+        depth=1,
+        dropout=0.0,
+        context_dim=None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)
+        self.proj_in = nn.Conv2D(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        blocks = [
+            BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
+            for _ in range(depth)
+        ]
+        self.transformer_blocks = nn.LayerList(blocks)
+        # Output projection weight is initialized to the constant 0.0.
+        zero_weight = paddle.ParamAttr(initializer=nn.initializer.Constant(value=0.0))
+        self.proj_out = nn.Conv2D(
+            inner_dim, in_channels, kernel_size=1, stride=1, padding=0, weight_attr=zero_weight
+        )
+    def forward(self, x, context=None, mask=None):
+        """Apply the transformer stack to ``x`` and add the result to ``x``."""
+        # note: if no context is given, cross-attention defaults to self-attention
+        _, _, height, width = x.shape
+        residual = x
+        hidden = self.proj_in(self.norm(x))
+        hidden = rearrange(hidden, "b c h w -> b (h w) c")
+        for block in self.transformer_blocks:
+            hidden = block(hidden, context=context, mask=mask)
+        hidden = rearrange(hidden, "b (h w) c -> b c h w", h=height, w=width)
+        return self.proj_out(hidden) + residual

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_augmentation.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from utils.hparams import hparams
+class BaseAugmentation:
+    """
+    Base class for data augmentation.
+    All methods of this class should be thread-safe.
+    Subclasses implement *process_item*, which applies the augmentation to
+    one piece of data.
+    """
+    def __init__(self, data_dirs: list, augmentation_args: dict):
+        self.raw_data_dirs = data_dirs
+        self.augmentation_args = augmentation_args
+        # hop_size divided by audio_sample_rate, both read from hparams.
+        hop_size = hparams["hop_size"]
+        sample_rate = hparams["audio_sample_rate"]
+        self.timestep = hop_size / sample_rate
+    def process_item(self, item: dict, **kwargs) -> dict:
+        """Augment one item; must be overridden by subclasses."""
+        raise NotImplementedError()
+def require_same_keys(func):
+    """Decorate a ``process_item``-style method to check its output keys.
+    The wrapped method is called unchanged; afterwards the key set of the
+    returned dict must equal the key set of the input item.
+    Args:
+        func: a method taking ``(self, item, ...)`` and returning a dict.
+    Returns:
+        The wrapper, carrying the name and docstring of ``func``.
+    Raises:
+        AssertionError: (from the wrapper) if the key sets differ.
+    """
+    import functools
+    @functools.wraps(func)
+    def run(*args, **kwargs):
+        # The item is the first argument after self; accept it by keyword too.
+        item: dict = args[1] if len(args) > 1 else kwargs["item"]
+        res: dict = func(*args, **kwargs)
+        assert set(item.keys()) == set(
+            res.keys()
+        ), f"""Item keys mismatch after augmentation.
+Before: {sorted(item.keys())}
+After: {sorted(res.keys())}"""
+        return res
+    return run

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_binarizer.py ADDED Viewed

	@@ -0,0 +1,330 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import pathlib
+import pickle
+import random
+import shutil
+import warnings
+from copy import deepcopy
+import numpy as np
+import paddle
+from tqdm import tqdm
+from utils.hparams import hparams
+from utils.indexed_datasets import IndexedDatasetBuilder
+from utils.multiprocess_utils import chunked_multiprocess_run
+from utils.phoneme_utils import build_phoneme_list, locate_dictionary
+from utils.plot import distribution_to_figure
+from utils.text_encoder import TokenTextEncoder
+class BinarizationError(Exception):
+    """Raised when the data fails a binarization check, e.g. empty tokens or a dictionary mismatch."""
+class BaseBinarizer:
+    """
+    Base class for data processing.
+    1. *process* and *process_data_split*:
+        process entire data, generate the train-test split (support parallel processing);
+    2. *process_item*:
+        process a single piece of data;
+    3. *get_pitch*:
+        infer the pitch using some algorithm;
+    4. *get_align*:
+        get the alignment using 'mel2ph' format (see https://arxiv.org/abs/1905.09263).
+    5. phoneme encoder, voice encoder, etc.
+    Subclasses should define:
+    1. *load_meta_data*:
+        how to read multiple datasets from files;
+    2. *train_item_names*, *valid_item_names*, *test_item_names*:
+        how to split the dataset;
+    3. load_ph_set:
+        the phoneme set.
+    """
+    def __init__(self, data_dir=None, data_attrs=None):
+        """
+        :param data_dir: one raw data directory or a list of them; defaults to ``hparams["raw_data_dir"]``
+        :param data_attrs: attribute names passed to the dataset builder as ``allowed_attr``; defaults to ``[]``
+        """
+        raw_dirs = hparams["raw_data_dir"] if data_dir is None else data_dir
+        # A single directory is accepted as a shorthand for a one-element list.
+        raw_dirs = raw_dirs if isinstance(raw_dirs, list) else [raw_dirs]
+        self.raw_data_dirs = [pathlib.Path(raw_dir) for raw_dir in raw_dirs]
+        self.binary_data_dir = pathlib.Path(hparams["binary_data_dir"])
+        self.data_attrs = data_attrs if data_attrs is not None else []
+        self.binarization_args = hparams["binarization_args"]
+        self.augmentation_args = hparams.get("augmentation_args", {})
+        has_gpu = paddle.device.cuda.device_count() >= 1
+        self.device = "gpu" if has_gpu else "cpu"
+        # Speaker settings must be in place before the speaker map is built.
+        self.spk_map = None
+        self.spk_ids = hparams["spk_ids"]
+        self.speakers = hparams["speakers"]
+        self.build_spk_map()
+        # Filled later by *load_meta_data* and *process*.
+        self.items = {}
+        self.item_names: list = None
+        self._train_item_names: list = None
+        self._valid_item_names: list = None
+        self.phone_encoder = TokenTextEncoder(vocab_list=build_phoneme_list())
+        hop_size, sample_rate = hparams["hop_size"], hparams["audio_sample_rate"]
+        self.timestep = hop_size / sample_rate
+    def build_spk_map(self):
+        """
+        Validate the speaker settings and build ``self.spk_map`` (speaker name -> speaker id).
+        When no speaker ids are given, the datasets are numbered 0, 1, 2, ... in order.
+        :raises AssertionError: if the speaker settings do not match the raw data directories or ``num_spk``
+        :raises ValueError: if one speaker name is assigned two different ids
+        """
+        num_datasets = len(self.raw_data_dirs)
+        assert isinstance(self.speakers, list), "Speakers must be a list"
+        assert (
+            len(self.speakers) == num_datasets
+        ), "Number of raw data dirs must equal number of speaker names!"
+        if len(self.spk_ids) != 0:
+            assert (
+                len(self.spk_ids) == num_datasets
+            ), "Length of explicitly given spk_ids must equal the number of raw datasets."
+        else:
+            self.spk_ids = list(range(num_datasets))
+        assert (
+            max(self.spk_ids) < hparams["num_spk"]
+        ), f"Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk."
+        self.spk_map = {}
+        for spk_name, spk_id in zip(self.speakers, self.spk_ids):
+            # A name may appear several times, but only with one and the same id.
+            if self.spk_map.get(spk_name, spk_id) != spk_id:
+                raise ValueError(
+                    f"Invalid speaker ID assignment. Name '{spk_name}' is assigned with different speaker IDs: {self.spk_map[spk_name]} and {spk_id}."
+                )
+            self.spk_map[spk_name] = spk_id
+        print("| spk_map: ", self.spk_map)
+    def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id):
+        """
+        Load the metadata of one raw dataset; subclasses must provide the implementation.
+        :param raw_data_dir: directory of the raw dataset
+        :param ds_id: index of the dataset among the raw data directories
+        :param spk_id: speaker id assigned to the dataset
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()
+    def split_train_valid_set(self, item_names):
+        """
+        Split the dataset into training set and validation set.
+        The rules in ``hparams["test_prefixes"]`` are resolved in four passes; a rule is consumed by the
+        first pass in which it matches anything:
+        1. the rule equals a full item name;
+        2. the rule equals the part of an item name after the last ':';
+        3. an item name starts with the rule;
+        4. the part of an item name after the last ':' starts with the rule.
+        :param item_names: all item names of the dataset
+        :return: train_item_names, valid_item_names
+        :raises AssertionError: if either resulting set is empty
+        """
+        known_names = set(item_names)
+        prefixes = {str(pr): (1) for pr in hparams["test_prefixes"]}
+        # A dict is used as an ordered set so that the validation names keep their match order.
+        valid_item_names = {}
+        for prefix in list(prefixes):
+            if prefix in known_names:
+                valid_item_names[prefix] = 1
+                prefixes.pop(prefix)
+        matchers = (
+            lambda name, rule: name.split(":")[-1] == rule,
+            lambda name, rule: name.startswith(rule),
+            lambda name, rule: name.split(":")[-1].startswith(rule),
+        )
+        for matches in matchers:
+            for prefix in list(prefixes):
+                hits = [name for name in item_names if matches(name, prefix)]
+                if hits:
+                    valid_item_names.update(dict.fromkeys(hits, 1))
+                    prefixes.pop(prefix)
+        if len(prefixes) != 0:
+            warnings.warn(
+                f"The following rules in test_prefixes have no matching names in the dataset: {', '.join(prefixes.keys())}",
+                category=UserWarning,
+            )
+            # NOTE(review): this changes the process-wide warning filter; kept for compatibility.
+            warnings.filterwarnings("default")
+        valid_item_names = list(valid_item_names.keys())
+        assert len(valid_item_names) > 0, "Validation set is empty!"
+        # Build the lookup set once instead of once per item.
+        valid_lookup = set(valid_item_names)
+        train_item_names = [x for x in item_names if x not in valid_lookup]
+        assert len(train_item_names) > 0, "Training set is empty!"
+        return train_item_names, valid_item_names
+    @property
+    def train_item_names(self):
+        """Item names of the training set (``None`` until *process* has computed the split)."""
+        return self._train_item_names
+    @property
+    def valid_item_names(self):
+        """Item names of the validation set (``None`` until *process* has computed the split)."""
+        return self._valid_item_names
+    def meta_data_iterator(self, prefix):
+        """
+        Iterate over the items of one split.
+        :param prefix: "train" selects the training items; any other value selects the validation items
+        :return: generator of (item_name, meta_data) pairs
+        """
+        selected = self.train_item_names if prefix == "train" else self.valid_item_names
+        for name in selected:
+            yield name, self.items[name]
+    def process(self):
+        """
+        Run the whole binarization pipeline.
+        Loads the metadata of every raw dataset, splits the item names into training and validation sets,
+        writes ``spk_map.json`` and ``dictionary.txt`` into the binary data directory, checks the phoneme
+        coverage and finally binarizes the validation set followed by the training set.
+        """
+        for ds_id, (spk_id, data_dir) in enumerate(zip(self.spk_ids, self.raw_data_dirs)):
+            self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk_id=spk_id)
+        self.item_names = sorted(self.items.keys())
+        train_names, valid_names = self.split_train_valid_set(self.item_names)
+        self._train_item_names = train_names
+        self._valid_item_names = valid_names
+        if self.binarization_args["shuffle"]:
+            random.shuffle(self.item_names)
+        out_dir = self.binary_data_dir
+        out_dir.mkdir(parents=True, exist_ok=True)
+        with open(out_dir / "spk_map.json", "w", encoding="utf-8") as f:
+            json.dump(self.spk_map, f)
+        shutil.copy(locate_dictionary(), out_dir / "dictionary.txt")
+        self.check_coverage()
+        try:
+            self.process_dataset("valid")
+            # Only the training set is processed with workers and augmentation.
+            train_options = {
+                "num_workers": int(self.binarization_args["num_workers"]),
+                "apply_augmentation": any(args["enabled"] for args in self.augmentation_args.values()),
+            }
+            self.process_dataset("train", **train_options)
+        except KeyboardInterrupt:
+            exit(-1)
+    def check_coverage(self):
+        """
+        Verify that the phonemes used by the transcriptions match the phoneme list exactly.
+        Prints the number of occurrences of every required phoneme, saves the same summary as
+        ``phoneme_distribution.jpg`` in the binary data directory, then compares the two phoneme sets.
+        :raises BinarizationError: if any item has an empty ``ph_seq``, or if the set of occurring phonemes
+            differs from the set returned by ``build_phoneme_list()``
+        """
+        ph_required = set(build_phoneme_list())
+        phoneme_map = dict.fromkeys(ph_required, 0)
+        ph_occurred = set()
+        for item_name, meta_data in self.items.items():
+            ph_seq = meta_data["ph_seq"]
+            # Check this item's own sequence: testing the accumulated phonemes would only ever
+            # catch an empty first item.
+            if len(ph_seq) == 0:
+                raise BinarizationError(f"Empty tokens in {item_name}.")
+            for ph in ph_seq:
+                ph_occurred.add(ph)
+                # Phonemes outside the required set are reported below, not counted.
+                if ph in phoneme_map:
+                    phoneme_map[ph] += 1
+        print("===== Phoneme Distribution Summary =====")
+        x = sorted(phoneme_map.keys())
+        for i, key in enumerate(x):
+            # Ten entries per printed line; the last entry ends without a comma.
+            if i == len(ph_required) - 1:
+                end = "\n"
+            elif i % 10 == 9:
+                end = ",\n"
+            else:
+                end = ", "
+            print(f"'{key}': {phoneme_map[key]}", end=end)
+        values = [phoneme_map[k] for k in x]
+        plt = distribution_to_figure(
+            title="Phoneme Distribution Summary",
+            x_label="Phoneme",
+            y_label="Number of occurrences",
+            items=x,
+            values=values,
+        )
+        filename = self.binary_data_dir / "phoneme_distribution.jpg"
+        plt.savefig(fname=filename, bbox_inches="tight", pad_inches=0.25)
+        print(f"| save summary to '{filename}'")
+        if ph_occurred != ph_required:
+            unrecognizable_phones = ph_occurred.difference(ph_required)
+            missing_phones = ph_required.difference(ph_occurred)
+            raise BinarizationError(
+                f"""transcriptions and dictionary mismatch.
+ (+) {sorted(unrecognizable_phones)}
+ (-) {sorted(missing_phones)}"""
+            )
+    def process_dataset(self, prefix, num_workers=0, apply_augmentation=False):
+        """
+        Binarize one split and write it, together with its ``{prefix}.meta`` file, to the binary data directory.
+        :param prefix: split name; "train" selects the training items, any other value the validation items
+        :param num_workers: passed to ``chunked_multiprocess_run``; 0 processes the items in the current process
+        :param apply_augmentation: whether to also store the variants arranged by *arrange_data_augmentation*
+        :raises AssertionError: if the item numbers returned by the builder are not consecutive
+        """
+        builder = IndexedDatasetBuilder(self.binary_data_dir, prefix=prefix, allowed_attr=self.data_attrs)
+        total_sec = {k: (0.0) for k in self.spk_map}
+        total_raw_sec = {k: (0.0) for k in self.spk_map}
+        extra_info = {"names": {}, "spk_ids": {}, "spk_names": {}, "lengths": {}}
+        max_no = -1
+        args = [
+            [item_name, meta_data, self.binarization_args] for item_name, meta_data in self.meta_data_iterator(prefix)
+        ]
+        aug_map = self.arrange_data_augmentation(self.meta_data_iterator(prefix)) if apply_augmentation else {}
+        def register(entry):
+            # Store one (original or augmented) entry and record its per-item metadata.
+            nonlocal max_no
+            entry_no = builder.add_item(entry)
+            max_no = max(max_no, entry_no)
+            for k, v in entry.items():
+                if isinstance(v, np.ndarray):
+                    # Record the first dimension of every array attribute.
+                    extra_info.setdefault(k, {})[entry_no] = tuple(v.shape)[0]
+            extra_info["names"][entry_no] = entry["name"].split(":", 1)[-1]
+            extra_info["spk_ids"][entry_no] = entry["spk_id"]
+            extra_info["spk_names"][entry_no] = entry["spk_name"]
+            extra_info["lengths"][entry_no] = entry["length"]
+            total_sec[entry["spk_name"]] += entry["seconds"]
+        def postprocess(_item):
+            # Items for which process_item returned None are skipped.
+            if _item is None:
+                return
+            register(_item)
+            total_raw_sec[_item["spk_name"]] += _item["seconds"]
+            for task in aug_map.get(_item["name"], []):
+                register(task["func"](_item, **task["kwargs"]))
+        try:
+            if num_workers > 0:
+                for item in tqdm(
+                    chunked_multiprocess_run(self.process_item, args, num_workers=num_workers),
+                    total=len(args),
+                ):
+                    postprocess(item)
+            else:
+                for a in tqdm(args):
+                    item = self.process_item(*a)
+                    postprocess(item)
+            for k in extra_info:
+                assert set(extra_info[k]) == set(range(max_no + 1)), "Item numbering is not consecutive."
+                # Turn the {item_no: value} mapping into a list ordered by item number.
+                extra_info[k] = [extra_info[k][item_no] for item_no in sorted(extra_info[k])]
+        except KeyboardInterrupt:
+            builder.finalize()
+            raise
+        builder.finalize()
+        if prefix == "train":
+            extra_info.pop("names")
+            extra_info.pop("spk_names")
+        with open(self.binary_data_dir / f"{prefix}.meta", "wb") as f:
+            pickle.dump(extra_info, f)
+        if apply_augmentation:
+            print(f"| {prefix} total duration (before augmentation): {sum(total_raw_sec.values()):.2f}s")
+            print(
+                f"| {prefix} respective duration (before augmentation): "
+                + ", ".join(f"{k}={v:.2f}s" for k, v in total_raw_sec.items())
+            )
+            print(
+                f"| {prefix} total duration (after augmentation): {sum(total_sec.values()):.2f}s ({sum(total_sec.values()) / sum(total_raw_sec.values()):.2f}x)"
+            )
+            print(
+                f"| {prefix} respective duration (after augmentation): "
+                + ", ".join(f"{k}={v:.2f}s" for k, v in total_sec.items())
+            )
+        else:
+            print(f"| {prefix} total duration: {sum(total_raw_sec.values()):.2f}s")
+            print(f"| {prefix} respective duration: " + ", ".join(f"{k}={v:.2f}s" for k, v in total_raw_sec.items()))
+    def arrange_data_augmentation(self, data_iterator):
+        """
+        Arrange the augmentation tasks for a split; all types of data augmentation should be added here.
+        Subclasses must provide the implementation.
+        :param data_iterator: iterator of (item_name, meta_data) pairs
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()
+    def process_item(self, item_name, meta_data, binarization_args):
+        """
+        Process a single piece of data; subclasses must provide the implementation.
+        :param item_name: name of the item
+        :param meta_data: metadata stored for the item
+        :param binarization_args: the binarization settings
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_exporter.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+from pathlib import Path
+from typing import Union
+import paddle
+from utils.hparams import hparams
+class BaseExporter:
+    """
+    Base class for model exporters.
+    Subclasses should define *build_model*, *export_model*, *export_attachments* and *export*.
+    """
+    def __init__(
+        self, device: Union[str, paddle.CPUPlace, paddle.CUDAPlace] = None, cache_dir: Path = None, **kwargs
+    ):
+        """
+        :param device: target device; defaults to "gpu" when a CUDA device is available, otherwise "cpu"
+        :param cache_dir: cache directory; defaults to ``deployment/cache`` two levels above this file
+        :param kwargs: accepted for compatibility and ignored here
+        """
+        if device is None:
+            device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
+        self.device = device
+        self.cache_dir: Path = (
+            cache_dir.resolve() if cache_dir is not None else Path(__file__).parent.parent / "deployment" / "cache"
+        )
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+    def build_spk_map(self) -> dict:
+        """
+        Load the speaker map from ``spk_map.json`` in the work directory.
+        :return: the speaker map, or an empty dict when ``hparams["use_spk_id"]`` is falsy
+        :raises AssertionError: if the map is not a non-empty dict or contains duplicate speaker ids
+        """
+        if hparams["use_spk_id"]:
+            with open(Path(hparams["work_dir"]) / "spk_map.json", "r", encoding="utf8") as f:
+                spk_map = json.load(f)
+            assert isinstance(spk_map, dict) and len(spk_map) > 0, "Invalid or empty speaker map!"
+            assert len(spk_map) == len(set(spk_map.values())), "Duplicate speaker id in speaker map!"
+            return spk_map
+        else:
+            return {}
+    def build_model(self) -> paddle.nn.Layer:
+        """
+        Creates an instance of paddle.nn.Layer and loads its state dict on the target device.
+        """
+        raise NotImplementedError()
+    def export_model(self, path: Path):
+        """
+        Exports the model to ONNX format.
+        :param path: the target model path
+        """
+        raise NotImplementedError()
+    def export_attachments(self, path: Path):
+        """
+        Exports related files and configs (e.g. the dictionary) to the target directory.
+        :param path: the target directory
+        """
+        raise NotImplementedError()
+    def export(self, path: Path):
+        """
+        Exports all the artifacts to the target directory.
+        :param path: the target directory
+        """
+        raise NotImplementedError()

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_svs_infer.py ADDED Viewed

	@@ -0,0 +1,149 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Tuple
+import numpy as np
+import paddle
+from paddlemix.models.diffsinger.utils import hparams
+from paddlemix.models.diffsinger.utils.infer_utils import resample_align_curve
+class BaseSVSInfer:
+    """
+    Base class for SVS inference models.
+    Subclasses should define:
+    1. *build_model*:
+        how to build the model;
+    2. *run_model*:
+        how to run the model (typically, generate a mel-spectrogram and
+        pass it to the pre-built vocoder);
+    3. *preprocess_input*:
+        how to preprocess user input.
+    4. *infer_once*
+        infer from raw inputs to the final outputs
+    """
+    def __init__(self, device=None):
+        """
+        :param device: device to run on; defaults to "gpu" when a CUDA device is available, otherwise "cpu"
+        """
+        self.device = device if device is not None else ("gpu" if paddle.device.cuda.device_count() >= 1 else "cpu")
+        hop_size, sample_rate = hparams["hop_size"], hparams["audio_sample_rate"]
+        self.timestep = hop_size / sample_rate
+        # Both start empty here; nothing in this base class fills them.
+        self.spk_map = {}
+        self.model: paddle.nn.Layer = None
+    def build_model(self, ckpt_steps=None) -> paddle.nn.Layer:
+        """
+        Build the model; subclasses must provide the implementation.
+        :param ckpt_steps: checkpoint selector, interpreted by the subclass
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()
+    def load_speaker_mix(
+        self, param_src: dict, summary_dst: dict, mix_mode: str = "frame", mix_length: int = None
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """
+        Build the speaker ids and the normalized mix proportions for one input.
+        When the input has no mix, the first speaker of ``self.spk_map`` is used with proportion 1.0.
+        A mix is dynamic when any of its proportions is given as a string of space-separated values.
+        :param param_src: param dict
+        :param summary_dst: summary dict; receives a short description of the selected speaker or mix
+        :param mix_mode: 'token' or 'frame'
+        :param mix_length: total tokens or frames to mix
+        :return: spk_mix_id [B=1, 1, N], spk_mix_value [B=1, T, N] (T is 1 for a static mix)
+        :raises AssertionError: if any of the speaker mix checks fails
+        """
+        assert mix_mode == "token" or mix_mode == "frame"
+        param_key = "spk_mix" if mix_mode == "frame" else "ph_spk_mix"
+        summary_solo_key = "spk" if mix_mode == "frame" else "ph_spk"
+        spk_mix_map = param_src.get(param_key)
+        dynamic = False
+        if spk_mix_map is None:
+            # Fail with a clear message instead of a TypeError further down.
+            assert len(self.spk_map) > 0, "Speaker mix checks failed.\nNo speaker is available in the speaker map."
+            spk_mix_map = {next(iter(self.spk_map)): 1.0}
+        else:
+            for name in spk_mix_map:
+                assert name in self.spk_map, f"Speaker '{name}' not found."
+        if len(spk_mix_map) == 1:
+            summary_dst[summary_solo_key] = list(spk_mix_map.keys())[0]
+        elif any([isinstance(val, str) for val in spk_mix_map.values()]):
+            print_mix = "|".join(spk_mix_map.keys())
+            summary_dst[param_key] = f"dynamic({print_mix})"
+            dynamic = True
+        else:
+            print_mix = "|".join([f"{n}:{'%.3f' % spk_mix_map[n]}" for n in spk_mix_map])
+            summary_dst[param_key] = f"static({print_mix})"
+        spk_mix_id_list = []
+        spk_mix_value_list = []
+        if dynamic:
+            for name, values in spk_mix_map.items():
+                spk_mix_id_list.append(self.spk_map[name])
+                if isinstance(values, str):
+                    if mix_mode == "token":
+                        # One proportion per token, used as given.
+                        cur_spk_mix_value = values.split()
+                        assert (
+                            len(cur_spk_mix_value) == mix_length
+                        ), "Speaker mix checks failed. In dynamic token-level mix, number of proportion values must equal number of tokens."
+                        cur_spk_mix_value = paddle.to_tensor(data=np.array(cur_spk_mix_value, "float32")).to(
+                            self.device
+                        )[None]
+                    else:
+                        # Frame-level curve: resample from its own timestep to the model timestep.
+                        cur_spk_mix_value = paddle.to_tensor(
+                            data=resample_align_curve(
+                                np.array(values.split(), "float32"),
+                                original_timestep=float(param_src["spk_mix_timestep"]),
+                                target_timestep=self.timestep,
+                                align_length=mix_length,
+                            )
+                        ).to(self.device)[None]
+                    assert paddle.all(
+                        x=cur_spk_mix_value >= 0.0
+                    ), f"""Speaker mix checks failed.
+Proportions of speaker '{name}' on some {mix_mode}s are negative."""
+                else:
+                    assert (
+                        values >= 0.0
+                    ), f"""Speaker mix checks failed.
+Proportion of speaker '{name}' is negative."""
+                    # Constant proportion; placed on the same device as the curves it is stacked with.
+                    cur_spk_mix_value = paddle.full(shape=(1, mix_length), fill_value=values, dtype="float32").to(
+                        self.device
+                    )
+                spk_mix_value_list.append(cur_spk_mix_value)
+            spk_mix_id = paddle.to_tensor(data=spk_mix_id_list, dtype="int64").to(self.device)[None, None]
+            spk_mix_value = paddle.stack(x=spk_mix_value_list, axis=2)
+            spk_mix_value_sum = paddle.sum(x=spk_mix_value, axis=2, keepdim=True)
+            assert paddle.all(
+                x=spk_mix_value_sum > 0.0
+            ), """Speaker mix checks failed.
+Proportions of speaker mix on some frames sum to zero."""
+            # Normalize so that the proportions at every position sum to one.
+            spk_mix_value /= spk_mix_value_sum
+        else:
+            for name, value in spk_mix_map.items():
+                spk_mix_id_list.append(self.spk_map[name])
+                assert (
+                    value >= 0.0
+                ), f"""Speaker mix checks failed.
+Proportion of speaker '{name}' is negative."""
+                spk_mix_value_list.append(value)
+            spk_mix_id = paddle.to_tensor(data=spk_mix_id_list, dtype="int64").to(self.device)[None, None]
+            spk_mix_value = paddle.to_tensor(data=spk_mix_value_list, dtype="float32").to(self.device)[None, None]
+            spk_mix_value_sum = spk_mix_value.sum()
+            assert (
+                spk_mix_value_sum > 0.0
+            ), """Speaker mix checks failed.
+Proportions of speaker mix sum to zero."""
+            spk_mix_value /= spk_mix_value_sum
+        return spk_mix_id, spk_mix_value
+    def preprocess_input(self, param: dict, idx=0) -> Dict[str, paddle.Tensor]:
+        """
+        Preprocess user input; subclasses must provide the implementation.
+        :param param: param dict of one input
+        :param idx: index of the input
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()
+    def forward_model(self, sample: Dict[str, paddle.Tensor]):
+        """
+        Run the model on one sample; subclasses must provide the implementation.
+        :param sample: tensors keyed by name
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()
+    def run_inference(self, params, **kwargs):
+        """
+        Run inference on the given params; subclasses must provide the implementation.
+        :param params: the inputs to infer from
+        :raises NotImplementedError: always, in this base class
+        """
+        raise NotImplementedError()

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_vocoder.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+class BaseVocoder:
+    """Interface of a vocoder; every method must be implemented by subclasses."""
+    def to_device(self, device):
+        """
+        Move the vocoder to a device.
+        :param device: the target device (device object or str)
+        """
+        raise NotImplementedError()
+    def get_device(self):
+        """
+        :return: device: the device the vocoder is on (device object or str)
+        """
+        raise NotImplementedError()
+    def spec2wav(self, mel, **kwargs):
+        """
+        Convert a mel-spectrogram to a waveform.
+        :param mel: [T, 80]
+        :return: wav: [T']
+        """
+        raise NotImplementedError()

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/aux_decoder/convnext.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+from typing import Optional
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+class ConvNeXtBlock(paddle.nn.Layer):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None (or a
+            non-positive value) means no scaling. Defaults to None.
+        drop_out (float): Dropout probability applied before the residual sum; values <= 0 disable it.
+            Defaults to 0.0.
+    """
+    def __init__(
+        self, dim: int, intermediate_dim: int, layer_scale_init_value: Optional[float] = None, drop_out: float = 0.0
+    ):
+        super().__init__()
+        self.dwconv = paddle.nn.Conv1D(in_channels=dim, out_channels=dim, kernel_size=7, padding=3, groups=dim)
+        self.norm = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-06)
+        self.pwconv1 = paddle.nn.Linear(in_features=dim, out_features=intermediate_dim)
+        self.act = paddle.nn.GELU()
+        self.pwconv2 = paddle.nn.Linear(in_features=intermediate_dim, out_features=dim)
+        # Guard against None first: the documented default would otherwise fail on `None > 0`.
+        use_layer_scale = layer_scale_init_value is not None and layer_scale_init_value > 0
+        self.gamma = (
+            paddle.base.framework.EagerParamBase.from_tensor(
+                tensor=layer_scale_init_value * paddle.ones(shape=[dim]), trainable=True
+            )
+            if use_layer_scale
+            else None
+        )
+        self.drop_path = paddle.nn.Identity()
+        self.dropout = paddle.nn.Dropout(p=drop_out) if drop_out > 0.0 else paddle.nn.Identity()
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        """
+        Apply the block and add the result to its input.
+        :param x: input tensor (assumes [B, C, T] with C == dim, as required by the Conv1D - TODO confirm)
+        :return: tensor of the same shape as *x*
+        """
+        residual = x
+        x = self.dwconv(x)
+        # Swap axes 1 and 2 so that LayerNorm and the linear layers act on the channel axis.
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        # Swap the axes back before the residual sum.
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.dropout(x)
+        x = residual + self.drop_path(x)
+        return x
+class ConvNeXtDecoder(paddle.nn.Layer):
+    """Decoder made of an input convolution, a stack of ConvNeXt blocks and an output convolution."""
+    def __init__(self, in_dims, out_dims, /, *, num_channels=512, num_layers=6, kernel_size=7, dropout_rate=0.1):
+        """
+        :param in_dims: number of input channels
+        :param out_dims: number of output channels
+        :param num_channels: number of channels inside the block stack
+        :param num_layers: number of ConvNeXt blocks
+        :param kernel_size: kernel size of the input and output convolutions
+        :param dropout_rate: dropout probability of every block
+        """
+        super().__init__()
+        same_padding = (kernel_size - 1) // 2
+        self.inconv = paddle.nn.Conv1D(
+            in_channels=in_dims,
+            out_channels=num_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=same_padding,
+        )
+        blocks = [
+            ConvNeXtBlock(
+                dim=num_channels,
+                intermediate_dim=num_channels * 4,
+                layer_scale_init_value=1e-06,
+                drop_out=dropout_rate,
+            )
+            for _ in range(num_layers)
+        ]
+        self.conv = paddle.nn.LayerList(sublayers=blocks)
+        self.outconv = paddle.nn.Conv1D(
+            in_channels=num_channels,
+            out_channels=out_dims,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=same_padding,
+        )
+    def forward(self, x, infer=False):
+        """
+        :param x: input tensor; axes 1 and 2 are swapped before and after the convolutions
+        :param infer: not used by this decoder
+        :return: the decoded tensor, with axes 1 and 2 swapped back
+        """
+        hidden = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        hidden = self.inconv(hidden)
+        for block in self.conv:
+            hidden = block(hidden)
+        hidden = self.outconv(hidden)
+        return hidden.transpose(perm=paddle_aux.transpose_aux_func(hidden.ndim, 1, 2))

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddlemix.models.diffsinger.modules.backbones.lynxnet import LYNXNet
+from paddlemix.models.diffsinger.modules.backbones.wavenet import WaveNet
+from paddlemix.models.diffsinger.utils import filter_kwargs
+BACKBONES = {"wavenet": WaveNet, "lynxnet": LYNXNet}
+def build_backbone(out_dims: int, num_feats: int, backbone_type: str, backbone_args: dict) -> paddle.nn.Layer:
+    """
+    Instantiate the backbone registered under *backbone_type* in ``BACKBONES``.
+    :param out_dims: first positional argument of the backbone
+    :param num_feats: second positional argument of the backbone
+    :param backbone_type: key into ``BACKBONES``
+    :param backbone_args: candidate keyword arguments, passed through ``filter_kwargs`` first
+        (presumably dropping the ones the backbone does not accept - confirm)
+    :return: the backbone instance
+    """
+    backbone_cls = BACKBONES[backbone_type]
+    accepted_args = filter_kwargs(backbone_args, backbone_cls)
+    return backbone_cls(out_dims, num_feats, **accepted_args)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/lynxnet.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPosEmb
+from paddlemix.models.diffsinger.utils.hparams import hparams
+class SwiGLU(paddle.nn.Layer):
+    """Gated activation: splits the input in two along one axis and gates one half with SiLU of the other."""
+    def __init__(self, dim=-1):
+        """
+        :param dim: the axis along which the input is split
+        """
+        super().__init__()
+        self.dim = dim
+    def forward(self, x):
+        """
+        :param x: input tensor
+        :return: ``out * silu(gate)``, where out and gate are the two parts of *x*
+        """
+        half = x.shape[self.dim] // 2
+        # NOTE(review): paddle_aux.split presumably treats the value as the size of each part - confirm.
+        out, gate = paddle_aux.split(x=x, num_or_sections=half, axis=self.dim)
+        return out * paddle.nn.functional.silu(x=gate)
+class Transpose(paddle.nn.Layer):
+    """Layer that swaps two axes of its input, so that a transpose can be placed inside a Sequential."""
+    def __init__(self, dims):
+        """
+        :param dims: the two axes to swap
+        """
+        super().__init__()
+        assert len(dims) == 2, "dims must be a tuple of two dimensions"
+        self.dims = dims
+    def forward(self, x):
+        """
+        :param x: input tensor
+        :return: *x* transposed with the permutation built by ``paddle_aux.transpose_aux_func``
+        """
+        first, second = self.dims
+        perm = paddle_aux.transpose_aux_func(x.ndim, first, second)
+        return x.transpose(perm=perm)
+class LYNXConvModule(paddle.nn.Layer):
+    """
+    Convolution module applied as one sequential stack: LayerNorm, pointwise convolution with SwiGLU
+    gating, depthwise convolution, activation, pointwise convolution and optional dropout.
+    """
+    @staticmethod
+    def calc_same_padding(kernel_size):
+        """
+        :param kernel_size: convolution kernel size
+        :return: (left, right) padding pair; the right value is one smaller for even kernel sizes
+        """
+        left = kernel_size // 2
+        right = left - (kernel_size + 1) % 2
+        return left, right
+    def __init__(self, dim, expansion_factor, kernel_size=31, activation="PReLU", dropout=0.0):
+        """
+        :param dim: number of input and output channels
+        :param expansion_factor: multiplier giving the inner channel count
+        :param kernel_size: kernel size of the depthwise convolution
+        :param activation: "SiLU", "ReLU" or "PReLU"; None selects "PReLU"
+        :param dropout: dropout probability; values <= 0 disable dropout
+        :raises ValueError: if *activation* is not one of the supported names
+        """
+        super().__init__()
+        inner_dim = dim * expansion_factor
+        if activation is None:
+            activation = "PReLU"
+        activation_classes = {
+            "SiLU": paddle.nn.Silu,
+            "ReLU": paddle.nn.ReLU,
+            "PReLU": lambda: paddle.nn.PReLU(num_parameters=inner_dim),
+        }
+        if activation not in activation_classes:
+            raise ValueError(f"{activation} is not a valid activation")
+        _activation = activation_classes[activation]()
+        # Only the left value of the padding pair is used by the depthwise convolution.
+        left_pad, _ = self.calc_same_padding(kernel_size)
+        _dropout = paddle.nn.Dropout(p=dropout) if float(dropout) > 0.0 else paddle.nn.Identity()
+        layers = [
+            paddle.nn.LayerNorm(normalized_shape=dim),
+            Transpose((1, 2)),
+            paddle.nn.Conv1D(in_channels=dim, out_channels=inner_dim * 2, kernel_size=1),
+            SwiGLU(dim=1),
+            paddle.nn.Conv1D(
+                in_channels=inner_dim,
+                out_channels=inner_dim,
+                kernel_size=kernel_size,
+                padding=left_pad,
+                groups=inner_dim,
+            ),
+            _activation,
+            paddle.nn.Conv1D(in_channels=inner_dim, out_channels=dim, kernel_size=1),
+            Transpose((1, 2)),
+            _dropout,
+        ]
+        self.net = paddle.nn.Sequential(*layers)
+    def forward(self, x):
+        """
+        :param x: input tensor
+        :return: the output of the sequential stack
+        """
+        return self.net(x)
+class LYNXNetResidualLayer(paddle.nn.Layer):
+    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation="PReLU", dropout=0.0):
+        super().__init__()
+        self.diffusion_projection = paddle.nn.Conv1D(in_channels=dim, out_channels=dim, kernel_size=1)
+        self.conditioner_projection = paddle.nn.Conv1D(in_channels=dim_cond, out_channels=dim, kernel_size=1)
+        self.convmodule = LYNXConvModule(
+            dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout
+        )
+    def forward(self, x, conditioner, diffusion_step):
+        res_x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.convmodule(x)
+        x = x + res_x
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        return x
+class LYNXNet(paddle.nn.Layer):
+    def __init__(
+        self,
+        in_dims,
+        n_feats,
+        *,
+        num_layers=6,
+        num_channels=512,
+        expansion_factor=2,
+        kernel_size=31,
+        activation="PReLU",
+        dropout=0.0
+    ):
+        """
+        LYNXNet(Linear Gated Depthwise Separable Convolution Network)
+        TIPS:You can control the style of the generated results by modifying the 'activation',
+            - 'PReLU'(default) : Similar to WaveNet
+            - 'SiLU' : Voice will be more pronounced, not recommended for use under DDPM
+            - 'ReLU' : Contrary to 'SiLU', Voice will be weakened
+        """
+        super().__init__()
+        self.in_dims = in_dims
+        self.n_feats = n_feats
+        self.input_projection = paddle.nn.Conv1D(
+            in_channels=in_dims * n_feats, out_channels=num_channels, kernel_size=1
+        )
+        self.diffusion_embedding = paddle.nn.Sequential(
+            SinusoidalPosEmb(num_channels),
+            paddle.nn.Linear(in_features=num_channels, out_features=num_channels * 4),
+            paddle.nn.GELU(),
+            paddle.nn.Linear(in_features=num_channels * 4, out_features=num_channels),
+        )
+        self.residual_layers = paddle.nn.LayerList(
+            sublayers=[
+                LYNXNetResidualLayer(
+                    dim_cond=hparams["hidden_size"],
+                    dim=num_channels,
+                    expansion_factor=expansion_factor,
+                    kernel_size=kernel_size,
+                    activation=activation,
+                    dropout=dropout,
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.norm = paddle.nn.LayerNorm(normalized_shape=num_channels)
+        self.output_projection = paddle.nn.Conv1D(
+            in_channels=num_channels, out_channels=in_dims * n_feats, kernel_size=1
+        )
+        init_Constant = paddle.nn.initializer.Constant(value=0.0)
+        init_Constant(self.output_projection.weight)
+    def forward(self, spec, diffusion_step, cond):
+        """
+        :param spec: [B, F, M, T]
+        :param diffusion_step: [B, 1]
+        :param cond: [B, H, T]
+        :return:
+        """
+        if self.n_feats == 1:
+            x = spec[:, 0]
+        else:
+            x = spec.flatten(start_axis=1, stop_axis=2)
+        x = self.input_projection(x)
+        x = paddle.nn.functional.gelu(x=x)
+        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(axis=-1)
+        for layer in self.residual_layers:
+            x = layer(x, cond, diffusion_step)
+        x = self.norm(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))).transpose(
+            perm=paddle_aux.transpose_aux_func(
+                self.norm(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))).ndim, 1, 2
+            )
+        )
+        x = self.output_projection(x)
+        if self.n_feats == 1:
+            x = x[:, None, :, :]
+        else:
+            x = x.reshape(-1, self.n_feats, self.in_dims, tuple(x.shape)[2])
+        return x

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/wavenet.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import sys
+from math import sqrt
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPosEmb
+from paddlemix.models.diffsinger.utils.hparams import hparams
+class Conv1d(paddle.nn.Conv1D):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        init_KaimingNormal = paddle.nn.initializer.KaimingNormal(nonlinearity="leaky_relu")
+        init_KaimingNormal(self.weight)
+class ResidualBlock(paddle.nn.Layer):
+    def __init__(self, encoder_hidden, residual_channels, dilation):
+        super().__init__()
+        self.residual_channels = residual_channels
+        self.dilated_conv = paddle.nn.Conv1D(
+            in_channels=residual_channels,
+            out_channels=2 * residual_channels,
+            kernel_size=3,
+            padding=dilation,
+            dilation=dilation,
+        )
+        self.diffusion_projection = paddle.nn.Linear(in_features=residual_channels, out_features=residual_channels)
+        self.conditioner_projection = paddle.nn.Conv1D(
+            in_channels=encoder_hidden, out_channels=2 * residual_channels, kernel_size=1
+        )
+        self.output_projection = paddle.nn.Conv1D(
+            in_channels=residual_channels, out_channels=2 * residual_channels, kernel_size=1
+        )
+    def forward(self, x, conditioner, diffusion_step):
+        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(axis=-1)
+        conditioner = self.conditioner_projection(conditioner)
+        y = x + diffusion_step
+        y = self.dilated_conv(y) + conditioner
+        gate, filter = paddle_aux.split(x=y, num_or_sections=[self.residual_channels, self.residual_channels], axis=1)
+        y = paddle.nn.functional.sigmoid(x=gate) * paddle.nn.functional.tanh(x=filter)
+        y = self.output_projection(y)
+        residual, skip = paddle_aux.split(
+            x=y, num_or_sections=[self.residual_channels, self.residual_channels], axis=1
+        )
+        return (x + residual) / math.sqrt(2.0), skip
+class WaveNet(paddle.nn.Layer):
+    def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
+        super().__init__()
+        self.in_dims = in_dims
+        self.n_feats = n_feats
+        self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
+        self.diffusion_embedding = SinusoidalPosEmb(num_channels)
+        self.mlp = paddle.nn.Sequential(
+            paddle.nn.Linear(in_features=num_channels, out_features=num_channels * 4),
+            paddle.nn.Mish(),
+            paddle.nn.Linear(in_features=num_channels * 4, out_features=num_channels),
+        )
+        self.residual_layers = paddle.nn.LayerList(
+            sublayers=[
+                ResidualBlock(
+                    encoder_hidden=hparams["hidden_size"],
+                    residual_channels=num_channels,
+                    dilation=2 ** (i % dilation_cycle_length),
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.skip_projection = Conv1d(num_channels, num_channels, 1)
+        self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1)
+        init_Constant = paddle.nn.initializer.Constant(value=0.0)
+        init_Constant(self.output_projection.weight)
+    def forward(self, spec, diffusion_step, cond):
+        """
+        :param spec: [B, F, M, T]
+        :param diffusion_step: [B, 1]
+        :param cond: [B, H, T]
+        :return:
+        """
+        if self.n_feats == 1:
+            x = spec.squeeze(axis=1)
+        else:
+            x = spec.flatten(start_axis=1, stop_axis=2)
+        x = self.input_projection(x)
+        x = paddle.nn.functional.relu(x=x)
+        diffusion_step = self.diffusion_embedding(diffusion_step)
+        diffusion_step = self.mlp(diffusion_step)
+        skip = []
+        for layer in self.residual_layers:
+            x, skip_connection = layer(x, cond, diffusion_step)
+            skip.append(skip_connection)
+        x = paddle.sum(x=paddle.stack(x=skip), axis=0) / sqrt(len(self.residual_layers))
+        x = self.skip_projection(x)
+        x = paddle.nn.functional.relu(x=x)
+        x = self.output_projection(x)
+        if self.n_feats == 1:
+            x = x[:, None, :, :]
+        else:
+            x = x.reshape(-1, self.n_feats, self.in_dims, tuple(x.shape)[2])
+        return x

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/common_layers.py ADDED Viewed

	@@ -0,0 +1,187 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import math
+import sys
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddle.nn import GELU, LayerNorm
+from paddle.nn import MultiHeadAttention as MultiheadAttention
+from paddle.nn import ReLU
+from paddle.nn import Silu as SiLU
+import paddlemix.models.diffsinger.utils as utils
+class NormalInitEmbedding(paddle.nn.Embedding):
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: (int | None) = None, *args, **kwargs):
+        super().__init__(num_embeddings, embedding_dim, *args, padding_idx=padding_idx, **kwargs)
+        init_Normal = paddle.nn.initializer.Normal(mean=0, std=self._embedding_dim**-0.5)
+        init_Normal(self.weight)
+        if padding_idx is not None:
+            init_Constant = paddle.nn.initializer.Constant(value=0)
+            init_Constant(self.weight[padding_idx])
+class XavierUniformInitLinear(paddle.nn.Linear):
+    def __init__(self, in_features: int, out_features: int, *args, bias: bool = True, **kwargs):
+        super().__init__(in_features, out_features, *args, bias_attr=bias, **kwargs)
+        init_XavierUniform = paddle.nn.initializer.XavierUniform()
+        init_XavierUniform(self.weight)
+        if bias:
+            init_Constant = paddle.nn.initializer.Constant(value=0.0)
+            init_Constant(self.bias)
+class SinusoidalPositionalEmbedding(paddle.nn.Layer):
+    """This module produces sinusoidal positional embeddings of any length.
+    Padding symbols are ignored.
+    The embedding table is cached in ``self.weights`` and rebuilt when a longer input arrives.
+    """
+    def __init__(self, embedding_dim, padding_idx, init_size=1024):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        # Plain attribute (not a registered buffer): precomputed table with ``init_size`` rows.
+        self.weights = SinusoidalPositionalEmbedding.get_embedding(init_size, embedding_dim, padding_idx)
+        # One-element float32 buffer; forward() passes it to ``.to`` when converting the table.
+        self.register_buffer(name="_float_tensor", tensor=paddle.empty(shape=[1], dtype="float32"))
+    @staticmethod
+    def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
+        """Build sinusoidal embeddings.
+        This matches the implementation in tensor2tensor, but differs slightly
+        from the description in Section 3.5 of "Attention Is All You Need".
+        Returns a tensor with ``num_embeddings`` rows: sine features first, then cosine features,
+        plus one zero column when ``embedding_dim`` is odd. The row at ``padding_idx`` is zeroed.
+        """
+        half_dim = embedding_dim // 2
+        # Divides by ``half_dim - 1``, so ``embedding_dim`` must be at least 4.
+        emb = math.log(10000) / (half_dim - 1)
+        emb = paddle.exp(x=paddle.arange(dtype="float32", end=half_dim) * -emb)
+        # Outer product of positions and frequencies.
+        emb = paddle.arange(dtype="float32", end=num_embeddings).unsqueeze(axis=1) * emb.unsqueeze(axis=0)
+        emb = paddle.concat(x=[paddle.sin(x=emb), paddle.cos(x=emb)], axis=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # Pad with a zero column so the width equals ``embedding_dim``.
+            emb = paddle.concat(x=[emb, paddle.zeros(shape=[num_embeddings, 1])], axis=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+        return emb
+    def forward(self, x, incremental_state=None, timestep=None, positions=None):
+        """Input is expected to be of size [bsz x seqlen]."""
+        bsz, seq_len = tuple(x.shape)[:2]
+        max_pos = self.padding_idx + 1 + seq_len
+        # Grow the cached table when the input needs more rows than are available.
+        if self.weights is None or max_pos > self.weights.shape[0]:
+            self.weights = SinusoidalPositionalEmbedding.get_embedding(max_pos, self.embedding_dim, self.padding_idx)
+        # NOTE(review): ``.to(tensor)`` presumably matches the buffer's dtype/place — confirm in paddle_aux.
+        self.weights = self.weights.to(self._float_tensor)
+        if incremental_state is not None:
+            # Incremental path: return one position for every batch element.
+            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
+            return self.weights[self.padding_idx + pos, :].expand(shape=[bsz, 1, -1])
+        positions = utils.make_positions(x, self.padding_idx) if positions is None else positions
+        return self.weights.index_select(axis=0, index=positions.view(-1)).view(bsz, seq_len, -1).detach()
+    @staticmethod
+    def max_positions():
+        """Maximum number of supported positions."""
+        return int(100000.0)
+class TransformerFFNLayer(paddle.nn.Layer):
+    def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0.0, act="gelu"):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.dropout = dropout
+        self.act = act
+        self.ffn_1 = paddle.nn.Conv1D(
+            in_channels=hidden_size, out_channels=filter_size, kernel_size=kernel_size, padding=kernel_size // 2
+        )
+        if self.act == "relu":
+            self.act_fn = paddle.nn.ReLU()
+        elif self.act == "gelu":
+            self.act_fn = paddle.nn.GELU()
+        elif self.act == "swish":
+            self.act_fn = paddle.nn.Silu()
+        self.ffn_2 = XavierUniformInitLinear(filter_size, hidden_size)
+    def forward(self, x):
+        x = self.ffn_1(x.transpose(perm=[1, 2, 0])).transpose(perm=[2, 0, 1])
+        x = x * self.kernel_size**-0.5
+        x = self.act_fn(x)
+        x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+        x = self.ffn_2(x)
+        return x
+class EncSALayer(paddle.nn.Layer):
+    def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act="gelu"):
+        super().__init__()
+        self.dropout = dropout
+        self.layer_norm1 = paddle.nn.LayerNorm(normalized_shape=c)
+        self.self_attn = MultiheadAttention(
+            c,
+            num_heads,
+            dropout=attention_dropout,
+            bias_attr=False,
+        )
+        self.layer_norm2 = paddle.nn.LayerNorm(normalized_shape=c)
+        self.ffn = TransformerFFNLayer(c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act)
+    def forward(self, x, encoder_padding_mask=None, **kwargs):
+        layer_norm_training = kwargs.get("layer_norm_training", None)
+        if layer_norm_training is not None:
+            self.layer_norm1.training = layer_norm_training
+            self.layer_norm2.training = layer_norm_training
+        residual = x
+        x = self.layer_norm1(x)
+        x = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            attn_mask=paddle.any(encoder_padding_mask, -1),  # key_padding_mask=encoder_padding_mask
+        )
+        x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = (
+            x
+            * (1 - encoder_padding_mask.astype(dtype="float32")).transpose(
+                perm=paddle_aux.transpose_aux_func((1 - encoder_padding_mask.astype(dtype="float32")).ndim, 0, 1)
+            )[..., None]
+        )
+        residual = x
+        x = self.layer_norm2(x)
+        x = self.ffn(x)
+        x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = (
+            x
+            * (1 - encoder_padding_mask.astype(dtype="float32")).transpose(
+                perm=paddle_aux.transpose_aux_func((1 - encoder_padding_mask.astype(dtype="float32")).ndim, 0, 1)
+            )[..., None]
+        )
+        return x
+class SinusoidalPosEmb(paddle.nn.Layer):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+    def forward(self, x):
+        device = x.place
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = paddle.exp(x=paddle.arange(end=half_dim) * -emb)
+        emb = x[:, None] * emb[None, :]
+        emb = paddle.concat(x=(emb.sin(), emb.cos()), axis=-1)
+        return emb

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/espnet_positional_embedding.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import sys
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+class PositionalEncoding(paddle.nn.Layer):
+    """Positional encoding.
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+        reverse (bool): Whether to reverse the input position.
+    """
+    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
+        """Construct an PositionalEncoding object."""
+        super(PositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.reverse = reverse
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = paddle.nn.Dropout(p=dropout_rate)
+        # Cached encoding table; filled by extend_pe().
+        self.pe = None
+        # Pre-compute the table for ``max_len`` positions using a dummy [1, max_len] tensor.
+        self.extend_pe(paddle.to_tensor(data=0.0).expand(shape=[1, max_len]))
+    def extend_pe(self, x):
+        """Reset the positional encodings.
+        The cached table is reused (after matching dtype/place to ``x``) when it already covers
+        ``x.shape[1]`` positions; otherwise it is rebuilt for exactly that many positions.
+        """
+        if self.pe is not None:
+            if self.pe.shape[1] >= x.shape[1]:
+                if self.pe.dtype != x.dtype or self.pe.place != x.place:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.place)
+                return
+        if self.reverse:
+            # Positions count down from ``len - 1`` to 0.
+            position = paddle.arange(start=x.shape[1] - 1, end=-1, step=-1.0, dtype="float32").unsqueeze(axis=1)
+        else:
+            position = paddle.arange(start=0, end=x.shape[1], dtype="float32").unsqueeze(axis=1)
+        div_term = paddle.exp(
+            x=paddle.arange(start=0, end=self.d_model, step=2, dtype="float32") * -(math.log(10000.0) / self.d_model)
+        )
+        # Stacking on a new last axis and flattening interleaves sine and cosine values;
+        # the result gets a leading axis of size 1.
+        pe = (
+            paddle.stack(x=[paddle.sin(x=position * div_term), paddle.cos(x=position * div_term)], axis=2)
+            .view(-1, self.d_model)
+            .unsqueeze(axis=0)
+        )
+        self.pe = pe.to(device=x.place, dtype=x.dtype)
+    def forward(self, x: paddle.Tensor):
+        """Add positional encoding.
+        Args:
+            x (paddle.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            paddle.Tensor: Encoded tensor (batch, time, `*`).
+        """
+        self.extend_pe(x)
+        # Scale the input by sqrt(d_model) before adding the encoding.
+        x = x * self.xscale + self.pe[:, : x.shape[1]]
+        return self.dropout(x)
+class ScaledPositionalEncoding(PositionalEncoding):
+    """Scaled positional encoding module.
+    See Sec. 3.2  https://arxiv.org/abs/1809.08895
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        """Initialize class."""
+        super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
+        self.alpha = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.to_tensor(data=1.0))
+    def reset_parameters(self):
+        """Reset parameters."""
+        self.alpha.data = paddle.to_tensor(data=1.0)
+    def forward(self, x):
+        """Add positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+        """
+        self.extend_pe(x)
+        x = x + self.alpha * self.pe[:, : x.shape[1]]
+        return self.dropout(x)
+class RelPositionalEncoding(PositionalEncoding):
+    """Relative positional encoding module.
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        """Initialize class."""
+        super().__init__(d_model, dropout_rate, max_len, reverse=True)
+    def forward(self, x):
+        """Compute positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+            torch.Tensor: Positional embedding tensor (1, time, `*`).
+        """
+        self.extend_pe(x)
+        x = x * self.xscale
+        pos_emb = self.pe[:, : x.shape[1]]
+        return self.dropout(x) + self.dropout(pos_emb)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/compat.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def get_backbone_type(root_config: dict, nested_config: dict = None):
+    if nested_config is None:
+        nested_config = root_config
+    return nested_config.get(
+        "backbone_type", root_config.get("backbone_type", root_config.get("diff_decoder_type", "wavenet"))
+    )
+def get_backbone_args(config: dict, backbone_type: str):
+    args = config.get("backbone_args")
+    if args is not None:
+        return args
+    elif backbone_type == "wavenet":
+        return {
+            "num_layers": config.get("residual_layers"),
+            "num_channels": config.get("residual_channels"),
+            "dilation_cycle_length": config.get("dilation_cycle_length"),
+        }
+    else:
+        return None

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .ddpm import GaussianDiffusion, MultiVarianceDiffusion, PitchDiffusion
+from .reflow import MultiVarianceRectifiedFlow, PitchRectifiedFlow, RectifiedFlow

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/ddpm.py ADDED Viewed

	@@ -0,0 +1,521 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import sys, os
+from collections import deque
+from functools import partial
+from typing import List, Tuple
+import numpy as np
+import paddle
+from tqdm import tqdm
+from paddlemix.models.diffsinger.modules.backbones import build_backbone
+from paddlemix.models.diffsinger.utils.hparams import hparams
+def extract(a, t, x_shape):
+    b, *_ = tuple(t.shape)
+    out = a.take_along_axis(axis=-1, indices=t, broadcast=False)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+def noise_like(shape, device, repeat=False):
+    repeat_noise = lambda: paddle.randn(shape=(1, *shape[1:])).tile(
+        repeat_times=[shape[0], *((1,) * (len(shape) - 1))]
+    )
+    noise = lambda: paddle.randn(shape=shape)
+    return repeat_noise() if repeat else noise()
+def linear_beta_schedule(timesteps, max_beta=0.01):
+    """
+    linear schedule
+    """
+    betas = np.linspace(0.0001, max_beta, timesteps)
+    return betas
+def cosine_beta_schedule(timesteps, s=0.008):
+    """
+    cosine schedule
+    as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
+    """
+    steps = timesteps + 1
+    x = np.linspace(0, steps, steps)
+    alphas_cumprod = np.cos((x / steps + s) / (1 + s) * np.pi * 0.5) ** 2
+    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+    betas = 1 - alphas_cumprod[1:] / alphas_cumprod[:-1]
+    return np.clip(betas, a_min=0, a_max=0.999)
+# Registry mapping a schedule name to the function that builds the beta array for it.
+beta_schedule = {"cosine": cosine_beta_schedule, "linear": linear_beta_schedule}
+class GaussianDiffusion(paddle.nn.Layer):
+    def __init__(
+        self,
+        out_dims,
+        num_feats=1,
+        timesteps=1000,
+        k_step=1000,
+        backbone_type=None,
+        backbone_args=None,
+        betas=None,
+        spec_min=None,
+        spec_max=None,
+    ):
+        super().__init__()
+        self.denoise_fn: paddle.nn.Layer = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
+        self.out_dims = out_dims
+        self.num_feats = num_feats
+        if betas is not None:
+            betas = betas.detach().cpu().numpy() if isinstance(betas, paddle.Tensor) else betas
+        else:
+            betas = beta_schedule[hparams["schedule_type"]](timesteps)
+        alphas = 1.0 - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
+        self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
+        if self.use_shallow_diffusion:
+            assert k_step <= timesteps, "K_step should not be larger than timesteps."
+        self.timesteps = timesteps
+        self.k_step = k_step if self.use_shallow_diffusion else timesteps
+        self.noise_list = deque(maxlen=4)
+        to_torch = partial(paddle.to_tensor, dtype="float32")
+        self.register_buffer(name="betas", tensor=to_torch(betas))
+        self.register_buffer(name="alphas_cumprod", tensor=to_torch(alphas_cumprod))
+        self.register_buffer(name="alphas_cumprod_prev", tensor=to_torch(alphas_cumprod_prev))
+        self.register_buffer(name="sqrt_alphas_cumprod", tensor=to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer(name="sqrt_one_minus_alphas_cumprod", tensor=to_torch(np.sqrt(1.0 - alphas_cumprod)))
+        self.register_buffer(name="log_one_minus_alphas_cumprod", tensor=to_torch(np.log(1.0 - alphas_cumprod)))
+        self.register_buffer(name="sqrt_recip_alphas_cumprod", tensor=to_torch(np.sqrt(1.0 / alphas_cumprod)))
+        self.register_buffer(name="sqrt_recipm1_alphas_cumprod", tensor=to_torch(np.sqrt(1.0 / alphas_cumprod - 1)))
+        posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+        self.register_buffer(name="posterior_variance", tensor=to_torch(posterior_variance))
+        self.register_buffer(
+            name="posterior_log_variance_clipped", tensor=to_torch(np.log(np.maximum(posterior_variance, 1e-20)))
+        )
+        self.register_buffer(
+            name="posterior_mean_coef1", tensor=to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod))
+        )
+        self.register_buffer(
+            name="posterior_mean_coef2",
+            tensor=to_torch((1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)),
+        )
+        spec_min = paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].transpose(
+            perm=paddle_aux.transpose_aux_func(
+                paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+            )
+        )
+        spec_max = paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].transpose(
+            perm=paddle_aux.transpose_aux_func(
+                paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+            )
+        )
+        self.register_buffer(name="spec_min", tensor=spec_min)
+        self.register_buffer(name="spec_max", tensor=spec_max)
+        self.time_scale_factor = self.timesteps
+        self.t_start = 1 - self.k_step / self.timesteps
+        factors = paddle.to_tensor(
+            data=[i for i in range(1, self.timesteps + 1) if self.timesteps % i == 0], dtype="int64"
+        )
+        self.register_buffer(name="timestep_factors", tensor=factors, persistable=False)
+    def q_mean_variance(self, x_start, t):
+        mean = extract(self.sqrt_alphas_cumprod, t, tuple(x_start.shape)) * x_start
+        variance = extract(1.0 - self.alphas_cumprod, t, tuple(x_start.shape))
+        log_variance = extract(self.log_one_minus_alphas_cumprod, t, tuple(x_start.shape))
+        return mean, variance, log_variance
+    def predict_start_from_noise(self, x_t, t, noise):
+        return (
+            extract(self.sqrt_recip_alphas_cumprod, t, tuple(x_t.shape)) * x_t
+            - extract(self.sqrt_recipm1_alphas_cumprod, t, tuple(x_t.shape)) * noise
+        )
+    def q_posterior(self, x_start, x_t, t):
+        posterior_mean = (
+            extract(self.posterior_mean_coef1, t, tuple(x_t.shape)) * x_start
+            + extract(self.posterior_mean_coef2, t, tuple(x_t.shape)) * x_t
+        )
+        posterior_variance = extract(self.posterior_variance, t, tuple(x_t.shape))
+        posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, tuple(x_t.shape))
+        return (posterior_mean, posterior_variance, posterior_log_variance_clipped)
+    def p_mean_variance(self, x, t, cond):
+        noise_pred = self.denoise_fn(x, t, cond=cond)
+        x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
+        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+        return model_mean, posterior_variance, posterior_log_variance
+    @paddle.no_grad()
+    def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
+        b, *_, device = *tuple(x.shape), x.place
+        model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond)
+        noise = noise_like(tuple(x.shape), device, repeat_noise)
+        nonzero_mask = (1 - (t == 0).astype(dtype="float32")).reshape(b, *((1,) * (len(tuple(x.shape)) - 1)))
+        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+    @paddle.no_grad()
+    def p_sample_ddim(self, x, t, interval, cond):
+        a_t = extract(self.alphas_cumprod, t, tuple(x.shape))
+        a_prev = extract(self.alphas_cumprod, paddle_aux.max(t - interval, paddle.zeros_like(x=t)), tuple(x.shape))
+        noise_pred = self.denoise_fn(x, t, cond=cond)
+        x_prev = a_prev.sqrt() * (
+            x / a_t.sqrt() + (((1 - a_prev) / a_prev).sqrt() - ((1 - a_t) / a_t).sqrt()) * noise_pred
+        )
+        return x_prev
+    @paddle.no_grad()
+    def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
+        """
+        Use the PLMS method from
+        [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
+        """
+        def get_x_pred(x, noise_t, t):
+            a_t = extract(self.alphas_cumprod, t, tuple(x.shape))
+            a_prev = extract(self.alphas_cumprod, paddle_aux.max(t - interval, paddle.zeros_like(x=t)), tuple(x.shape))
+            a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
+            x_delta = (a_prev - a_t) * (
+                1 / (a_t_sq * (a_t_sq + a_prev_sq)) * x
+                - 1 / (a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t
+            )
+            x_pred = x + x_delta
+            return x_pred
+        noise_list = self.noise_list
+        noise_pred = self.denoise_fn(x, t, cond=cond)
+        if len(noise_list) == 0:
+            x_pred = get_x_pred(x, noise_pred, t)
+            noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
+            noise_pred_prime = (noise_pred + noise_pred_prev) / 2
+        elif len(noise_list) == 1:
+            noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
+        elif len(noise_list) == 2:
+            noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
+        else:
+            noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
+        x_prev = get_x_pred(x, noise_pred_prime, t)
+        noise_list.append(noise_pred)
+        return x_prev
+    def q_sample(self, x_start, t, noise):
+        return (
+            extract(self.sqrt_alphas_cumprod, t, tuple(x_start.shape)) * x_start
+            + extract(self.sqrt_one_minus_alphas_cumprod, t, tuple(x_start.shape)) * noise
+        )
+    def p_losses(self, x_start, t, cond, noise=None):
+        if noise is None:
+            noise = paddle.randn(shape=x_start.shape, dtype=x_start.dtype)
+        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+        x_recon = self.denoise_fn(x_noisy, t, cond)
+        return x_recon, noise
+    def inference(self, cond, b=1, x_start=None, device=None):
+        """
+        Run the reverse sampling loop and return the generated features.
+        :param cond: conditioning forwarded to ``denoise_fn``; its size along axis 2 sets the last noise dimension
+        :param b: batch size used for the initial noise and the per-step timestep tensors
+        :param x_start: source sample for shallow diffusion; required whenever fewer than ``self.timesteps`` steps are run
+        :param device: accepted but not used in this method
+        :return: sampled tensor after the axis-2/axis-3 transpose and the squeeze of axis 1
+        """
+        # Requested depth: "K_step_infer" from hparams, falling back to the configured k_step.
+        depth = hparams.get("K_step_infer", self.k_step)
+        speedup = hparams["diff_speedup"]
+        if speedup > 0:
+            assert depth % speedup == 0, f"Acceleration ratio must be a factor of diffusion depth {depth}."
+        noise = paddle.randn(shape=[b, self.num_feats, self.out_dims, tuple(cond.shape)[2]])
+        # Number of denoising steps to run; the requested depth is only honoured with shallow diffusion.
+        if self.use_shallow_diffusion:
+            t_max = min(depth, self.k_step)
+        else:
+            t_max = self.k_step
+        # Starting point: pure noise, the source noised to step t_max - 1, or the source itself.
+        if t_max >= self.timesteps:
+            x = noise
+        elif t_max > 0:
+            assert x_start is not None, "Missing shallow diffusion source."
+            x = self.q_sample(x_start, paddle.full(shape=(b,), fill_value=t_max - 1, dtype="int64"), noise)
+        else:
+            assert x_start is not None, "Missing shallow diffusion source."
+            x = x_start
+        # Accelerated samplers are used only when speedup > 1; otherwise every step runs p_sample.
+        if speedup > 1 and t_max > 0:
+            algorithm = hparams["diff_accelerator"]
+            if algorithm == "dpm-solver":
+                # NOTE(review): the module name suggests a PyTorch implementation — confirm it accepts Paddle tensors.
+                from inference.dpm_solver_pytorch import (
+                    DPM_Solver,
+                    NoiseScheduleVP,
+                    model_wrapper,
+                )
+                noise_schedule = NoiseScheduleVP(schedule="discrete", betas=self.betas[:t_max])
+                # Wrap the denoiser so that every network call advances the progress bar.
+                def my_wrapper(fn):
+                    def wrapped(x, t, **kwargs):
+                        ret = fn(x, t, **kwargs)
+                        self.bar.update(1)
+                        return ret
+                    return wrapped
+                model_fn = model_wrapper(
+                    my_wrapper(self.denoise_fn), noise_schedule, model_type="noise", model_kwargs={"cond": cond}
+                )
+                dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+                steps = t_max // hparams["diff_speedup"]
+                self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams["infer"], leave=False)
+                x = dpm_solver.sample(x, steps=steps, order=2, skip_type="time_uniform", method="multistep")
+                self.bar.close()
+            elif algorithm == "unipc":
+                from inference.uni_pc import NoiseScheduleVP, UniPC, model_wrapper
+                noise_schedule = NoiseScheduleVP(schedule="discrete", betas=self.betas[:t_max])
+                # Same progress-bar wrapper as in the dpm-solver branch.
+                def my_wrapper(fn):
+                    def wrapped(x, t, **kwargs):
+                        ret = fn(x, t, **kwargs)
+                        self.bar.update(1)
+                        return ret
+                    return wrapped
+                model_fn = model_wrapper(
+                    my_wrapper(self.denoise_fn), noise_schedule, model_type="noise", model_kwargs={"cond": cond}
+                )
+                uni_pc = UniPC(model_fn, noise_schedule, variant="bh2")
+                steps = t_max // hparams["diff_speedup"]
+                self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams["infer"], leave=False)
+                x = uni_pc.sample(x, steps=steps, order=2, skip_type="time_uniform", method="multistep")
+                self.bar.close()
+            elif algorithm == "pndm":
+                # Bounded history of earlier noise predictions, read and appended to by p_sample_plms.
+                self.noise_list = deque(maxlen=4)
+                iteration_interval = speedup
+                for i in tqdm(
+                    reversed(range(0, t_max, iteration_interval)),
+                    desc="sample time step",
+                    total=t_max // iteration_interval,
+                    disable=not hparams["infer"],
+                    leave=False,
+                ):
+                    x = self.p_sample_plms(
+                        x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), iteration_interval, cond=cond
+                    )
+            elif algorithm == "ddim":
+                iteration_interval = speedup
+                for i in tqdm(
+                    reversed(range(0, t_max, iteration_interval)),
+                    desc="sample time step",
+                    total=t_max // iteration_interval,
+                    disable=not hparams["infer"],
+                    leave=False,
+                ):
+                    x = self.p_sample_ddim(
+                        x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), iteration_interval, cond=cond
+                    )
+            else:
+                raise ValueError(f"Unsupported acceleration algorithm for DDPM: {algorithm}.")
+        else:
+            # Unaccelerated path: one p_sample call per timestep, from t_max - 1 down to 0.
+            for i in tqdm(
+                reversed(range(0, t_max)),
+                desc="sample time step",
+                total=t_max,
+                disable=not hparams["infer"],
+                leave=False,
+            ):
+                x = self.p_sample(x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), cond)
+        # presumably swaps axes 2 and 3 before dropping axis 1 — confirm against paddle_aux.transpose_aux_func
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 2, 3)).squeeze(axis=1)
+        return x
+    def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
+        """
+        conditioning diffusion, use fastspeech2 encoder output as the condition
+        """
+        cond = condition.transpose(perm=paddle_aux.transpose_aux_func(condition.ndim, 1, 2))
+        b, device = tuple(condition.shape)[0], condition.place
+        if not infer:
+            spec = self.norm_spec(gt_spec).transpose(
+                perm=paddle_aux.transpose_aux_func(self.norm_spec(gt_spec).ndim, -2, -1)
+            )
+            if self.num_feats == 1:
+                spec = spec[:, None, :, :]
+            t = paddle.randint(low=0, high=self.k_step, shape=(b,)).astype(dtype="int64")
+            x_recon, noise = self.p_losses(spec, t, cond=cond)
+            return x_recon, noise
+        else:
+            if src_spec is not None:
+                spec = self.norm_spec(src_spec).transpose(
+                    perm=paddle_aux.transpose_aux_func(self.norm_spec(src_spec).ndim, -2, -1)
+                )
+                if self.num_feats == 1:
+                    spec = spec[:, None, :, :]
+            else:
+                spec = None
+            x = self.inference(cond, b=b, x_start=spec, device=device)
+            return self.denorm_spec(x)
+    def norm_spec(self, x):
+        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
+    def denorm_spec(self, x):
+        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+class RepetitiveDiffusion(GaussianDiffusion):
+    def __init__(
+        self,
+        vmin: (float | int | list),
+        vmax: (float | int | list),
+        repeat_bins: int,
+        timesteps=1000,
+        k_step=1000,
+        backbone_type=None,
+        backbone_args=None,
+        betas=None,
+    ):
+        assert isinstance(vmin, (float, int)) and isinstance(vmin, (float, int)) or len(vmin) == len(vmax)
+        num_feats = 1 if isinstance(vmin, (float, int)) else len(vmin)
+        spec_min = [vmin] if num_feats == 1 else [[v] for v in vmin]
+        spec_max = [vmax] if num_feats == 1 else [[v] for v in vmax]
+        self.repeat_bins = repeat_bins
+        super().__init__(
+            out_dims=repeat_bins,
+            num_feats=num_feats,
+            timesteps=timesteps,
+            k_step=k_step,
+            backbone_type=backbone_type,
+            backbone_args=backbone_args,
+            betas=betas,
+            spec_min=spec_min,
+            spec_max=spec_max,
+        )
+    def norm_spec(self, x):
+        """
+        :param x: [B, T] or [B, F, T]
+        :return [B, T, R] or [B, F, T, R]
+        """
+        if self.num_feats == 1:
+            repeats = [1, 1, self.repeat_bins]
+        else:
+            repeats = [1, 1, 1, self.repeat_bins]
+        return super().norm_spec(x.unsqueeze(axis=-1).tile(repeat_times=repeats))
+    def denorm_spec(self, x):
+        """
+        :param x: [B, T, R] or [B, F, T, R]
+        :return [B, T] or [B, F, T]
+        """
+        return super().denorm_spec(x).mean(axis=-1)
+class PitchDiffusion(RepetitiveDiffusion):
+    def __init__(
+        self,
+        vmin: float,
+        vmax: float,
+        cmin: float,
+        cmax: float,
+        repeat_bins,
+        timesteps=1000,
+        k_step=1000,
+        backbone_type=None,
+        backbone_args=None,
+        betas=None,
+    ):
+        self.vmin = vmin
+        self.vmax = vmax
+        self.cmin = cmin
+        self.cmax = cmax
+        super().__init__(
+            vmin=vmin,
+            vmax=vmax,
+            repeat_bins=repeat_bins,
+            timesteps=timesteps,
+            k_step=k_step,
+            backbone_type=backbone_type,
+            backbone_args=backbone_args,
+            betas=betas,
+        )
+    def norm_spec(self, x):
+        return super().norm_spec(x.clip(min=self.cmin, max=self.cmax))
+    def denorm_spec(self, x):
+        return super().denorm_spec(x).clip(min=self.cmin, max=self.cmax)
+class MultiVarianceDiffusion(RepetitiveDiffusion):
+    def __init__(
+        self,
+        ranges: List[Tuple[float, float]],
+        clamps: List[Tuple[float | None, float | None] | None],
+        repeat_bins,
+        timesteps=1000,
+        k_step=1000,
+        backbone_type=None,
+        backbone_args=None,
+        betas=None,
+    ):
+        assert len(ranges) == len(clamps)
+        self.clamps = clamps
+        vmin = [r[0] for r in ranges]
+        vmax = [r[1] for r in ranges]
+        if len(vmin) == 1:
+            vmin = vmin[0]
+        if len(vmax) == 1:
+            vmax = vmax[0]
+        super().__init__(
+            vmin=vmin,
+            vmax=vmax,
+            repeat_bins=repeat_bins,
+            timesteps=timesteps,
+            k_step=k_step,
+            backbone_type=backbone_type,
+            backbone_args=backbone_args,
+            betas=betas,
+        )
+    def clamp_spec(self, xs: (list | tuple)):
+        clamped = []
+        for x, c in zip(xs, self.clamps):
+            if c is None:
+                clamped.append(x)
+                continue
+            clamped.append(x.clip(min=c[0], max=c[1]))
+        return clamped
+    def norm_spec(self, xs: (list | tuple)):
+        """
+        :param xs: sequence of [B, T]
+        :return: [B, F, T] => super().norm_spec(xs) => [B, F, T, R]
+        """
+        assert len(xs) == self.num_feats
+        clamped = self.clamp_spec(xs)
+        xs = paddle.stack(x=clamped, axis=1)
+        if self.num_feats == 1:
+            xs = xs.squeeze(axis=1)
+        return super().norm_spec(xs)
+    def denorm_spec(self, xs):
+        """
+        :param xs: [B, T, R] or [B, F, T, R] => super().denorm_spec(xs) => [B, T] or [B, F, T]
+        :return: sequence of [B, T]
+        """
+        xs = super().denorm_spec(xs)
+        if self.num_feats == 1:
+            xs = [xs]
+        else:
+            xs = xs.unbind(axis=1)
+        assert len(xs) == self.num_feats
+        return self.clamp_spec(xs)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/reflow.py ADDED Viewed

	@@ -0,0 +1,311 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import sys
+from typing import List, Tuple
+import paddle
+from tqdm import tqdm
+from paddlemix.models.diffsinger.modules.backbones import build_backbone
+from paddlemix.models.diffsinger.utils.hparams import hparams
+from paddlemix.models.diffsinger.utils import paddle_aux
+class RectifiedFlow(paddle.nn.Layer):
+    def __init__(
+        self,
+        out_dims,
+        num_feats=1,
+        t_start=0.0,
+        time_scale_factor=1000,
+        backbone_type=None,
+        backbone_args=None,
+        spec_min=None,
+        spec_max=None,
+    ):
+        super().__init__()
+        self.velocity_fn: paddle.nn.Layer = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
+        self.out_dims = out_dims
+        self.num_feats = num_feats
+        self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
+        if self.use_shallow_diffusion:
+            assert 0.0 <= t_start <= 1.0, "T_start should be in [0, 1]."
+        else:
+            t_start = 0.0
+        self.t_start = t_start
+        self.time_scale_factor = time_scale_factor
+        spec_min = paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].transpose(
+            perm=paddle_aux.transpose_aux_func(
+                paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+            )
+        )
+        spec_max = paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].transpose(
+            perm=paddle_aux.transpose_aux_func(
+                paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+            )
+        )
+        self.register_buffer(name="spec_min", tensor=spec_min, persistable=False)
+        self.register_buffer(name="spec_max", tensor=spec_max, persistable=False)
+    def p_losses(self, x_end, t, cond):
+        x_start = paddle.randn(shape=x_end.shape, dtype=x_end.dtype)
+        x_t = x_start + t[:, None, None, None] * (x_end - x_start)
+        v_pred = self.velocity_fn(x_t, t * self.time_scale_factor, cond)
+        return v_pred, x_end - x_start
+    def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
+        cond = condition.transpose(perm=paddle_aux.transpose_aux_func(condition.ndim, 1, 2))
+        b, device = tuple(condition.shape)[0], condition.place
+        if not infer:
+            spec = self.norm_spec(gt_spec).transpose(
+                perm=paddle_aux.transpose_aux_func(self.norm_spec(gt_spec).ndim, -2, -1)
+            )
+            if self.num_feats == 1:
+                spec = spec[:, None, :, :]
+            t = self.t_start + (1.0 - self.t_start) * paddle.rand(shape=(b,))
+            v_pred, v_gt = self.p_losses(spec, t, cond=cond)
+            return v_pred, v_gt, t
+        else:
+            if src_spec is not None:
+                spec = self.norm_spec(src_spec).transpose(
+                    perm=paddle_aux.transpose_aux_func(self.norm_spec(src_spec).ndim, -2, -1)
+                )
+                if self.num_feats == 1:
+                    spec = spec[:, None, :, :]
+            else:
+                spec = None
+            x = self.inference(cond, b=b, x_end=spec, device=device)
+            return self.denorm_spec(x)
+    @paddle.no_grad()
+    def sample_euler(self, x, t, dt, cond):
+        x += self.velocity_fn(x, self.time_scale_factor * t, cond) * dt
+        t += dt
+        return x, t
+    @paddle.no_grad()
+    def sample_rk2(self, x, t, dt, cond):
+        k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
+        k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+        x += k_2 * dt
+        t += dt
+        return x, t
+    @paddle.no_grad()
+    def sample_rk4(self, x, t, dt, cond):
+        k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
+        k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+        k_3 = self.velocity_fn(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+        k_4 = self.velocity_fn(x + k_3 * dt, self.time_scale_factor * (t + dt), cond)
+        x += (k_1 + 2 * k_2 + 2 * k_3 + k_4) * dt / 6
+        t += dt
+        return x, t
+    @paddle.no_grad()
+    def sample_rk5(self, x, t, dt, cond):
+        k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
+        k_2 = self.velocity_fn(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond)
+        k_3 = self.velocity_fn(x + 0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond)
+        k_4 = self.velocity_fn(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+        k_5 = self.velocity_fn(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond)
+        k_6 = self.velocity_fn(
+            x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, self.time_scale_factor * (t + dt), cond
+        )
+        x += (7 * k_1 + 32 * k_3 + 12 * k_4 + 32 * k_5 + 7 * k_6) * dt / 90
+        t += dt
+        return x, t
+    @paddle.no_grad()
+    def inference(self, cond, b=1, x_end=None, device=None):
+        noise = paddle.randn(shape=[b, self.num_feats, self.out_dims, tuple(cond.shape)[2]])
+        t_start = hparams.get("T_start_infer", self.t_start)
+        if self.use_shallow_diffusion and t_start > 0:
+            assert x_end is not None, "Missing shallow diffusion source."
+            if t_start >= 1.0:
+                t_start = 1.0
+                x = x_end
+            else:
+                x = t_start * x_end + (1 - t_start) * noise
+        else:
+            t_start = 0.0
+            x = noise
+        algorithm = hparams["sampling_algorithm"]
+        infer_step = hparams["sampling_steps"]
+        if t_start < 1:
+            dt = (1.0 - t_start) / max(1, infer_step)
+            algorithm_fn = {
+                "euler": self.sample_euler,
+                "rk2": self.sample_rk2,
+                "rk4": self.sample_rk4,
+                "rk5": self.sample_rk5,
+            }.get(algorithm)
+            if algorithm_fn is None:
+                raise ValueError(f"Unsupported algorithm for Rectified Flow: {algorithm}.")
+            dts = paddle.to_tensor(data=[dt]).to(x)
+            for i in tqdm(
+                range(infer_step), desc="sample time step", total=infer_step, disable=not hparams["infer"], leave=False
+            ):
+                x, _ = algorithm_fn(x, t_start + i * dts, dt, cond)
+            x = x.astype(dtype="float32")
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 2, 3)).squeeze(axis=1)
+        return x
+    def norm_spec(self, x):
+        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
+    def denorm_spec(self, x):
+        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+class RepetitiveRectifiedFlow(RectifiedFlow):
+    def __init__(
+        self,
+        vmin: (float | int | list),
+        vmax: (float | int | list),
+        repeat_bins: int,
+        time_scale_factor=1000,
+        backbone_type=None,
+        backbone_args=None,
+    ):
+        assert isinstance(vmin, (float, int)) and isinstance(vmin, (float, int)) or len(vmin) == len(vmax)
+        num_feats = 1 if isinstance(vmin, (float, int)) else len(vmin)
+        spec_min = [vmin] if num_feats == 1 else [[v] for v in vmin]
+        spec_max = [vmax] if num_feats == 1 else [[v] for v in vmax]
+        self.repeat_bins = repeat_bins
+        super().__init__(
+            out_dims=repeat_bins,
+            num_feats=num_feats,
+            time_scale_factor=time_scale_factor,
+            backbone_type=backbone_type,
+            backbone_args=backbone_args,
+            spec_min=spec_min,
+            spec_max=spec_max,
+        )
+    def norm_spec(self, x):
+        """
+        :param x: [B, T] or [B, F, T]
+        :return [B, T, R] or [B, F, T, R]
+        """
+        if self.num_feats == 1:
+            repeats = [1, 1, self.repeat_bins]
+        else:
+            repeats = [1, 1, 1, self.repeat_bins]
+        return super().norm_spec(x.unsqueeze(axis=-1).tile(repeat_times=repeats))
+    def denorm_spec(self, x):
+        """
+        :param x: [B, T, R] or [B, F, T, R]
+        :return [B, T] or [B, F, T]
+        """
+        return super().denorm_spec(x).mean(axis=-1)
+class PitchRectifiedFlow(RepetitiveRectifiedFlow):
+    def __init__(
+        self,
+        vmin: float,
+        vmax: float,
+        cmin: float,
+        cmax: float,
+        repeat_bins,
+        time_scale_factor=1000,
+        backbone_type=None,
+        backbone_args=None,
+    ):
+        self.vmin = vmin
+        self.vmax = vmax
+        self.cmin = cmin
+        self.cmax = cmax
+        super().__init__(
+            vmin=vmin,
+            vmax=vmax,
+            repeat_bins=repeat_bins,
+            time_scale_factor=time_scale_factor,
+            backbone_type=backbone_type,
+            backbone_args=backbone_args,
+        )
+    def norm_spec(self, x):
+        return super().norm_spec(x.clip(min=self.cmin, max=self.cmax))
+    def denorm_spec(self, x):
+        return super().denorm_spec(x).clip(min=self.cmin, max=self.cmax)
+class MultiVarianceRectifiedFlow(RepetitiveRectifiedFlow):
+    def __init__(
+        self,
+        ranges: List[Tuple[float, float]],
+        clamps: List[Tuple[float | None, float | None] | None],
+        repeat_bins,
+        time_scale_factor=1000,
+        backbone_type=None,
+        backbone_args=None,
+    ):
+        assert len(ranges) == len(clamps)
+        self.clamps = clamps
+        vmin = [r[0] for r in ranges]
+        vmax = [r[1] for r in ranges]
+        if len(vmin) == 1:
+            vmin = vmin[0]
+        if len(vmax) == 1:
+            vmax = vmax[0]
+        super().__init__(
+            vmin=vmin,
+            vmax=vmax,
+            repeat_bins=repeat_bins,
+            time_scale_factor=time_scale_factor,
+            backbone_type=backbone_type,
+            backbone_args=backbone_args,
+        )
+    def clamp_spec(self, xs: (list | tuple)):
+        clamped = []
+        for x, c in zip(xs, self.clamps):
+            if c is None:
+                clamped.append(x)
+                continue
+            clamped.append(x.clip(min=c[0], max=c[1]))
+        return clamped
+    def norm_spec(self, xs: (list | tuple)):
+        """
+        :param xs: sequence of [B, T]
+        :return: [B, F, T] => super().norm_spec(xs) => [B, F, T, R]
+        """
+        assert len(xs) == self.num_feats
+        clamped = self.clamp_spec(xs)
+        xs = paddle.stack(x=clamped, axis=1)
+        if self.num_feats == 1:
+            xs = xs.squeeze(axis=1)
+        return super().norm_spec(xs)
+    def denorm_spec(self, xs):
+        """
+        :param xs: [B, T, R] or [B, F, T, R] => super().denorm_spec(xs) => [B, T] or [B, F, T]
+        :return: sequence of [B, T]
+        """
+        xs = super().denorm_spec(xs)
+        if self.num_feats == 1:
+            xs = [xs]
+        else:
+            xs = xs.unbind(axis=1)
+        assert len(xs) == self.num_feats
+        return self.clamp_spec(xs)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/acoustic_encoder.py ADDED Viewed

	@@ -0,0 +1,110 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    NormalInitEmbedding as Embedding,
+)
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    XavierUniformInitLinear as Linear,
+)
+from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
+    FastSpeech2Encoder,
+    mel2ph_to_dur,
+)
+from paddlemix.models.diffsinger.utils.hparams import hparams
+from paddlemix.models.diffsinger.utils.text_encoder import PAD_INDEX
+class FastSpeech2Acoustic(paddle.nn.Layer):
+    def __init__(self, vocab_size):
+        super().__init__()
+        self.txt_embed = Embedding(vocab_size, hparams["hidden_size"], PAD_INDEX)
+        self.dur_embed = Linear(1, hparams["hidden_size"])
+        self.encoder = FastSpeech2Encoder(
+            hidden_size=hparams["hidden_size"],
+            num_layers=hparams["enc_layers"],
+            ffn_kernel_size=hparams["enc_ffn_kernel_size"],
+            ffn_act=hparams["ffn_act"],
+            dropout=hparams["dropout"],
+            num_heads=hparams["num_heads"],
+            use_pos_embed=hparams["use_pos_embed"],
+            rel_pos=hparams["rel_pos"],
+        )
+        self.pitch_embed = Linear(1, hparams["hidden_size"])
+        self.variance_embed_list = []
+        self.use_energy_embed = hparams.get("use_energy_embed", False)
+        self.use_breathiness_embed = hparams.get("use_breathiness_embed", False)
+        self.use_voicing_embed = hparams.get("use_voicing_embed", False)
+        self.use_tension_embed = hparams.get("use_tension_embed", False)
+        if self.use_energy_embed:
+            self.variance_embed_list.append("energy")
+        if self.use_breathiness_embed:
+            self.variance_embed_list.append("breathiness")
+        if self.use_voicing_embed:
+            self.variance_embed_list.append("voicing")
+        if self.use_tension_embed:
+            self.variance_embed_list.append("tension")
+        self.use_variance_embeds = len(self.variance_embed_list) > 0
+        if self.use_variance_embeds:
+            self.variance_embeds = paddle.nn.LayerDict(
+                sublayers={v_name: Linear(1, hparams["hidden_size"]) for v_name in self.variance_embed_list}
+            )
+        self.use_key_shift_embed = hparams.get("use_key_shift_embed", False)
+        if self.use_key_shift_embed:
+            self.key_shift_embed = Linear(1, hparams["hidden_size"])
+        self.use_speed_embed = hparams.get("use_speed_embed", False)
+        if self.use_speed_embed:
+            self.speed_embed = Linear(1, hparams["hidden_size"])
+        self.use_spk_id = hparams["use_spk_id"]
+        if self.use_spk_id:
+            self.spk_embed = Embedding(hparams["num_spk"], hparams["hidden_size"])
+    def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances):
+        if self.use_variance_embeds:
+            variance_embeds = paddle.stack(
+                x=[self.variance_embeds[v_name](variances[v_name][:, :, None]) for v_name in self.variance_embed_list],
+                axis=-1,
+            ).sum(axis=-1)
+            condition += variance_embeds
+        if self.use_key_shift_embed:
+            key_shift_embed = self.key_shift_embed(key_shift[:, :, None])
+            condition += key_shift_embed
+        if self.use_speed_embed:
+            speed_embed = self.speed_embed(speed[:, :, None])
+            condition += speed_embed
+        return condition
+    def forward(self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, spk_embed_id=None, **kwargs):
+        txt_embed = self.txt_embed(txt_tokens)
+        # dur = mel2ph_to_dur(mel2ph, tuple(txt_tokens.shape)[1]).float()
+        dur = paddle.cast(mel2ph_to_dur(mel2ph, tuple(txt_tokens.shape)[1]), dtype="float32")
+        dur_embed = self.dur_embed(dur[:, :, None])
+        encoder_out = self.encoder(txt_embed, dur_embed, txt_tokens == 0)
+        encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
+        mel2ph_ = mel2ph[..., None].tile(repeat_times=[1, 1, tuple(encoder_out.shape)[-1]])
+        condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=mel2ph_, broadcast=False)
+        if self.use_spk_id:
+            spk_mix_embed = kwargs.get("spk_mix_embed")
+            if spk_mix_embed is not None:
+                spk_embed = spk_mix_embed
+            else:
+                spk_embed = self.spk_embed(spk_embed_id)[:, None, :]
+            condition += spk_embed
+        f0_mel = (1 + f0 / 700).log()
+        pitch_embed = self.pitch_embed(f0_mel[:, :, None])
+        condition += pitch_embed
+        condition = self.forward_variance_embedding(condition, key_shift=key_shift, speed=speed, **kwargs)
+        return condition

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/param_adaptor.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import sys
+import paddle
+import paddlemix.models.diffsinger.modules.compat as compat
+from paddlemix.models.diffsinger.modules.core.ddpm import MultiVarianceDiffusion
+from paddlemix.models.diffsinger.utils import filter_kwargs
+from paddlemix.models.diffsinger.utils.hparams import hparams
+# Names of the variance parameters; they match the names ParameterAdaptorModule
+# appends to its prediction list, in the same order.
+VARIANCE_CHECKLIST = ["energy", "breathiness", "voicing", "tension"]
+class ParameterAdaptorModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.variance_prediction_list = []
+        self.predict_energy = hparams.get("predict_energy", False)
+        self.predict_breathiness = hparams.get("predict_breathiness", False)
+        self.predict_voicing = hparams.get("predict_voicing", False)
+        self.predict_tension = hparams.get("predict_tension", False)
+        if self.predict_energy:
+            self.variance_prediction_list.append("energy")
+        if self.predict_breathiness:
+            self.variance_prediction_list.append("breathiness")
+        if self.predict_voicing:
+            self.variance_prediction_list.append("voicing")
+        if self.predict_tension:
+            self.variance_prediction_list.append("tension")
+        self.predict_variances = len(self.variance_prediction_list) > 0
+    def build_adaptor(self, cls=MultiVarianceDiffusion):
+        ranges = []
+        clamps = []
+        if self.predict_energy:
+            ranges.append((hparams["energy_db_min"], hparams["energy_db_max"]))
+            clamps.append((hparams["energy_db_min"], 0.0))
+        if self.predict_breathiness:
+            ranges.append((hparams["breathiness_db_min"], hparams["breathiness_db_max"]))
+            clamps.append((hparams["breathiness_db_min"], 0.0))
+        if self.predict_voicing:
+            ranges.append((hparams["voicing_db_min"], hparams["voicing_db_max"]))
+            clamps.append((hparams["voicing_db_min"], 0.0))
+        if self.predict_tension:
+            ranges.append((hparams["tension_logit_min"], hparams["tension_logit_max"]))
+            clamps.append((hparams["tension_logit_min"], hparams["tension_logit_max"]))
+        variances_hparams = hparams["variances_prediction_args"]
+        total_repeat_bins = variances_hparams["total_repeat_bins"]
+        assert (
+            total_repeat_bins % len(self.variance_prediction_list) == 0
+        ), f"Total number of repeat bins must be divisible by number of variance parameters ({len(self.variance_prediction_list)})."
+        repeat_bins = total_repeat_bins // len(self.variance_prediction_list)
+        backbone_type = compat.get_backbone_type(hparams, nested_config=variances_hparams)
+        backbone_args = compat.get_backbone_args(variances_hparams, backbone_type=backbone_type)
+        kwargs = filter_kwargs(
+            {
+                "ranges": ranges,
+                "clamps": clamps,
+                "repeat_bins": repeat_bins,
+                "timesteps": hparams.get("timesteps"),
+                "time_scale_factor": hparams.get("time_scale_factor"),
+                "backbone_type": backbone_type,
+                "backbone_args": backbone_args,
+            },
+            cls,
+        )
+        return cls(**kwargs)
+    def collect_variance_inputs(self, **kwargs) -> list:
+        return [kwargs.get(name) for name in self.variance_prediction_list]
+    def collect_variance_outputs(self, variances: (list | tuple)) -> dict:
+        return {name: pred for name, pred in zip(self.variance_prediction_list, variances)}

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/tts_modules.py ADDED Viewed

	@@ -0,0 +1,473 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import sys
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    EncSALayer,
+    SinusoidalPositionalEmbedding,
+)
+from paddlemix.models.diffsinger.modules.commons.espnet_positional_embedding import (
+    RelPositionalEncoding,
+)
+# Default position-table sizes. DEFAULT_MAX_TARGET_POSITIONS is passed as init_size to
+# SinusoidalPositionalEmbedding in FastSpeech2Encoder below; DEFAULT_MAX_SOURCE_POSITIONS
+# is not referenced in the visible part of this module.
+DEFAULT_MAX_SOURCE_POSITIONS = 2000
+DEFAULT_MAX_TARGET_POSITIONS = 2000
+class TransformerEncoderLayer(paddle.nn.Layer):
+    def __init__(self, hidden_size, dropout, kernel_size=None, act="gelu", num_heads=2):
+        super().__init__()
+        self.op = EncSALayer(
+            hidden_size,
+            num_heads,
+            dropout=dropout,
+            attention_dropout=0.0,
+            relu_dropout=dropout,
+            kernel_size=kernel_size,
+            act=act,
+        )
+    def forward(self, x, **kwargs):
+        return self.op(x, **kwargs)
+class LayerNorm(paddle.nn.LayerNorm):
+    """Layer normalization module.
+    :param int nout: output dim size
+    :param int dim: dimension to be normalized
+    """
+    def __init__(self, nout, dim=-1):
+        """Construct an LayerNorm object."""
+        super(LayerNorm, self).__init__(nout, eps=1e-12)
+        self.dim = dim
+    def forward(self, x):
+        """Apply layer normalization.
+        :param torch.Tensor x: input tensor
+        :return: layer normalized tensor
+        :rtype torch.Tensor
+        """
+        if self.dim == -1:
+            return super(LayerNorm, self).forward(x)
+        return (
+            super(LayerNorm, self)
+            .forward(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, -1)))
+            .transpose(
+                perm=paddle_aux.transpose_aux_func(
+                    super(LayerNorm, self)
+                    .forward(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, -1)))
+                    .ndim,
+                    1,
+                    -1,
+                )
+            )
+        )
+class DurationPredictor(paddle.nn.Layer):
+    """Duration predictor module.
+    This is a module of duration predictor described in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
+    The duration predictor predicts a duration of each frame in log domain from the hidden embeddings of encoder.
+    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
+        https://arxiv.org/pdf/1905.09263.pdf
+    Note:
+        The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`,
+        the outputs are calculated in log domain but in `inference`, those are calculated in linear domain.
+    """
+    def __init__(
+        self, in_dims, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, dur_loss_type="mse"
+    ):
+        """Initialize duration predictor module.
+        Args:
+            in_dims (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+            offset (float, optional): Offset value to avoid nan in log domain.
+        """
+        super(DurationPredictor, self).__init__()
+        self.offset = offset
+        self.conv = paddle.nn.LayerList()
+        self.kernel_size = kernel_size
+        for idx in range(n_layers):
+            in_chans = in_dims if idx == 0 else n_chans
+            self.conv.append(
+                paddle.nn.Sequential(
+                    paddle.nn.Identity(),
+                    paddle.nn.Conv1D(
+                        in_channels=in_chans,
+                        out_channels=n_chans,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        padding=kernel_size // 2,
+                    ),
+                    paddle.nn.ReLU(),
+                    LayerNorm(n_chans, dim=1),
+                    paddle.nn.Dropout(p=dropout_rate),
+                )
+            )
+        self.loss_type = dur_loss_type
+        if self.loss_type in ["mse", "huber"]:
+            self.out_dims = 1
+        else:
+            raise NotImplementedError()
+        self.linear = paddle.nn.Linear(in_features=n_chans, out_features=self.out_dims)
+    def out2dur(self, xs):
+        if self.loss_type in ["mse", "huber"]:
+            dur = xs.squeeze(axis=-1).exp() - self.offset
+        else:
+            raise NotImplementedError()
+        return dur
+    def forward(self, xs, x_masks=None, infer=True):
+        """Calculate forward propagation.
+        Args:
+            xs (Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks (BoolTensor, optional): Batch of masks indicating padded part (B, Tmax).
+            infer (bool): Whether inference
+        Returns:
+            (train) FloatTensor, (infer) LongTensor: Batch of predicted durations in linear domain (B, Tmax).
+        """
+        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
+        masks = 1 - x_masks.astype(dtype="float32")
+        masks_ = masks[:, None, :]
+        for f in self.conv:
+            xs = f(xs)
+            if x_masks is not None:
+                xs = xs * masks_
+        xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1)))
+        xs = xs * masks[:, :, None]
+        dur_pred = self.out2dur(xs)
+        if infer:
+            dur_pred = dur_pred.clip(min=0.0)
+        return dur_pred
+class VariancePredictor(paddle.nn.Layer):
+    def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, dropout_rate=0.1):
+        """Initialize variance predictor module.
+        Args:
+            in_dims (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+        """
+        super(VariancePredictor, self).__init__()
+        self.vmin = vmin
+        self.vmax = vmax
+        self.conv = paddle.nn.LayerList()
+        self.kernel_size = kernel_size
+        for idx in range(n_layers):
+            in_chans = in_dims if idx == 0 else n_chans
+            self.conv.append(
+                paddle.nn.Sequential(
+                    paddle.nn.Conv1D(
+                        in_channels=in_chans,
+                        out_channels=n_chans,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        padding=kernel_size // 2,
+                    ),
+                    paddle.nn.ReLU(),
+                    LayerNorm(n_chans, dim=1),
+                    paddle.nn.Dropout(p=dropout_rate),
+                )
+            )
+        self.linear = paddle.nn.Linear(in_features=n_chans, out_features=1)
+        self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096)
+        self.pos_embed_alpha = paddle.base.framework.EagerParamBase.from_tensor(
+            tensor=paddle.to_tensor(data=[1], dtype="float32")
+        )
+    def out2value(self, xs):
+        return (xs + 1) / 2 * (self.vmax - self.vmin) + self.vmin
+    def forward(self, xs, infer=True):
+        """
+        :param xs: [B, T, H]
+        :param infer: whether inference
+        :return: [B, T]
+        """
+        positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
+        xs = xs + positions
+        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
+        for f in self.conv:
+            xs = f(xs)
+        xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))).squeeze(axis=-1)
+        if infer:
+            xs = self.out2value(xs)
+        return xs
+class PitchPredictor(paddle.nn.Layer):
+    def __init__(
+        self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, dropout_rate=0.1
+    ):
+        """Initialize pitch predictor module.
+        Args:
+            in_dims (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+        """
+        super(PitchPredictor, self).__init__()
+        self.vmin = vmin
+        self.vmax = vmax
+        self.interval = (vmax - vmin) / (num_bins - 1)
+        self.sigma = deviation / self.interval
+        self.register_buffer(name="x", tensor=paddle.arange(end=num_bins).astype(dtype="float32").reshape(1, 1, -1))
+        self.base_pitch_embed = paddle.nn.Linear(in_features=1, out_features=in_dims)
+        self.conv = paddle.nn.LayerList()
+        self.kernel_size = kernel_size
+        for idx in range(n_layers):
+            in_chans = in_dims if idx == 0 else n_chans
+            self.conv.append(
+                paddle.nn.Sequential(
+                    paddle.nn.Conv1D(
+                        in_channels=in_chans,
+                        out_channels=n_chans,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        padding=kernel_size // 2,
+                    ),
+                    paddle.nn.ReLU(),
+                    LayerNorm(n_chans, dim=1),
+                    paddle.nn.Dropout(p=dropout_rate),
+                )
+            )
+        self.linear = paddle.nn.Linear(in_features=n_chans, out_features=num_bins)
+        self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096)
+        self.pos_embed_alpha = paddle.base.framework.EagerParamBase.from_tensor(
+            tensor=paddle.to_tensor(data=[1], dtype="float32")
+        )
+    def bins_to_values(self, bins):
+        return bins * self.interval + self.vmin
+    def out2pitch(self, probs):
+        logits = probs.sigmoid()
+        bins = paddle.sum(x=self.x * logits, axis=2) / paddle.sum(x=logits, axis=2)
+        pitch = self.bins_to_values(bins)
+        return pitch
+    def forward(self, xs, base):
+        """
+        :param xs: [B, T, H]
+        :param base: [B, T]
+        :return: [B, T, N]
+        """
+        xs = xs + self.base_pitch_embed(base[..., None])
+        positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
+        xs = xs + positions
+        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
+        for f in self.conv:
+            xs = f(xs)
+        xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1)))
+        return self.out2pitch(xs) + base, xs
+class RhythmRegulator(paddle.nn.Layer):
+    def __init__(self, eps=1e-05):
+        super().__init__()
+        self.eps = eps
+    def forward(self, ph_dur, ph2word, word_dur):
+        """
+        Example (no batch dim version):
+            1. ph_dur = [4,2,3,2]
+            2. word_dur = [3,4,2], ph2word = [1,2,2,3]
+            3. word_dur_in = [4,5,2]
+            4. alpha_w = [0.75,0.8,1], alpha_ph = [0.75,0.8,0.8,1]
+            5. ph_dur_out = [3,1.6,2.4,2]
+        :param ph_dur: [B, T_ph]
+        :param ph2word: [B, T_ph]
+        :param word_dur: [B, T_w]
+        """
+        ph_dur = ph_dur.astype(dtype="float32") * (ph2word > 0)
+        word_dur = word_dur.astype(dtype="float32")
+        word_dur_in = paddle.zeros(
+            shape=[tuple(ph_dur.shape)[0], ph2word.max() + 1], dtype=ph_dur.dtype
+        ).put_along_axis(axis=1, indices=ph2word, values=ph_dur, reduce="add")[:, 1:]
+        alpha_w = word_dur / word_dur_in.clip(min=self.eps)
+        alpha_ph = paddle.take_along_axis(
+            arr=paddle.nn.functional.pad(x=alpha_w, pad=[1, 0], pad_from_left_axis=False),
+            axis=1,
+            indices=ph2word,
+            broadcast=False,
+        )
+        ph_dur_out = ph_dur * alpha_ph
+        return ph_dur_out.round().astype(dtype="int64")
+class LengthRegulator(paddle.nn.Layer):
+    """Expand per-token durations into a per-frame token index sequence (mel2ph)."""
+    def forward(self, dur, dur_padding=None, alpha=None):
+        """
+        Example (no batch dim version):
+            1. dur = [2,2,3]
+            2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4]
+            3. token_mask = [[1,1,0,0,0,0,0],
+                             [0,0,1,1,0,0,0],
+                             [0,0,0,0,1,1,1]]
+            4. token_idx * token_mask = [[1,1,0,0,0,0,0],
+                                         [0,0,2,2,0,0,0],
+                                         [0,0,0,0,3,3,3]]
+            5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3]
+        :param dur: Batch of durations of each frame (B, T_txt)
+        :param dur_padding: Batch of padding of each frame (B, T_txt)
+        :param alpha: duration rescale coefficient; must be > 0 when given
+        :return:
+            mel2ph (B, T_speech)
+        """
+        assert alpha is None or alpha > 0
+        if alpha is not None:
+            # Rescale in float, round, and go back to integer durations.
+            dur = paddle.round(dur.astype(dtype="float32") * alpha).astype(dtype="int64")
+        if dur_padding is not None:
+            # Zero out the durations of padded positions.
+            dur = dur * (1 - dur_padding.astype(dtype="int64"))
+        # 1-based token indices, shaped (1, T_txt, 1).
+        token_idx = paddle.arange(start=1, end=tuple(dur.shape)[1] + 1)[None, :, None].to(dur.place)
+        dur_cumsum = paddle.cumsum(x=dur, axis=1)
+        # Shift the cumulative sum right by one column (first column zero). This is done
+        # with a concat instead of a pad of [1, -1] (negative right padding).
+        dur_cumsum_prev = paddle.concat([paddle.zeros_like(dur_cumsum[:, :1]), dur_cumsum[:, :-1]], axis=1)
+        # Frame positions 0..max_total_duration-1, shaped (1, 1, T_speech).
+        pos_idx = paddle.arange(end=dur.sum(axis=-1).max())[None, None].to(dur.place)
+        # A frame belongs to a token when it lies in [cumsum_prev, cumsum).
+        token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None])
+        mel2ph = (token_idx * token_mask.astype(dtype="int64")).sum(axis=1)
+        return mel2ph
+class StretchRegulator(paddle.nn.Layer):
+    def forward(self, mel2ph, dur=None):
+        """
+        Example (no batch dim version):
+            1. dur = [2,4,3]
+            2. mel2ph = [1,1,2,2,2,2,3,3,3]
+            3. mel2dur = [2,2,4,4,4,4,3,3,3]
+            4. bound_mask = [0,1,0,0,0,1,0,0,1]
+            5. 1 - bound_mask * mel2dur = [1,-1,1,1,1,-3,1,1,-2] => pad => [0,1,-1,1,1,1,-3,1,1]
+            6. stretch_denorm = [0,1,0,1,2,3,0,1,2]
+        :param dur: Batch of durations of each frame (B, T_txt)
+        :param mel2ph: Batch of mel2ph (B, T_speech)
+        :return:
+            stretch (B, T_speech)
+        """
+        if dur is None:
+            dur = mel2ph_to_dur(mel2ph, mel2ph.max())
+        dur = paddle.nn.functional.pad(x=dur, pad=[1, 0], value=1, pad_from_left_axis=False)
+        mel2dur = paddle.take_along_axis(arr=dur, axis=1, indices=mel2ph, broadcast=False)
+        bound_mask = paddle.greater_than(x=mel2ph[:, 1:], y=paddle.to_tensor(mel2ph[:, :-1]))
+        bound_mask = paddle.nn.functional.pad(
+            x=bound_mask, pad=[0, 1], mode="constant", value=True, pad_from_left_axis=False
+        )
+        stretch_delta = 1 - bound_mask * mel2dur
+        stretch_delta = paddle.nn.functional.pad(
+            x=stretch_delta, pad=[1, -1], mode="constant", value=0, pad_from_left_axis=False
+        )
+        stretch_denorm = paddle.cumsum(x=stretch_delta, axis=1)
+        stretch = stretch_denorm / mel2dur
+        return stretch * (mel2ph > 0)
+def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
+    B, _ = tuple(mel2ph.shape)
+    dur = paddle.zeros(shape=[B, T_txt + 1], dtype=mel2ph.dtype).put_along_axis(
+        axis=1, indices=mel2ph, values=paddle.ones_like(x=mel2ph), reduce="add"
+    )
+    dur = dur[:, 1:]
+    if max_dur is not None:
+        dur = dur.clip(max=max_dur)
+    return dur
+class FastSpeech2Encoder(paddle.nn.Layer):
+    def __init__(
+        self,
+        hidden_size,
+        num_layers,
+        ffn_kernel_size=9,
+        ffn_act="gelu",
+        dropout=None,
+        num_heads=2,
+        use_pos_embed=True,
+        rel_pos=True,
+    ):
+        super().__init__()
+        self.num_layers = num_layers
+        embed_dim = self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.use_pos_embed = use_pos_embed
+        self.layers = paddle.nn.LayerList(
+            sublayers=[
+                TransformerEncoderLayer(
+                    self.hidden_size, self.dropout, kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads
+                )
+                for _ in range(self.num_layers)
+            ]
+        )
+        self.layer_norm = paddle.nn.LayerNorm(normalized_shape=embed_dim)
+        self.embed_scale = math.sqrt(hidden_size)
+        self.padding_idx = 0
+        self.rel_pos = rel_pos
+        if self.rel_pos:
+            self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0)
+        else:
+            self.embed_positions = SinusoidalPositionalEmbedding(
+                hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS
+            )
+    def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None):
+        x = self.embed_scale * main_embed
+        if extra_embed is not None:
+            x = x + extra_embed
+        if self.use_pos_embed:
+            if self.rel_pos:
+                x = self.embed_positions(x)
+            else:
+                positions = self.embed_positions(~padding_mask)
+                x = x + positions
+        x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+        return x
+    def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_hiddens=False):
+        x = self.forward_embedding(main_embed, extra_embed, padding_mask=padding_mask)
+        nonpadding_mask_TB = (
+            1
+            - padding_mask.transpose(perm=paddle_aux.transpose_aux_func(padding_mask.ndim, 0, 1)).astype(
+                dtype="float32"
+            )[:, :, None]
+        )
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 0, 1)) * nonpadding_mask_TB
+        hiddens = []
+        for layer in self.layers:
+            x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
+            hiddens.append(x)
+        x = self.layer_norm(x) * nonpadding_mask_TB
+        if return_hiddens:
+            x = paddle.stack(x=hiddens, axis=0)
+            x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        else:
+            x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 0, 1))
+        return x

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/variance_encoder.py ADDED Viewed

	@@ -0,0 +1,151 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    NormalInitEmbedding as Embedding,
+)
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    XavierUniformInitLinear as Linear,
+)
+from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
+    DurationPredictor,
+    FastSpeech2Encoder,
+)
+from paddlemix.models.diffsinger.utils.hparams import hparams
+from paddlemix.models.diffsinger.utils.text_encoder import PAD_INDEX
+class FastSpeech2Variance(paddle.nn.Layer):
+    def __init__(self, vocab_size):
+        super().__init__()
+        self.predict_dur = hparams["predict_dur"]
+        self.linguistic_mode = "word" if hparams["predict_dur"] else "phoneme"
+        self.txt_embed = Embedding(vocab_size, hparams["hidden_size"], PAD_INDEX)
+        if self.predict_dur:
+            self.onset_embed = Embedding(2, hparams["hidden_size"])
+            self.word_dur_embed = Linear(1, hparams["hidden_size"])
+        else:
+            self.ph_dur_embed = Linear(1, hparams["hidden_size"])
+        self.encoder = FastSpeech2Encoder(
+            hidden_size=hparams["hidden_size"],
+            num_layers=hparams["enc_layers"],
+            ffn_kernel_size=hparams["enc_ffn_kernel_size"],
+            ffn_act=hparams["ffn_act"],
+            dropout=hparams["dropout"],
+            num_heads=hparams["num_heads"],
+            use_pos_embed=hparams["use_pos_embed"],
+            rel_pos=hparams["rel_pos"],
+        )
+        dur_hparams = hparams["dur_prediction_args"]
+        if self.predict_dur:
+            self.midi_embed = Embedding(128, hparams["hidden_size"])
+            self.dur_predictor = DurationPredictor(
+                in_dims=hparams["hidden_size"],
+                n_chans=dur_hparams["hidden_size"],
+                n_layers=dur_hparams["num_layers"],
+                dropout_rate=dur_hparams["dropout"],
+                kernel_size=dur_hparams["kernel_size"],
+                offset=dur_hparams["log_offset"],
+                dur_loss_type=dur_hparams["loss_type"],
+            )
+    def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_embed=None, infer=True):
+        """
+        :param txt_tokens: (train, infer) [B, T_ph]
+        :param midi: (train, infer) [B, T_ph]
+        :param ph2word: (train, infer) [B, T_ph]
+        :param ph_dur: (train, [infer]) [B, T_ph]
+        :param word_dur: (infer) [B, T_w]
+        :param spk_embed: (train) [B, T_ph, H]
+        :param infer: whether inference
+        :return: encoder_out, ph_dur_pred
+        """
+        txt_embed = self.txt_embed(txt_tokens)
+        if self.linguistic_mode == "word":
+            b = tuple(txt_tokens.shape)[0]
+            onset = paddle.diff(x=ph2word, axis=1, prepend=paddle.zeros(shape=[b, 1], dtype=ph2word.dtype)) > 0
+            onset_embed = self.onset_embed(onset.astype(dtype="int64"))
+            if word_dur is None or not infer:
+                word_dur = paddle.zeros(shape=[b, ph2word.max() + 1], dtype=ph_dur.dtype).put_along_axis(
+                    axis=1, indices=ph2word, values=ph_dur, reduce="add"
+                )[:, 1:]
+            word_dur = paddle.take_along_axis(
+                arr=paddle.nn.functional.pad(x=word_dur, pad=[1, 0], value=0, pad_from_left_axis=False),
+                axis=1,
+                indices=ph2word,
+                broadcast=False,
+            )
+            word_dur_embed = self.word_dur_embed(word_dur.astype(dtype="float32")[:, :, None])
+            encoder_out = self.encoder(txt_embed, onset_embed + word_dur_embed, txt_tokens == 0)
+        else:
+            ph_dur_embed = self.ph_dur_embed(ph_dur.astype(dtype="float32")[:, :, None])
+            encoder_out = self.encoder(txt_embed, ph_dur_embed, txt_tokens == 0)
+        if self.predict_dur:
+            midi_embed = self.midi_embed(midi)
+            dur_cond = encoder_out + midi_embed
+            if spk_embed is not None:
+                dur_cond += spk_embed
+            ph_dur_pred = self.dur_predictor(dur_cond, x_masks=txt_tokens == PAD_INDEX, infer=infer)
+            return encoder_out, ph_dur_pred
+        else:
+            return encoder_out, None
+class MelodyEncoder(paddle.nn.Layer):
+    """Encode a note sequence (pitch, rest flag, duration, optional glide type) into hidden features.
+
+    Encoder settings are read from ``enc_hparams`` and fall back to the global
+    ``hparams`` for any key that is absent there; the glide settings and the
+    output width are always taken from the global ``hparams``.
+    """
+    def __init__(self, enc_hparams: dict):
+        super().__init__()
+        # Per-encoder override first, global hparams when the key is missing.
+        def get_hparam(key):
+            return enc_hparams.get(key, hparams.get(key))
+        hidden_size = get_hparam("hidden_size")
+        # Scalar pitch and scalar duration are each lifted to hidden_size by a linear layer.
+        self.note_midi_embed = Linear(1, hidden_size)
+        self.note_dur_embed = Linear(1, hidden_size)
+        self.use_glide_embed = hparams["use_glide_embed"]
+        self.glide_embed_scale = hparams["glide_embed_scale"]
+        if self.use_glide_embed:
+            # One row per glide type plus one extra row; index 0 is the padding index.
+            self.note_glide_embed = Embedding(len(hparams["glide_types"]) + 1, hidden_size, padding_idx=0)
+        self.encoder = FastSpeech2Encoder(
+            hidden_size=hidden_size,
+            num_layers=get_hparam("enc_layers"),
+            ffn_kernel_size=get_hparam("enc_ffn_kernel_size"),
+            ffn_act=get_hparam("ffn_act"),
+            dropout=get_hparam("dropout"),
+            num_heads=get_hparam("num_heads"),
+            use_pos_embed=get_hparam("use_pos_embed"),
+            rel_pos=get_hparam("rel_pos"),
+        )
+        # Project from this encoder's width to the global model width.
+        self.out_proj = Linear(hidden_size, hparams["hidden_size"])
+    def forward(self, note_midi, note_rest, note_dur, glide=None):
+        """
+        :param note_midi: float32 [B, T_n], -1: padding
+        :param note_rest: bool [B, T_n]
+        :param note_dur: int64 [B, T_n]
+        :param glide: int64 [B, T_n]; it is passed to the glide embedding whenever
+            ``use_glide_embed`` is enabled, so it must be given in that case
+        :return: [B, T_n, H]
+        """
+        # The pitch embedding is multiplied by the inverted rest flag, zeroing it on rest notes.
+        midi_embed = self.note_midi_embed(note_midi[:, :, None]) * ~note_rest[:, :, None]
+        dur_embed = self.note_dur_embed(note_dur.astype(dtype="float32")[:, :, None])
+        # Stays the scalar 0 when glide embedding is disabled.
+        ornament_embed = 0
+        if self.use_glide_embed:
+            ornament_embed += self.note_glide_embed(glide) * self.glide_embed_scale
+        # Positions with a negative pitch value are treated as padding by the encoder.
+        encoder_out = self.encoder(midi_embed, dur_embed + ornament_embed, padding_mask=note_midi < 0)
+        encoder_out = self.out_proj(encoder_out)
+        return encoder_out

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/__init__.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+import paddle
+import yaml
+from .nets import CascadedNet
+class DotDict(dict):
+    def __getattr__(*args):
+        val = dict.get(*args)
+        return DotDict(val) if type(val) is dict else val
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+def load_sep_model(model_path, device="cpu"):
+    model_path = pathlib.Path(model_path)
+    config_file = model_path.with_name("config.yaml")
+    with open(config_file, "r") as config:
+        args = yaml.safe_load(config)
+    args = DotDict(args)
+    model = CascadedNet(args.n_fft, args.hop_length, args.n_out, args.n_out_lstm, True, is_mono=args.is_mono)
+    model.to(device)
+    model.set_state_dict(state_dict=paddle.load(path=str(model_path)))
+    model.eval()
+    return model

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/layers.py ADDED Viewed

	@@ -0,0 +1,140 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle
+import paddle_aux
+def crop_center(h1, h2):
+    h1_shape = tuple(h1.shape)
+    h2_shape = tuple(h2.shape)
+    if h1_shape[3] == h2_shape[3]:
+        return h1
+    elif h1_shape[3] < h2_shape[3]:
+        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
+    s_time = (h1_shape[3] - h2_shape[3]) // 2
+    e_time = s_time + h2_shape[3]
+    h1 = h1[:, :, :, s_time:e_time]
+    return h1
+class Conv2DBNActiv(paddle.nn.Layer):
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=paddle.nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        self.conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(
+                in_channels=nin,
+                out_channels=nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias_attr=False,
+            ),
+            paddle.nn.BatchNorm2D(num_features=nout),
+            activ(),
+        )
+    def forward(self, x):
+        return self.conv(x)
+class Encoder(paddle.nn.Layer):
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=paddle.nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+    def forward(self, x):
+        h = self.conv1(x)
+        h = self.conv2(h)
+        return h
+class Decoder(paddle.nn.Layer):
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=paddle.nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = paddle.nn.Dropout2D(p=0.1) if dropout else None
+    def forward(self, x, skip=None):
+        x = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="bilinear", align_corners=True)
+        if skip is not None:
+            skip = crop_center(skip, x)
+            x = paddle.concat(x=[x, skip], axis=1)
+        h = self.conv1(x)
+        if self.dropout is not None:
+            h = self.dropout(h)
+        return h
+class Mean(paddle.nn.Layer):
+    def __init__(self, dim, keepdims=False):
+        super(Mean, self).__init__()
+        self.dim = dim
+        self.keepdims = keepdims
+    def forward(self, x):
+        return x.mean(self.dim, keepdims=self.keepdims)
+class ASPPModule(paddle.nn.Layer):
+    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=paddle.nn.ReLU, dropout=False):
+        super(ASPPModule, self).__init__()
+        self.conv1 = paddle.nn.Sequential(Mean(dim=-2, keepdims=True), Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ))
+        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
+        self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
+        self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
+        self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
+        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
+        self.dropout = paddle.nn.Dropout2D(p=0.1) if dropout else None
+    def forward(self, x):
+        _, _, h, w = tuple(x.shape)
+        feat1 = self.conv1(x).tile(repeat_times=[1, 1, h, 1])
+        feat2 = self.conv2(x)
+        feat3 = self.conv3(x)
+        feat4 = self.conv4(x)
+        feat5 = self.conv5(x)
+        out = paddle.concat(x=(feat1, feat2, feat3, feat4, feat5), axis=1)
+        out = self.bottleneck(out)
+        if self.dropout is not None:
+            out = self.dropout(out)
+        return out
+class LSTMModule(paddle.nn.Layer):
+    def __init__(self, nin_conv, nin_lstm, nout_lstm):
+        super(LSTMModule, self).__init__()
+        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
+        self.lstm = paddle.nn.LSTM(
+            input_size=nin_lstm, hidden_size=nout_lstm // 2, time_major=not False, direction="bidirect"
+        )
+        self.dense = paddle.nn.Sequential(
+            paddle.nn.Linear(in_features=nout_lstm, out_features=nin_lstm),
+            paddle.nn.BatchNorm1D(num_features=nin_lstm),
+            paddle.nn.ReLU(),
+        )
+    def forward(self, x):
+        N, _, nbins, nframes = tuple(x.shape)
+        h = self.conv(x)[:, 0]
+        h = h.transpose(perm=[2, 0, 1])
+        h, _ = self.lstm(h)
+        h = self.dense(h.reshape(-1, tuple(h.shape)[-1]))
+        h = h.reshape(nframes, N, 1, nbins)
+        h = h.transpose(perm=[1, 2, 3, 0])
+        return h

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/nets.py ADDED Viewed

	@@ -0,0 +1,185 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle
+from . import layers
+class BaseNet(paddle.nn.Layer):
+    def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
+        super(BaseNet, self).__init__()
+        self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1)
+        self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1)
+        self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1)
+        self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
+        self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
+        self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
+        self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm)
+        self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
+    def forward(self, x):
+        e1 = self.enc1(x)
+        e2 = self.enc2(e1)
+        e3 = self.enc3(e2)
+        e4 = self.enc4(e3)
+        e5 = self.enc5(e4)
+        h = self.aspp(e5)
+        h = self.dec4(h, e4)
+        h = self.dec3(h, e3)
+        h = self.dec2(h, e2)
+        h = paddle.concat(x=[h, self.lstm_dec2(h)], axis=1)
+        h = self.dec1(h, e1)
+        return h
+class CascadedNet(paddle.nn.Layer):
+    def __init__(self, n_fft, hop_length, nout=32, nout_lstm=128, is_complex=False, is_mono=False):
+        super(CascadedNet, self).__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.is_complex = is_complex
+        self.is_mono = is_mono
+        self.register_buffer(
+            name="window",
+            tensor=paddle.audio.functional.get_window("hann", n_fft).astype("float32"),
+            persistable=False,
+        )
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+        self.nin_lstm = self.max_bin // 2
+        self.offset = 64
+        nin = 4 if is_complex else 2
+        if is_mono:
+            nin = nin // 2
+        self.stg1_low_band_net = paddle.nn.Sequential(
+            BaseNet(nin, nout // 2, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0)
+        )
+        self.stg1_high_band_net = BaseNet(nin, nout // 4, self.nin_lstm // 2, nout_lstm // 2)
+        self.stg2_low_band_net = paddle.nn.Sequential(
+            BaseNet(nout // 4 + nin, nout, self.nin_lstm // 2, nout_lstm),
+            layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
+        )
+        self.stg2_high_band_net = BaseNet(nout // 4 + nin, nout // 2, self.nin_lstm // 2, nout_lstm // 2)
+        self.stg3_full_band_net = BaseNet(3 * nout // 4 + nin, nout, self.nin_lstm, nout_lstm)
+        self.out = paddle.nn.Conv2D(in_channels=nout, out_channels=nin, kernel_size=1, bias_attr=False)
+        self.aux_out = paddle.nn.Conv2D(in_channels=3 * nout // 4, out_channels=nin, kernel_size=1, bias_attr=False)
+    def forward(self, x):
+        if self.is_complex:
+            x = paddle.concat(x=[x.real(), x.imag()], axis=1)
+        x = x[:, :, : self.max_bin]
+        bandw = tuple(x.shape)[2] // 2
+        l1_in = x[:, :, :bandw]
+        h1_in = x[:, :, bandw:]
+        l1 = self.stg1_low_band_net(l1_in)
+        h1 = self.stg1_high_band_net(h1_in)
+        aux1 = paddle.concat(x=[l1, h1], axis=2)
+        l2_in = paddle.concat(x=[l1_in, l1], axis=1)
+        h2_in = paddle.concat(x=[h1_in, h1], axis=1)
+        l2 = self.stg2_low_band_net(l2_in)
+        h2 = self.stg2_high_band_net(h2_in)
+        aux2 = paddle.concat(x=[l2, h2], axis=2)
+        f3_in = paddle.concat(x=[x, aux1, aux2], axis=1)
+        f3 = self.stg3_full_band_net(f3_in)
+        if self.is_complex:
+            mask = self.out(f3)
+            if self.is_mono:
+                mask = paddle.complex(real=mask[:, :1], imag=mask[:, 1:])
+            else:
+                mask = paddle.complex(real=mask[:, :2], imag=mask[:, 2:])
+            mask = self.bounded_mask(mask)
+        else:
+            mask = paddle.nn.functional.sigmoid(x=self.out(f3))
+        mask = paddle.nn.functional.pad(
+            x=mask, pad=(0, 0, 0, self.output_bin - tuple(mask.shape)[2]), mode="replicate", pad_from_left_axis=False
+        )
+        return mask
+    def bounded_mask(self, mask, eps=1e-08):
+        mask_mag = paddle.abs(x=mask)
+        mask = paddle.nn.functional.tanh(x=mask_mag) * mask / (mask_mag + eps)
+        return mask
+    def predict_mask(self, x):
+        mask = self.forward(x)
+        if self.offset > 0:
+            mask = mask[:, :, :, self.offset : -self.offset]
+            assert tuple(mask.shape)[3] > 0
+        return mask
+    def predict(self, x):
+        mask = self.forward(x)
+        pred = x * mask
+        if self.offset > 0:
+            pred = pred[:, :, :, self.offset : -self.offset]
+            assert tuple(pred.shape)[3] > 0
+        return pred
+    def audio2spec(self, x, use_pad=False):
+        B, C, T = tuple(x.shape)
+        x = x.reshape(B * C, T)
+        if use_pad:
+            n_frames = T // self.hop_length + 1
+            T_pad = (32 * ((n_frames - 1) // 32 + 1) - 1) * self.hop_length - T
+            nl_pad = T_pad // 2 // self.hop_length
+            Tl_pad = nl_pad * self.hop_length
+            x = paddle.nn.functional.pad(x=x, pad=(Tl_pad, T_pad - Tl_pad), pad_from_left_axis=False)
+        spec = paddle.signal.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            return_complex=True,
+            window=self.window,
+            pad_mode="constant",
+        )
+        spec = spec.reshape(B, C, tuple(spec.shape)[-2], tuple(spec.shape)[-1])
+        return spec
+    def spec2audio(self, x):
+        B, C, N, T = tuple(x.shape)
+        x = x.reshape(-1, N, T)
+        x = paddle.signal.istft(x=x, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window)
+        x = x.reshape(B, C, -1)
+        return x
+    def predict_from_audio(self, x):
+        B, C, T = tuple(x.shape)
+        x = x.reshape(B * C, T)
+        n_frames = T // self.hop_length + 1
+        T_pad = (32 * (n_frames // 32 + 1) - 1) * self.hop_length - T
+        nl_pad = T_pad // 2 // self.hop_length
+        Tl_pad = nl_pad * self.hop_length
+        x = paddle.nn.functional.pad(x=x, pad=(Tl_pad, T_pad - Tl_pad), pad_from_left_axis=False)
+        spec = paddle.signal.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            return_complex=True,
+            window=self.window,
+            pad_mode="constant",
+        )
+        spec = spec.reshape(B, C, tuple(spec.shape)[-2], tuple(spec.shape)[-1])
+        mask = self.forward(spec)
+        spec_pred = spec * mask
+        spec_pred = spec_pred.reshape(B * C, tuple(spec.shape)[-2], tuple(spec.shape)[-1])
+        x_pred = paddle.signal.istft(x=spec_pred, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window)
+        x_pred = x_pred[:, Tl_pad : Tl_pad + T]
+        x_pred = x_pred.reshape(B, C, T)
+        return x_pred

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/env.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+class AttrDict(dict):
+    """A dictionary with attribute-style access. It maps attribute access to
+    the real dictionary."""
+    def __init__(self, *args, **kwargs):
+        dict.__init__(self, *args, **kwargs)
+    def __getstate__(self):
+        return self.__dict__.items()
+    def __setstate__(self, items):
+        for key, val in items:
+            self.__dict__[key] = val
+    def __repr__(self):
+        return "%s(%s)" % (self.__class__.__name__, dict.__repr__(self))
+    def __setitem__(self, key, value):
+        return super(AttrDict, self).__setitem__(key, value)
+    def __getitem__(self, name):
+        return super(AttrDict, self).__getitem__(name)
+    def __delitem__(self, name):
+        return super(AttrDict, self).__delitem__(name)
+    __getattr__ = __getitem__
+    __setattr__ = __setitem__
+    def copy(self):
+        return AttrDict(self)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/models.py ADDED Viewed

	@@ -0,0 +1,380 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import pathlib
+import sys
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddle.nn.utils import remove_weight_norm, weight_norm
+from .env import AttrDict
+from .utils import get_padding, init_weights
+LRELU_SLOPE = 0.1
+def load_model(model_path: pathlib.Path):
+    config_file = model_path.with_name("config.json")
+    with open(config_file) as f:
+        data = f.read()
+    json_config = json.loads(data)
+    h = AttrDict(json_config)
+    generator = Generator(h)
+    cp_dict = paddle.load(path=str(model_path))
+    generator.set_state_dict(state_dict=cp_dict["generator"])
+    generator.eval()
+    generator.remove_weight_norm()
+    del cp_dict
+    return generator, h
+class ResBlock1(paddle.nn.Layer):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        self.convs1 = paddle.nn.LayerList(
+            sublayers=[
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=dilation[2],
+                        padding=get_padding(kernel_size, dilation[2]),
+                    )
+                ),
+            ]
+        )
+        self.convs1.apply(init_weights)
+        self.convs2 = paddle.nn.LayerList(
+            sublayers=[
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+            ]
+        )
+        self.convs2.apply(init_weights)
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
+            xt = c1(xt)
+            xt = paddle.nn.functional.leaky_relu(x=xt, negative_slope=LRELU_SLOPE)
+            xt = c2(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            paddle.nn.utils.remove_weight_norm(layer=l)
+        for l in self.convs2:
+            paddle.nn.utils.remove_weight_norm(layer=l)
+class ResBlock2(paddle.nn.Layer):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.h = h
+        self.convs = paddle.nn.LayerList(
+            sublayers=[
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1D(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+            ]
+        )
+        self.convs.apply(init_weights)
+    def forward(self, x):
+        for c in self.convs:
+            xt = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
+            xt = c(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs:
+            paddle.nn.utils.remove_weight_norm(layer=l)
+class SineGen(paddle.nn.Layer):
+    """Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    The generated signal has ``harmonic_num + 1`` channels: the fundamental
+    plus one channel per overtone.
+    """
+    def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        # Fundamental plus overtones.
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+    def _f02uv(self, f0):
+        """Return a tensor shaped like ``f0`` that is 1 where ``f0`` exceeds the voiced threshold, else 0."""
+        uv = paddle.ones_like(x=f0)
+        uv = uv * (f0 > self.voiced_threshold)
+        return uv
+    def _f02sine(self, f0, upp):
+        """Turn frame-level F0 into sample-level sine waves.
+
+        f0: (batchsize, length, 1) as passed by ``forward``
+        upp: number of output samples per F0 frame
+        returns: (batchsize, length * upp, self.dim), one channel per harmonic
+        """
+        # Phase (in cycles) reached at each of the upp samples inside a frame;
+        # the arange is created as float32 explicitly.
+        rad = f0 / self.sampling_rate * paddle.arange(start=1, end=upp + 1, dtype="float32")
+        # Phase at the end of each frame, wrapped into [-0.5, 0.5).
+        rad2 = (
+            paddle.mod(
+                x=rad[..., -1:].astype(dtype="float32") + 0.5,
+                y=paddle.to_tensor(1.0, dtype=(rad[..., -1:].astype(dtype="float32") + 0.5).dtype),
+            )
+            - 0.5
+        )
+        # Running end-of-frame phase over the frame axis, wrapped to one cycle.
+        rad_acc = rad2.cumsum(axis=1).mod(y=paddle.to_tensor(1.0)).to(f0)
+        # Shift the accumulated phase one frame later (zero for the first frame) so
+        # each frame starts where the previous one ended. Written with concat as an
+        # equivalent of padding one frame at the front and dropping the last one.
+        rad_shifted = paddle.concat([paddle.zeros_like(rad_acc[:, :1]), rad_acc[:, :-1]], axis=1)
+        rad += rad_shifted
+        # Flatten (frame, sample-in-frame) into a single sample axis.
+        rad = rad.reshape(tuple(f0.shape)[0], -1, 1)
+        # Multiply by the harmonic index 1..dim to obtain the phase of every overtone.
+        rad = paddle.multiply(
+            x=rad,
+            y=paddle.to_tensor(
+                paddle.arange(start=1, end=self.dim + 1), dtype="float32"  # float32 to match rad
+            ).reshape(1, 1, -1),
+        )
+        # Random initial phase per overtone; the fundamental (channel 0) starts at zero.
+        rand_ini = paddle.rand(shape=[1, 1, self.dim])
+        rand_ini[..., 0] = 0
+        rad += rand_ini
+        sines = paddle.sin(x=2 * np.pi * rad)
+        return sines
+    @paddle.no_grad()
+    def forward(self, f0, upp):
+        """sine_waves = forward(f0, upp)
+        input f0: frame-level F0; a trailing axis of size 1 is appended here
+                  (presumably f0 is (batchsize, length) -- confirm against caller);
+                  f0 for unvoiced steps should be 0
+        input upp: number of output samples per F0 frame
+        output sine_waves: tensor(batchsize, length * upp, dim)
+        Only the sine tensor is returned; the voiced/unvoiced mask is internal.
+        """
+        f0 = f0.unsqueeze(axis=-1)
+        sine_waves = self._f02sine(f0, upp) * self.sine_amp
+        uv = (f0 > self.voiced_threshold).astype(dtype="float32")
+        # Upsample the voiced mask to the sample rate (channels-first for interpolate).
+        uv = F.interpolate(uv.transpose([0, 2, 1]), scale_factor=upp, mode="linear", data_format="NCW").transpose(
+            [0, 2, 1]
+        )
+        # Noise std is noise_std where voiced and sine_amp / 3 where unvoiced.
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * paddle.randn(shape=sine_waves.shape, dtype=sine_waves.dtype)
+        # Unvoiced regions carry noise only.
+        sine_waves = sine_waves * uv + noise
+        return sine_waves
+class SourceModuleHnNSF(paddle.nn.Layer):
+    """SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonic above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threhold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
+        super(SourceModuleHnNSF, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
+        self.l_linear = paddle.nn.Linear(in_features=harmonic_num + 1, out_features=1)
+        self.l_tanh = paddle.nn.Tanh()
+    def forward(self, x, upp):
+        sine_wavs = self.l_sin_gen(x, upp)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        return sine_merge
+class Generator(paddle.nn.Layer):
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.m_source = SourceModuleHnNSF(sampling_rate=h.sampling_rate, harmonic_num=8)
+        self.noise_convs = paddle.nn.LayerList()
+        self.conv_pre = paddle.nn.utils.weight_norm(
+            layer=paddle.nn.Conv1D(
+                in_channels=h.num_mels, out_channels=h.upsample_initial_channel, kernel_size=7, stride=1, padding=3
+            )
+        )
+        resblock = ResBlock1 if h.resblock == "1" else ResBlock2
+        self.ups = paddle.nn.LayerList()
+        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            c_cur = h.upsample_initial_channel // 2 ** (i + 1)
+            self.ups.append(
+                paddle.nn.utils.weight_norm(
+                    layer=paddle.nn.Conv1DTranspose(
+                        in_channels=h.upsample_initial_channel // 2**i,
+                        out_channels=h.upsample_initial_channel // 2 ** (i + 1),
+                        kernel_size=k,
+                        stride=u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+            if i + 1 < len(h.upsample_rates):
+                stride_f0 = int(np.prod(h.upsample_rates[i + 1 :]))
+                self.noise_convs.append(
+                    paddle.nn.Conv1D(
+                        in_channels=1,
+                        out_channels=c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
+            else:
+                self.noise_convs.append(paddle.nn.Conv1D(in_channels=1, out_channels=c_cur, kernel_size=1))
+        self.resblocks = paddle.nn.LayerList()
+        ch = h.upsample_initial_channel
+        for i in range(len(self.ups)):
+            ch //= 2
+            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                self.resblocks.append(resblock(h, ch, k, d))
+        self.conv_post = paddle.nn.utils.weight_norm(
+            layer=paddle.nn.Conv1D(in_channels=ch, out_channels=1, kernel_size=7, stride=1, padding=3)
+        )
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        self.upp = int(np.prod(h.upsample_rates))
+    def forward(self, x, f0):
+        har_source = self.m_source(f0, self.upp).transpose(
+            perm=paddle_aux.transpose_aux_func(self.m_source(f0, self.upp).ndim, 1, 2)
+        )
+        # har_source = self.m_source(f0, self.upp).transpose(1, 2)
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
+            x = self.ups[i](x)
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = paddle.nn.functional.leaky_relu(x=x)
+        x = self.conv_post(x)
+        x = paddle.nn.functional.tanh(x=x)
+        return x
+    def remove_weight_norm(self):
+        print("Removing weight norm...")
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/nvSTFT.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import paddle
+os.environ["LRU_CACHE_CAPACITY"] = "3"
+import numpy as np
+from librosa.filters import mel as librosa_mel_fn
+def dynamic_range_compression(x, C=1, clip_val=1e-05):
+    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+def dynamic_range_decompression(x, C=1):
+    return np.exp(x) / C
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-05):
+    return paddle.log(x=paddle.clip(x=x, min=clip_val) * C)
+def dynamic_range_decompression_torch(x, C=1):
+    return paddle.exp(x=x) / C
+class STFT:
+    def __init__(
+        self,
+        sr=22050,
+        n_mels=80,
+        n_fft=1024,
+        win_size=1024,
+        hop_length=256,
+        fmin=20,
+        fmax=11025,
+        clip_val=1e-05,
+        device=None,
+    ):
+        self.target_sr = sr
+        self.n_mels = n_mels
+        self.n_fft = n_fft
+        self.win_size = win_size
+        self.hop_length = hop_length
+        self.fmin = fmin
+        self.fmax = fmax
+        self.clip_val = clip_val
+        if device is None:
+            device = str("cuda" if paddle.device.cuda.device_count() >= 1 else "cpu").replace("cuda", "gpu")
+        self.device = device
+        mel_basis = librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+        self.mel_basis = paddle.to_tensor(data=mel_basis).astype(dtype="float32").to(device)
+    def get_mel(self, y, keyshift=0, speed=1, center=False):
+        factor = 2 ** (keyshift / 12)
+        n_fft_new = int(np.round(self.n_fft * factor))
+        win_size_new = int(np.round(self.win_size * factor))
+        hop_length_new = int(np.round(self.hop_length * speed))
+        if paddle.min(x=y) < -1.0:
+            print("min value is ", paddle.min(x=y))
+        if paddle.max(x=y) > 1.0:
+            print("max value is ", paddle.max(x=y))
+        window = paddle.audio.functional.get_window("hann", win_size_new).astype("float32").to(self.device)
+        y = paddle.nn.functional.pad(
+            x=y.unsqueeze(axis=1),
+            pad=((win_size_new - hop_length_new) // 2, (win_size_new - hop_length_new + 1) // 2),
+            mode="reflect",
+            pad_from_left_axis=False,
+        )
+        y = y.squeeze(axis=1)
+        spec = paddle.signal.stft(
+            y,
+            n_fft_new,
+            hop_length=hop_length_new,
+            win_length=win_size_new,
+            window=window,
+            center=center,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+        ).abs()
+        if keyshift != 0:
+            size = self.n_fft // 2 + 1
+            resize = spec.shape[1]
+            if resize < size:
+                spec = paddle.nn.functional.pad(x=spec, pad=(0, 0, 0, size - resize), pad_from_left_axis=False)
+            spec = spec[:, :size, :] * self.win_size / win_size_new
+        spec = paddle.matmul(x=self.mel_basis, y=spec)
+        spec = dynamic_range_compression_torch(spec, clip_val=self.clip_val)
+        return spec

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/utils.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import matplotlib
+matplotlib.use("Agg")
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/pm.py ADDED Viewed

	@@ -0,0 +1,30 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlemix.models.diffsinger.basics.base_pe import BasePE
+from paddlemix.models.diffsinger.utils.binarizer_utils import get_pitch_parselmouth
+class ParselmouthPE(BasePE):
+    def get_pitch(self, waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False):
+        return get_pitch_parselmouth(
+            waveform,
+            samplerate=samplerate,
+            length=length,
+            hop_size=hop_size,
+            f0_min=f0_min,
+            f0_max=f0_max,
+            speed=speed,
+            interp_uv=interp_uv,
+        )

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .constants import *
+from .inference import RMVPE
+from .model import E2E0
+from .spec import MelSpectrogram
+from .utils import to_local_average_f0, to_viterbi_f0

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/constants.py ADDED Viewed

	@@ -0,0 +1,21 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Constants shared by the RMVPE pitch-extraction modules of this package.
+# Sampling rate passed to the mel extractor; RMVPE.infer_from_audio resamples other rates to 16000.
+SAMPLE_RATE = 16000
+# Size of the model's output layer; to_local_average_f0 treats each class as a 20-cent step.
+N_CLASS = 360
+# Number of mel channels of the input spectrogram.
+N_MELS = 128
+# Lower / upper frequency bounds handed to the mel filterbank.
+MEL_FMIN = 30
+MEL_FMAX = 8000
+# Window length handed to the mel extractor.
+WINDOW_LENGTH = 1024
+# Offset in cents added to ``class_index * 20`` before converting with ``10 * 2 ** (cents / 1200)``.
+CONST = 1997.379408437619

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/deepunet.py ADDED Viewed

	@@ -0,0 +1,194 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from .constants import N_MELS
+class ConvBlockRes(paddle.nn.Layer):
+    def __init__(self, in_channels, out_channels, momentum=0.01):
+        super(ConvBlockRes, self).__init__()
+        self.conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=(1, 1),
+                padding=(1, 1),
+                bias_attr=False,
+            ),
+            paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum),
+            paddle.nn.ReLU(),
+            paddle.nn.Conv2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=(1, 1),
+                padding=(1, 1),
+                bias_attr=False,
+            ),
+            paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum),
+            paddle.nn.ReLU(),
+        )
+        if in_channels != out_channels:
+            self.shortcut = paddle.nn.Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1))
+            self.is_shortcut = True
+        else:
+            self.is_shortcut = False
+    def forward(self, x):
+        if self.is_shortcut:
+            return self.conv(x) + self.shortcut(x)
+        else:
+            return self.conv(x) + x
+class ResEncoderBlock(paddle.nn.Layer):
+    def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
+        super(ResEncoderBlock, self).__init__()
+        self.n_blocks = n_blocks
+        self.conv = paddle.nn.LayerList()
+        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
+        for i in range(n_blocks - 1):
+            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
+        self.kernel_size = kernel_size
+        if self.kernel_size is not None:
+            self.pool = paddle.nn.AvgPool2D(kernel_size=kernel_size, exclusive=False)
+    def forward(self, x):
+        for i in range(self.n_blocks):
+            x = self.conv[i](x)
+        if self.kernel_size is not None:
+            return x, self.pool(x)
+        else:
+            return x
+class ResDecoderBlock(paddle.nn.Layer):
+    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
+        super(ResDecoderBlock, self).__init__()
+        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
+        self.n_blocks = n_blocks
+        self.conv1 = paddle.nn.Sequential(
+            paddle.nn.Conv2DTranspose(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=stride,
+                padding=(1, 1),
+                output_padding=out_padding,
+                bias_attr=False,
+            ),
+            paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum),
+            paddle.nn.ReLU(),
+        )
+        self.conv2 = paddle.nn.LayerList()
+        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
+        for i in range(n_blocks - 1):
+            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
+    def forward(self, x, concat_tensor):
+        x = self.conv1(x)
+        x = paddle.concat(x=(x, concat_tensor), axis=1)
+        for i in range(self.n_blocks):
+            x = self.conv2[i](x)
+        return x
+class Encoder(paddle.nn.Layer):
+    def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
+        super(Encoder, self).__init__()
+        self.n_encoders = n_encoders
+        self.bn = paddle.nn.BatchNorm2D(num_features=in_channels, momentum=1 - momentum)
+        self.layers = paddle.nn.LayerList()
+        self.latent_channels = []
+        for i in range(self.n_encoders):
+            self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
+            self.latent_channels.append([out_channels, in_size])
+            in_channels = out_channels
+            out_channels *= 2
+            in_size //= 2
+        self.out_size = in_size
+        self.out_channel = out_channels
+    def forward(self, x):
+        concat_tensors = []
+        x = self.bn(x)
+        for i in range(self.n_encoders):
+            _, x = self.layers[i](x)
+            concat_tensors.append(_)
+        return x, concat_tensors
+class Intermediate(paddle.nn.Layer):
+    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
+        super(Intermediate, self).__init__()
+        self.n_inters = n_inters
+        self.layers = paddle.nn.LayerList()
+        self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
+        for i in range(self.n_inters - 1):
+            self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))
+    def forward(self, x):
+        for i in range(self.n_inters):
+            x = self.layers[i](x)
+        return x
+class Decoder(paddle.nn.Layer):
+    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
+        super(Decoder, self).__init__()
+        self.layers = paddle.nn.LayerList()
+        self.n_decoders = n_decoders
+        for i in range(self.n_decoders):
+            out_channels = in_channels // 2
+            self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
+            in_channels = out_channels
+    def forward(self, x, concat_tensors):
+        for i in range(self.n_decoders):
+            x = self.layers[i](x, concat_tensors[-1 - i])
+        return x
+class TimbreFilter(paddle.nn.Layer):
+    def __init__(self, latent_rep_channels):
+        super(TimbreFilter, self).__init__()
+        self.layers = paddle.nn.LayerList()
+        for latent_rep in latent_rep_channels:
+            self.layers.append(ConvBlockRes(latent_rep[0], latent_rep[0]))
+    def forward(self, x_tensors):
+        out_tensors = []
+        for i, layer in enumerate(self.layers):
+            out_tensors.append(layer(x_tensors[i]))
+        return out_tensors
+class DeepUnet0(paddle.nn.Layer):
+    def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
+        super(DeepUnet0, self).__init__()
+        self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels)
+        self.intermediate = Intermediate(
+            self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks
+        )
+        self.tf = TimbreFilter(self.encoder.latent_channels)
+        self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)
+    def forward(self, x):
+        x, concat_tensors = self.encoder(x)
+        x = self.intermediate(x)
+        x = self.decoder(x, concat_tensors)
+        return x

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/inference.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+from basics.base_pe import BasePE
+from torchaudio.transforms import Resample
+from utils.infer_utils import resample_align_curve
+from utils.pitch_utils import interp_f0
+from .constants import *
+from .model import E2E0
+from .spec import MelSpectrogram
+from .utils import to_local_average_f0, to_viterbi_f0
+class RMVPE(BasePE):
+    def __init__(self, model_path, hop_length=160):
+        self.resample_kernel = {}
+        self.device = "cuda" if paddle.device.cuda.device_count() >= 1 else "cpu"
+        self.model = E2E0(4, 1, (2, 2)).eval().to(self.device)
+        ckpt = paddle.load(path=str(model_path))
+        self.model.set_state_dict(state_dict=ckpt["model"])
+        self.mel_extractor = MelSpectrogram(
+            N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX
+        ).to(self.device)
+    @paddle.no_grad()
+    def mel2hidden(self, mel):
+        n_frames = tuple(mel.shape)[-1]
+        mel = paddle.nn.functional.pad(
+            x=mel, pad=(0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant", pad_from_left_axis=False
+        )
+        hidden = self.model(mel)
+        return hidden[:, :n_frames]
+    def decode(self, hidden, thred=0.03, use_viterbi=False):
+        if use_viterbi:
+            f0 = to_viterbi_f0(hidden, thred=thred)
+        else:
+            f0 = to_local_average_f0(hidden, thred=thred)
+        return f0
+    def infer_from_audio(self, audio, sample_rate=16000, thred=0.03, use_viterbi=False):
+        audio = paddle.to_tensor(data=audio).astype(dtype="float32").unsqueeze(axis=0).to(self.device)
+        if sample_rate == 16000:
+            audio_res = audio
+        else:
+            key_str = str(sample_rate)
+            if key_str not in self.resample_kernel:
+                self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
+            self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.device)
+            audio_res = self.resample_kernel[key_str](audio)
+        mel = self.mel_extractor(audio_res, center=True)
+        hidden = self.mel2hidden(mel)
+        f0 = self.decode(hidden, thred=thred, use_viterbi=use_viterbi)
+        return f0
+    def get_pitch(self, waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False):
+        f0 = self.infer_from_audio(waveform, sample_rate=samplerate)
+        uv = f0 == 0
+        f0, uv = interp_f0(f0, uv)
+        hop_size = int(np.round(hop_size * speed))
+        time_step = hop_size / samplerate
+        f0_res = resample_align_curve(f0, 0.01, time_step, length)
+        uv_res = resample_align_curve(uv.astype(np.float32), 0.01, time_step, length) > 0.5
+        if not interp_uv:
+            f0_res[uv_res] = 0
+        return f0_res, uv_res

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/model.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle
+import paddle_aux
+from .constants import *
+from .deepunet import DeepUnet0
+from .seq import BiGRU
+class E2E0(paddle.nn.Layer):
+    def __init__(
+        self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16
+    ):
+        super(E2E0, self).__init__()
+        self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
+        self.cnn = paddle.nn.Conv2D(in_channels=en_out_channels, out_channels=3, kernel_size=(3, 3), padding=(1, 1))
+        if n_gru:
+            self.fc = paddle.nn.Sequential(
+                BiGRU(3 * N_MELS, 256, n_gru),
+                paddle.nn.Linear(in_features=512, out_features=N_CLASS),
+                paddle.nn.Dropout(p=0.25),
+                paddle.nn.Sigmoid(),
+            )
+        else:
+            self.fc = paddle.nn.Sequential(
+                paddle.nn.Linear(in_features=3 * N_MELS, out_features=N_CLASS),
+                paddle.nn.Dropout(p=0.25),
+                paddle.nn.Sigmoid(),
+            )
+    def forward(self, mel):
+        mel = mel.transpose(perm=paddle_aux.transpose_aux_func(mel.ndim, -1, -2)).unsqueeze(axis=1)
+        x = (
+            self.cnn(self.unet(mel))
+            .transpose(perm=paddle_aux.transpose_aux_func(self.cnn(self.unet(mel)).ndim, 1, 2))
+            .flatten(start_axis=-2)
+        )
+        x = self.fc(x)
+        return x

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/seq.py ADDED Viewed

	@@ -0,0 +1,30 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+class BiGRU(paddle.nn.Layer):
+    def __init__(self, input_features, hidden_features, num_layers):
+        super(BiGRU, self).__init__()
+        self.gru = paddle.nn.GRU(
+            input_size=input_features,
+            hidden_size=hidden_features,
+            num_layers=num_layers,
+            time_major=not True,
+            direction="bidirect",
+        )
+    def forward(self, x):
+        return self.gru(x)[0]

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/spec.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+from librosa.filters import mel
+class MelSpectrogram(paddle.nn.Layer):
+    def __init__(
+        self, n_mel_channels, sampling_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-05
+    ):
+        super().__init__()
+        n_fft = win_length if n_fft is None else n_fft
+        self.hann_window = {}
+        mel_basis = mel(sr=sampling_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True)
+        mel_basis = paddle.to_tensor(data=mel_basis).astype(dtype="float32")
+        self.register_buffer(name="mel_basis", tensor=mel_basis)
+        self.n_fft = win_length if n_fft is None else n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.sampling_rate = sampling_rate
+        self.n_mel_channels = n_mel_channels
+        self.clamp = clamp
+    def forward(self, audio, keyshift=0, speed=1, center=True):
+        factor = 2 ** (keyshift / 12)
+        n_fft_new = int(np.round(self.n_fft * factor))
+        win_length_new = int(np.round(self.win_length * factor))
+        hop_length_new = int(np.round(self.hop_length * speed))
+        keyshift_key = str(keyshift) + "_" + str(audio.place)
+        if keyshift_key not in self.hann_window:
+            self.hann_window[keyshift_key] = paddle.audio.functional.get_window("hann", win_length_new).to(audio.place)
+        fft = paddle.signal.stft(
+            audio,
+            n_fft=n_fft_new,
+            hop_length=hop_length_new,
+            win_length=win_length_new,
+            window=self.hann_window[keyshift_key],
+            center=center,
+            return_complex=True,
+        )
+        magnitude = fft.abs()
+        if keyshift != 0:
+            size = self.n_fft // 2 + 1
+            resize = magnitude.shape[1]
+            if resize < size:
+                magnitude = paddle.nn.functional.pad(
+                    x=magnitude, pad=(0, 0, 0, size - resize), pad_from_left_axis=False
+                )
+            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
+        mel_output = paddle.matmul(x=self.mel_basis, y=magnitude)
+        log_mel_spec = paddle.log(x=paddle.clip(x=mel_output, min=self.clamp))
+        return log_mel_spec

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/utils.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import librosa
+import numpy as np
+import paddle
+import paddle_aux
+from .constants import *
+def to_local_average_f0(hidden, center=None, thred=0.03):
+    idx = paddle.arange(end=N_CLASS)[None, None, :]
+    idx_cents = idx * 20 + CONST
+    if center is None:
+        center = paddle.argmax(x=hidden, axis=2, keepdim=True)
+    start = paddle.clip(x=center - 4, min=0)
+    end = paddle.clip(x=center + 5, max=N_CLASS)
+    idx_mask = (idx >= start) & (idx < end)
+    weights = hidden * idx_mask
+    product_sum = paddle.sum(x=weights * idx_cents, axis=2)
+    weight_sum = paddle.sum(x=weights, axis=2)
+    cents = product_sum / (weight_sum + (weight_sum == 0))
+    f0 = 10 * 2 ** (cents / 1200)
+    uv = hidden.max(dim=2)[0] < thred
+    f0 = f0 * ~uv
+    return f0.squeeze(axis=0).cpu().numpy()
+def to_viterbi_f0(hidden, thred=0.03):
+    if not hasattr(to_viterbi_f0, "transition"):
+        xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
+        transition = np.maximum(30 - abs(xx - yy), 0)
+        transition = transition / transition.sum(axis=1, keepdims=True)
+        to_viterbi_f0.transition = transition
+    prob = hidden.squeeze(axis=0).cpu().numpy()
+    prob = prob.T
+    prob = prob / prob.sum(axis=0)
+    path = librosa.sequence.viterbi(prob, to_viterbi_f0.transition).astype(np.int64)
+    center = paddle.to_tensor(data=path).unsqueeze(axis=0).unsqueeze(axis=-1).to(hidden.place)
+    return to_local_average_f0(hidden, center=center, thred=thred)

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/toplevel.py ADDED Viewed

	@@ -0,0 +1,323 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict
+import paddle
+import paddlemix.models.diffsinger.modules.compat as compat
+from paddlemix.models.diffsinger.basics.base_module import CategorizedModule
+from paddlemix.models.diffsinger.modules.aux_decoder import AuxDecoderAdaptor
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    NormalInitEmbedding as Embedding,
+)
+from paddlemix.models.diffsinger.modules.commons.common_layers import (
+    XavierUniformInitLinear as Linear,
+)
+from paddlemix.models.diffsinger.modules.core import (
+    GaussianDiffusion,
+    MultiVarianceDiffusion,
+    MultiVarianceRectifiedFlow,
+    PitchDiffusion,
+    PitchRectifiedFlow,
+    RectifiedFlow,
+)
+from paddlemix.models.diffsinger.modules.fastspeech.acoustic_encoder import (
+    FastSpeech2Acoustic,
+)
+from paddlemix.models.diffsinger.modules.fastspeech.param_adaptor import (
+    ParameterAdaptorModule,
+)
+from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
+    LengthRegulator,
+    RhythmRegulator,
+)
+from paddlemix.models.diffsinger.modules.fastspeech.variance_encoder import (
+    FastSpeech2Variance,
+    MelodyEncoder,
+)
+from paddlemix.models.diffsinger.utils.hparams import hparams
+class ShallowDiffusionOutput:
+    def __init__(self, *, aux_out=None, diff_out=None):
+        self.aux_out = aux_out
+        self.diff_out = diff_out
+class DiffSingerAcoustic(CategorizedModule, ParameterAdaptorModule):
+    @property
+    def category(self):
+        # Fixed category tag of this module: always the string "acoustic".
+        return "acoustic"
+    def __init__(self, vocab_size, out_dims):
+        CategorizedModule.__init__(self)
+        ParameterAdaptorModule.__init__(self)
+        self.fs2 = FastSpeech2Acoustic(vocab_size=vocab_size)
+        self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
+        self.shallow_args = hparams.get("shallow_diffusion_args", {})
+        if self.use_shallow_diffusion:
+            self.train_aux_decoder = self.shallow_args["train_aux_decoder"]
+            self.train_diffusion = self.shallow_args["train_diffusion"]
+            self.aux_decoder_grad = self.shallow_args["aux_decoder_grad"]
+            self.aux_decoder = AuxDecoderAdaptor(
+                in_dims=hparams["hidden_size"],
+                out_dims=out_dims,
+                num_feats=1,
+                spec_min=hparams["spec_min"],
+                spec_max=hparams["spec_max"],
+                aux_decoder_arch=self.shallow_args["aux_decoder_arch"],
+                aux_decoder_args=self.shallow_args["aux_decoder_args"],
+            )
+        self.diffusion_type = hparams.get("diffusion_type", "ddpm")
+        self.backbone_type = compat.get_backbone_type(hparams)
+        self.backbone_args = compat.get_backbone_args(hparams, self.backbone_type)
+        if self.diffusion_type == "ddpm":
+            self.diffusion = GaussianDiffusion(
+                out_dims=out_dims,
+                num_feats=1,
+                timesteps=hparams["timesteps"],
+                k_step=hparams["K_step"],
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
+                spec_min=hparams["spec_min"],
+                spec_max=hparams["spec_max"],
+            )
+        elif self.diffusion_type == "reflow":
+            self.diffusion = RectifiedFlow(
+                out_dims=out_dims,
+                num_feats=1,
+                t_start=hparams["T_start"],
+                time_scale_factor=hparams["time_scale_factor"],
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
+                spec_min=hparams["spec_min"],
+                spec_max=hparams["spec_max"],
+            )
+        else:
+            raise NotImplementedError(self.diffusion_type)
+    def forward(
+        self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, spk_embed_id=None, gt_mel=None, infer=True, **kwargs
+    ) -> ShallowDiffusionOutput:
+        condition = self.fs2(
+            txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs
+        )
+        if infer:
+            if self.use_shallow_diffusion:
+                aux_mel_pred = self.aux_decoder(condition, infer=True)
+                aux_mel_pred *= (mel2ph > 0).astype(dtype="float32")[:, :, None]
+                if gt_mel is not None and self.shallow_args["val_gt_start"]:
+                    src_mel = gt_mel
+                else:
+                    src_mel = aux_mel_pred
+            else:
+                aux_mel_pred = src_mel = None
+            mel_pred = self.diffusion(condition, src_spec=src_mel, infer=True)
+            mel_pred *= (mel2ph > 0).astype(dtype="float32")[:, :, None]
+            return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred)
+        elif self.use_shallow_diffusion:
+            if self.train_aux_decoder:
+                aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
+                aux_out = self.aux_decoder(aux_cond, infer=False)
+            else:
+                aux_out = None
+            if self.train_diffusion:
+                diff_out = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+            else:
+                diff_out = None
+            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
+        else:
+            aux_out = None
+            diff_out = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
+class DiffSingerVariance(CategorizedModule, ParameterAdaptorModule):
+    @property
+    def category(self):
+        return "variance"
+    def __init__(self, vocab_size):
+        CategorizedModule.__init__(self)
+        ParameterAdaptorModule.__init__(self)
+        self.predict_dur = hparams["predict_dur"]
+        self.predict_pitch = hparams["predict_pitch"]
+        self.use_spk_id = hparams["use_spk_id"]
+        if self.use_spk_id:
+            self.spk_embed = Embedding(hparams["num_spk"], hparams["hidden_size"])
+        self.fs2 = FastSpeech2Variance(vocab_size=vocab_size)
+        self.rr = RhythmRegulator()
+        self.lr = LengthRegulator()
+        self.diffusion_type = hparams.get("diffusion_type", "ddpm")
+        if self.predict_pitch:
+            self.use_melody_encoder = hparams.get("use_melody_encoder", False)
+            if self.use_melody_encoder:
+                self.melody_encoder = MelodyEncoder(enc_hparams=hparams["melody_encoder_args"])
+                self.delta_pitch_embed = Linear(1, hparams["hidden_size"])
+            else:
+                self.base_pitch_embed = Linear(1, hparams["hidden_size"])
+            self.pitch_retake_embed = Embedding(2, hparams["hidden_size"])
+            pitch_hparams = hparams["pitch_prediction_args"]
+            self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams)
+            self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type)
+            if self.diffusion_type == "ddpm":
+                self.pitch_predictor = PitchDiffusion(
+                    vmin=pitch_hparams["pitd_norm_min"],
+                    vmax=pitch_hparams["pitd_norm_max"],
+                    cmin=pitch_hparams["pitd_clip_min"],
+                    cmax=pitch_hparams["pitd_clip_max"],
+                    repeat_bins=pitch_hparams["repeat_bins"],
+                    timesteps=hparams["timesteps"],
+                    k_step=hparams["K_step"],
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args,
+                )
+            elif self.diffusion_type == "reflow":
+                self.pitch_predictor = PitchRectifiedFlow(
+                    vmin=pitch_hparams["pitd_norm_min"],
+                    vmax=pitch_hparams["pitd_norm_max"],
+                    cmin=pitch_hparams["pitd_clip_min"],
+                    cmax=pitch_hparams["pitd_clip_max"],
+                    repeat_bins=pitch_hparams["repeat_bins"],
+                    time_scale_factor=hparams["time_scale_factor"],
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args,
+                )
+            else:
+                raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
+        if self.predict_variances:
+            self.pitch_embed = Linear(1, hparams["hidden_size"])
+            self.variance_embeds = paddle.nn.LayerDict(
+                sublayers={v_name: Linear(1, hparams["hidden_size"]) for v_name in self.variance_prediction_list}
+            )
+            if self.diffusion_type == "ddpm":
+                self.variance_predictor = self.build_adaptor(cls=MultiVarianceDiffusion)
+            elif self.diffusion_type == "reflow":
+                self.variance_predictor = self.build_adaptor(cls=MultiVarianceRectifiedFlow)
+            else:
+                raise NotImplementedError(self.diffusion_type)
+    def forward(
+        self,
+        txt_tokens,
+        midi,
+        ph2word,
+        ph_dur=None,
+        word_dur=None,
+        mel2ph=None,
+        note_midi=None,
+        note_rest=None,
+        note_dur=None,
+        note_glide=None,
+        mel2note=None,
+        base_pitch=None,
+        pitch=None,
+        pitch_expr=None,
+        pitch_retake=None,
+        variance_retake: Dict[str, paddle.Tensor] = None,
+        spk_id=None,
+        infer=True,
+        **kwargs
+    ):
+        if self.use_spk_id:
+            ph_spk_mix_embed = kwargs.get("ph_spk_mix_embed")
+            spk_mix_embed = kwargs.get("spk_mix_embed")
+            if ph_spk_mix_embed is not None and spk_mix_embed is not None:
+                ph_spk_embed = ph_spk_mix_embed
+                spk_embed = spk_mix_embed
+            else:
+                ph_spk_embed = spk_embed = self.spk_embed(spk_id)[:, None, :]
+        else:
+            ph_spk_embed = spk_embed = None
+        encoder_out, dur_pred_out = self.fs2(
+            txt_tokens,
+            midi=midi,
+            ph2word=ph2word,
+            ph_dur=ph_dur,
+            word_dur=word_dur,
+            spk_embed=ph_spk_embed,
+            infer=infer,
+        )
+        if not self.predict_pitch and not self.predict_variances:
+            return dur_pred_out, None, {} if infer else None
+        if mel2ph is None and word_dur is not None:
+            dur_pred_align = self.rr(dur_pred_out, ph2word, word_dur)
+            mel2ph = self.lr(dur_pred_align)
+            mel2ph = paddle.nn.functional.pad(
+                x=mel2ph, pad=[0, tuple(base_pitch.shape)[1] - tuple(mel2ph.shape)[1]], pad_from_left_axis=False
+            )
+        encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
+        mel2ph_ = mel2ph[..., None].tile(repeat_times=[1, 1, hparams[hidden_size]])
+        condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=mel2ph_, broadcast=False)
+        if self.use_spk_id:
+            condition += spk_embed
+        if self.predict_pitch:
+            if self.use_melody_encoder:
+                melody_encoder_out = self.melody_encoder(note_midi, note_rest, note_dur, glide=note_glide)
+                melody_encoder_out = paddle.nn.functional.pad(
+                    x=melody_encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False
+                )
+                mel2note_ = mel2note[..., None].tile(repeat_times=[1, 1, hparams[hidden_size]])
+                melody_condition = paddle.take_along_axis(
+                    arr=melody_encoder_out, axis=1, indices=mel2note_, broadcast=False
+                )
+                pitch_cond = condition + melody_condition
+            else:
+                pitch_cond = condition.clone()
+            retake_unset = pitch_retake is None
+            if retake_unset:
+                pitch_retake = paddle.ones_like(x=mel2ph, dtype="bool")
+            if pitch_expr is None:
+                pitch_retake_embed = self.pitch_retake_embed(pitch_retake.astype(dtype="int64"))
+            else:
+                retake_true_embed = self.pitch_retake_embed(paddle.ones(shape=[1, 1], dtype="int64"))
+                retake_false_embed = self.pitch_retake_embed(paddle.zeros(shape=[1, 1], dtype="int64"))
+                pitch_expr = (pitch_expr * pitch_retake)[:, :, None]
+                pitch_retake_embed = pitch_expr * retake_true_embed + (1.0 - pitch_expr) * retake_false_embed
+            pitch_cond += pitch_retake_embed
+            if self.use_melody_encoder:
+                if retake_unset:
+                    delta_pitch_in = paddle.zeros_like(x=base_pitch)
+                else:
+                    delta_pitch_in = (pitch - base_pitch) * ~pitch_retake
+                pitch_cond += self.delta_pitch_embed(delta_pitch_in[:, :, None])
+            else:
+                if not retake_unset:
+                    base_pitch = base_pitch * pitch_retake + pitch * ~pitch_retake
+                pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
+            if infer:
+                pitch_pred_out = self.pitch_predictor(pitch_cond, infer=True)
+            else:
+                pitch_pred_out = self.pitch_predictor(pitch_cond, pitch - base_pitch, infer=False)
+        else:
+            pitch_pred_out = None
+        if not self.predict_variances:
+            return dur_pred_out, pitch_pred_out, {} if infer else None
+        if pitch is None:
+            pitch = base_pitch + pitch_pred_out
+        var_cond = condition + self.pitch_embed(pitch[:, :, None])
+        variance_inputs = self.collect_variance_inputs(**kwargs)
+        if variance_retake is not None:
+            variance_embeds = [
+                (self.variance_embeds[v_name](v_input[:, :, None]) * ~variance_retake[v_name][:, :, None])
+                for v_name, v_input in zip(self.variance_prediction_list, variance_inputs)
+            ]
+            var_cond += paddle.stack(x=variance_embeds, axis=-1).sum(axis=-1)
+        variance_outputs = self.variance_predictor(var_cond, variance_inputs, infer=infer)
+        if infer:
+            variances_pred_out = self.collect_variance_outputs(variance_outputs)
+        else:
+            variances_pred_out = variance_outputs
+        return dur_pred_out, pitch_pred_out, variances_pred_out

VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,342 @@

+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import pathlib
+import re
+import time
+import types
+from collections import OrderedDict
+import numpy as np
+import paddle
+from paddlemix.models.diffsinger.basics.base_module import CategorizedModule
+from paddlemix.models.diffsinger.utils import paddle_aux
+from paddlemix.models.diffsinger.utils.hparams import hparams
+def tensors_to_scalars(metrics):
+    new_metrics = {}
+    for k, v in metrics.items():
+        if isinstance(v, paddle.Tensor):
+            v = v.item()
+        if type(v) is dict:
+            v = tensors_to_scalars(v)
+        new_metrics[k] = v
+    return new_metrics
+def collate_nd(values, pad_value=0, max_len=None):
+    """
+    Pad a list of Nd tensors on their first dimension and stack them into a (N+1)d tensor.
+    """
+    size = max(v.shape[0] for v in values) if max_len is None else max_len, *tuple(values[0].shape)[1:]
+    res = paddle.full(shape=(len(values), *size), fill_value=pad_value, dtype=values[0].dtype)
+    for i, v in enumerate(values):
+        res[i, : len(v), ...] = v
+    return res
+def random_continuous_masks(*shape: int, dim: int, device: (str | (paddle.CPUPlace, paddle.CUDAPlace, str)) = "cpu"):  # type: ignore
+    """Build a boolean mask that is True on one random contiguous index range along ``dim``.
+    Two random integers in ``[0, shape[dim]]`` are drawn per slice and sorted into ``start <= end``;
+    the mask is ``(idx >= start) & (idx < end)`` with ``idx`` running over ``range(shape[dim])``.
+    Args:
+        *shape: dimensions used to size the random draws and the index tensor.
+        dim: axis along which the range is selected (keyword-only).
+        device: never read in this function body.
+    NOTE(review): ``.split(1, dim=dim)`` and ``.reshape(*dims)`` are torch-style call forms;
+    presumably the ``paddle_aux`` import patches ``paddle.Tensor`` to accept them — confirm.
+    """
+    # NOTE(review): the tuple's second element (randint + argsort) is built and then dropped by
+    # the `[0]` below; only the paddle.sort result is used, but the extra randint call still runs.
+    start, end = (
+        paddle.sort(
+            x=paddle.randint(
+                low=0, high=shape[dim] + 1, shape=(*shape[:dim], 2, *((1,) * (len(shape) - dim - 1)))
+            ).expand(shape=[*((-1,) * (dim + 1)), *shape[dim + 1 :]]),
+            axis=dim,
+        ),
+        paddle.argsort(
+            x=paddle.randint(
+                low=0, high=shape[dim] + 1, shape=(*shape[:dim], 2, *((1,) * (len(shape) - dim - 1)))
+            ).expand(shape=[*((-1,) * (dim + 1)), *shape[dim + 1 :]]),
+            axis=dim,
+        ),
+    )[0].split(1, dim=dim)
+    # Index tensor with size shape[dim] on axis `dim` and size 1 on every other axis.
+    idx = paddle.arange(start=0, end=shape[dim], dtype="int64").reshape(
+        *((1,) * dim), shape[dim], *((1,) * (len(shape) - dim - 1))
+    )
+    masks = (idx >= start) & (idx < end)
+    return masks
+def _is_batch_full(batch, num_frames, max_batch_frames, max_batch_size):
+    if len(batch) == 0:
+        return 0
+    if len(batch) == max_batch_size:
+        return 1
+    if num_frames > max_batch_frames:
+        return 1
+    return 0
+def batch_by_size(indices, num_frames_fn, max_batch_frames=80000, max_batch_size=48, required_batch_size_multiple=1):
+    """
+    Yield mini-batches of indices bucketed by size. Batches may contain
+    sequences of different lengths.
+    Args:
+        indices (List[int]): ordered list of dataset indices
+        num_frames_fn (callable): function that returns the number of frames at
+            a given index
+        max_batch_frames (int, optional): max number of frames in each batch
+            (default: 80000).
+        max_batch_size (int, optional): max number of sentences in each
+            batch (default: 48).
+        required_batch_size_multiple: require the batch size to be multiple
+            of a given number
+    """
+    bsz_mult = required_batch_size_multiple
+    if isinstance(indices, types.GeneratorType):
+        indices = np.fromiter(indices, dtype=np.int64, count=-1)
+    sample_len = 0
+    sample_lens = []
+    batch = []
+    batches = []
+    for i in range(len(indices)):
+        idx = indices[i]
+        num_frames = num_frames_fn(idx)
+        sample_lens.append(num_frames)
+        sample_len = max(sample_len, num_frames)
+        assert (
+            sample_len <= max_batch_frames
+        ), "sentence at index {} of size {} exceeds max_batch_samples limit of {}!".format(
+            idx, sample_len, max_batch_frames
+        )
+        num_frames = (len(batch) + 1) * sample_len
+        if _is_batch_full(batch, num_frames, max_batch_frames, max_batch_size):
+            mod_len = max(bsz_mult * (len(batch) // bsz_mult), len(batch) % bsz_mult)
+            batches.append(batch[:mod_len])
+            batch = batch[mod_len:]
+            sample_lens = sample_lens[mod_len:]
+            sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+        batch.append(idx)
+    if len(batch) > 0:
+        batches.append(batch)
+    return batches
+def make_positions(tensor, padding_idx):
+    """Replace non-padding symbols with their position numbers.
+    Position numbers begin at padding_idx+1. Padding symbols are ignored.
+    """
+    mask = tensor.not_equal(y=paddle.to_tensor(padding_idx)).astype(dtype="int32")
+    return (paddle.cumsum(x=mask, axis=1).astype(dtype=mask.dtype) * mask).astype(dtype="int64") + padding_idx
+def softmax(x, dim):
+    return paddle.nn.functional.softmax(x=x, axis=dim, dtype="float32")
+def unpack_dict_to_list(samples):
+    samples_ = []
+    bsz = samples.get("outputs").shape[0]
+    for i in range(bsz):
+        res = {}
+        for k, v in samples.items():
+            try:
+                res[k] = v[i]
+            except:
+                pass
+        samples_.append(res)
+    return samples_
+def filter_kwargs(dict_to_filter, kwarg_obj):
+    import inspect
+    sig = inspect.signature(kwarg_obj)
+    if any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values()):
+        return dict_to_filter.copy()
+    filter_keys = [
+        param.name
+        for param in sig.parameters.values()
+        if param.kind == param.POSITIONAL_OR_KEYWORD or param.kind == param.KEYWORD_ONLY
+    ]
+    filtered_dict = {
+        filter_key: dict_to_filter[filter_key] for filter_key in filter_keys if filter_key in dict_to_filter
+    }
+    return filtered_dict
+def load_ckpt(
+    cur_model,
+    ckpt_base_dir,
+    ckpt_steps=None,
+    prefix_in_ckpt="model",
+    ignored_prefixes=None,
+    key_in_ckpt="state_dict",
+    strict=True,
+    device="cpu",
+):
+    if ignored_prefixes is None:
+        ignored_prefixes = ["model.fs2.encoder.embed_tokens"]
+    if not isinstance(ckpt_base_dir, pathlib.Path):
+        ckpt_base_dir = pathlib.Path(ckpt_base_dir)
+    if ckpt_base_dir.is_file():
+        checkpoint_path = [ckpt_base_dir]
+    elif ckpt_steps is not None:
+        checkpoint_path = [ckpt_base_dir / f"model_ckpt_steps_{int(ckpt_steps)}.ckpt"]
+    else:
+        base_dir = ckpt_base_dir
+        checkpoint_path = sorted(
+            [
+                ckpt_file
+                for ckpt_file in base_dir.iterdir()
+                if ckpt_file.is_file() and re.fullmatch("model_ckpt_steps_\\d+\\.ckpt", ckpt_file.name)
+            ],
+            key=lambda x: int(re.search("\\d+", x.name).group(0)),
+        )
+    assert len(checkpoint_path) > 0, f"| ckpt not found in {ckpt_base_dir}."
+    checkpoint_path = checkpoint_path[-1]
+    ckpt_loaded = paddle.load(path=str(checkpoint_path))
+    if isinstance(cur_model, CategorizedModule):
+        cur_model.check_category(ckpt_loaded.get("category"))
+    if key_in_ckpt is None:
+        state_dict = ckpt_loaded
+    else:
+        state_dict = ckpt_loaded[key_in_ckpt]
+    if prefix_in_ckpt is not None:
+        state_dict = OrderedDict(
+            {
+                k[len(prefix_in_ckpt) + 1 :]: v
+                for k, v in state_dict.items()
+                if k.startswith(f"{prefix_in_ckpt}.")
+                if all(not k.startswith(p) for p in ignored_prefixes)
+            }
+        )
+    if not strict:
+        cur_model_state_dict = cur_model.state_dict()
+        unmatched_keys = []
+        for key, param in state_dict.items():
+            if key in cur_model_state_dict:
+                new_param = cur_model_state_dict[key]
+                if tuple(new_param.shape) != tuple(param.shape):
+                    unmatched_keys.append(key)
+                    print("| Unmatched keys: ", key, tuple(new_param.shape), tuple(param.shape))
+        for key in unmatched_keys:
+            del state_dict[key]
+    cur_model.set_state_dict(state_dict=state_dict)
+    shown_model_name = "state dict"
+    if prefix_in_ckpt is not None:
+        shown_model_name = f"'{prefix_in_ckpt}'"
+    elif key_in_ckpt is not None:
+        shown_model_name = f"'{key_in_ckpt}'"
+    print(f"| load {shown_model_name} from '{checkpoint_path}'.")
+def remove_padding(x, padding_idx=0):
+    if x is None:
+        return None
+    assert len(tuple(x.shape)) in [1, 2]
+    if len(tuple(x.shape)) == 2:
+        return x[np.abs(x).sum(-1) != padding_idx]
+    elif len(tuple(x.shape)) == 1:
+        return x[x != padding_idx]
+class Timer:
+    timer_map = {}
+    def __init__(self, name, print_time=False):
+        if name not in Timer.timer_map:
+            Timer.timer_map[name] = 0
+        self.name = name
+        self.print_time = print_time
+    def __enter__(self):
+        self.t = time.time()
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        Timer.timer_map[self.name] += time.time() - self.t
+        if self.print_time:
+            print(self.name, Timer.timer_map[self.name])
+def print_arch(model, model_name="model"):
+    print(f"| {model_name} Arch: ", model)
+def num_params(model, print_out=True, model_name="model"):
+    parameters = filter(lambda p: not p.stop_gradient, model.parameters())
+    parameters = sum([np.prod(tuple(p.shape)) for p in parameters]) / 1000000
+    if print_out:
+        print(f"| {model_name} Trainable Parameters: %.3fM" % parameters)
+    return parameters
+def build_object_from_class_name(cls_str, parent_cls, *args, **kwargs):
+    import importlib
+    pkg = ".".join(cls_str.split(".")[:-1])
+    cls_name = cls_str.split(".")[-1]
+    cls_type = getattr(importlib.import_module(pkg), cls_name)
+    if parent_cls is not None:
+        assert issubclass(cls_type, parent_cls), f"| {cls_type} is not subclass of {parent_cls}."
+    return cls_type(*args, **filter_kwargs(kwargs, cls_type))
+def build_lr_scheduler_from_config(optimizer, scheduler_args):
+    # try:
+    # except ImportError:
+    from paddle.optimizer.lr import LRScheduler as LRScheduler
+    def helper(params):
+        if isinstance(params, list):
+            return [helper(s) for s in params]
+        elif isinstance(params, dict):
+            resolved = {k: helper(v) for k, v in params.items()}
+            if "cls" in resolved:
+                if (
+                    resolved["cls"] == "torch.optim.lr_scheduler.ChainedScheduler"
+                    and scheduler_args["scheduler_cls"] == "torch.optim.lr_scheduler.SequentialLR"
+                ):
+                    raise ValueError(f"ChainedScheduler cannot be part of a SequentialLR.")
+                resolved["optimizer"] = optimizer
+                obj = build_object_from_class_name(resolved["cls"], LRScheduler, **resolved)
+                return obj
+            return resolved
+        else:
+            return params
+    resolved = helper(scheduler_args)
+    resolved["optimizer"] = optimizer
+    return build_object_from_class_name(scheduler_args["scheduler_cls"], LRScheduler, **resolved)
+def simulate_lr_scheduler(optimizer_args, scheduler_args, step_count, num_param_groups=1):
+    optimizer = build_object_from_class_name(
+        optimizer_args["optimizer_cls"],
+        paddle.optimizer.Optimizer,
+        [
+            {
+                "params": paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.to_tensor([])),
+                "initial_lr": optimizer_args["lr"],
+            }
+            for _ in range(num_param_groups)
+        ],
+        **optimizer_args,
+    )
+    scheduler = build_lr_scheduler_from_config(optimizer, scheduler_args)
+    scheduler.optimizer._step_count = 1
+    for _ in range(step_count):
+        scheduler.step()
+    return scheduler.state_dict()
+def remove_suffix(string: str, suffix: str):
+    if string.endswith(suffix):
+        string = string[: -len(suffix)]
+    return string