tuandunghcmut committed on
Commit 727399d · verified · 1 Parent(s): 8ec1a3c

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. DeepSeek-VL2/vg.jpg +0 -0
  3. VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/README.md +110 -0
  4. VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/requirement.txt +3 -0
  5. VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/run_train.sh +78 -0
  6. VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/train_GOT.py +243 -0
  7. VLMEvalKit_old/PaddleMIX/paddlemix/examples/ppdocbee/app.py +350 -0
  8. VLMEvalKit_old/PaddleMIX/paddlemix/models/GOT/utils/conversation.py +400 -0
  9. VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py +13 -0
  10. VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py +103 -0
  11. VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py +28 -0
  12. VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py +62 -0
  13. VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/unet/attention.py +199 -0
  14. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_augmentation.py +46 -0
  15. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_binarizer.py +330 -0
  16. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_exporter.py +72 -0
  17. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_svs_infer.py +149 -0
  18. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_vocoder.py +37 -0
  19. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/aux_decoder/convnext.py +103 -0
  20. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/__init__.py +26 -0
  21. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/lynxnet.py +188 -0
  22. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/wavenet.py +120 -0
  23. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/common_layers.py +187 -0
  24. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/espnet_positional_embedding.py +129 -0
  25. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/compat.py +35 -0
  26. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/__init__.py +16 -0
  27. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/ddpm.py +521 -0
  28. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/reflow.py +311 -0
  29. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/acoustic_encoder.py +110 -0
  30. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/param_adaptor.py +88 -0
  31. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/tts_modules.py +473 -0
  32. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/variance_encoder.py +151 -0
  33. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/__init__.py +42 -0
  34. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/layers.py +140 -0
  35. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/nets.py +185 -0
  36. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/env.py +46 -0
  37. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/models.py +380 -0
  38. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/nvSTFT.py +104 -0
  39. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/utils.py +27 -0
  40. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/pm.py +30 -0
  41. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/__init__.py +19 -0
  42. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/constants.py +21 -0
  43. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/deepunet.py +194 -0
  44. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/inference.py +80 -0
  45. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/model.py +54 -0
  46. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/seq.py +30 -0
  47. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/spec.py +65 -0
  48. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/utils.py +54 -0
  49. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/toplevel.py +323 -0
  50. VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/utils/__init__.py +342 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-VL2/vg.jpg filter=lfs diff=lfs merge=lfs -text
DeepSeek-VL2/vg.jpg ADDED
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/README.md ADDED
@@ -0,0 +1,110 @@
# GOT-OCR2.0

## 1. Model Introduction

[GOT-OCR2.0](https://arxiv.org/abs/2409.01704) is a multimodal large model for general OCR tasks developed by StepFun and the University of Chinese Academy of Sciences. With 0.6B parameters, it is a breakthrough general-purpose OCR model designed to overcome the limitations of traditional OCR systems (OCR-1.0) and of current large vision-language models (LVLMs) on OCR tasks.

**Model weights supported by this repository:**

| Model |
|--------------------|
| stepfun-ai/GOT-OCR2_0 |

Note: the name matches the Hugging Face weights, but the tensors are stored in Paddle format; calling `xxx.from_pretrained("stepfun-ai/GOT-OCR2_0")` automatically downloads the weight folder into the cache directory.
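A minimal loading sketch (for illustration only; use `got_ocr2_0_infer.py` below for actual inference), based on the `GOTQwenForCausalLM` import used by this commit's `train_GOT.py`:

```python
from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM

# Resolves the Paddle-format weights by repo name and caches them locally.
model = GOTQwenForCausalLM.from_pretrained("stepfun-ai/GOT-OCR2_0", dtype="bfloat16")
model.eval()
```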

## 2. Environment Requirements

- **python >= 3.10**
- **paddlepaddle-gpu 3.0.0b2 or the develop build**
  ```
  # installation example
  python -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
  ```
- **paddlenlp == 3.0.0b3**
  ```
  # installation example
  python -m pip install paddlenlp==3.0.0b3
  ```
- **Other requirements**
  ```
  pip install -r paddlemix/examples/GOT_OCR_2_0/requirement.txt
  ```

## 3. Inference

Note: GOT-OCR2.0 inference needs about 4 GB of GPU memory; "float16" is not supported as the inference dtype.

### 3.1. Plain-text OCR
```bash
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
    --model_name_or_path stepfun-ai/GOT-OCR2_0 \
    --image_file paddlemix/demo_images/hospital.jpeg \
    --ocr_type ocr \
    --dtype "bfloat16"
```

### 3.2. Formatted-text OCR
```bash
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
    --model_name_or_path stepfun-ai/GOT-OCR2_0 \
    --image_file paddlemix/demo_images/hospital.jpeg \
    --ocr_type format \
    --dtype "bfloat16"
```

### 3.3. Multi-crop plain-text OCR
```bash
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
    --model_name_or_path stepfun-ai/GOT-OCR2_0 \
    --image_file paddlemix/demo_images/hospital.jpeg \
    --ocr_type ocr \
    --multi_crop \
    --dtype "bfloat16"
```

## 4. Training

As in the [official GitHub repository](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/?tab=readme-ov-file#train), only post-training (stage-2/stage-3) on top of the GOT weights is currently supported: stage-2 is full-parameter fine-tuning, and stage-3 fine-tunes with the vision encoder frozen. The default is stage-2 full-parameter fine-tuning, which uses about 10 GB of GPU memory per card.

### Dataset Download
The PaddleMIX team provides a modified SynthDoG-EN dataset in which every original question has been rewritten as ```<image>\nOCR:```. Download link:
```
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/synthdog_en.tar # 2.4G
```
synthdog_en.tar contains the images folder and the annotation JSON file; download and extract it (or symlink it) under the PaddleMIX/ directory.

### Dataset Format

Same as the [official example](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/blob/main/assets/train_sample.jpg): the question is always ```<image>\nOCR:``` and the answer is the corresponding OCR result.

### Training Command

```bash
sh paddlemix/examples/GOT_OCR_2_0/run_train.sh
```

Note: the default is stage-2 full-parameter fine-tuning (about 10 GB of GPU memory per card). You can also pass ```--freeze_vision_tower True``` to fine-tune with the vision encoder frozen.

### Inference After Training

```bash
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
    --model_name_or_path work_dirs/got_ocr_20/ \
    --image_file paddlemix/demo_images/hospital.jpeg \
    --ocr_type ocr
```

## References
```BibTeX
@article{wei2024general,
  title={General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model},
  author={Wei, Haoran and Liu, Chenglong and Chen, Jinyue and Wang, Jia and Kong, Lingyu and Xu, Yanming and Ge, Zheng and Zhao, Liang and Sun, Jianjian and Peng, Yuang and others},
  journal={arXiv preprint arXiv:2409.01704},
  year={2024}
}
```
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/requirement.txt ADDED
@@ -0,0 +1,3 @@
megfile
natsort
paddlenlp==3.0.0b3
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/run_train.sh ADDED
@@ -0,0 +1,78 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x

GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-32}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}

GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
tensor_parallel_degree=${tensor_parallel_degree:-1}
sharding_parallel_degree=$((GPUS / tensor_parallel_degree))

export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/got_ocr_20'

if [ ! -d "$OUTPUT_DIR" ]; then
    mkdir -p "$OUTPUT_DIR"
fi

TRAINING_MODEL_RESUME="None"
TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'

# --freeze_vision_tower False \ # True for stage3

TRAINING_PYTHON="python -m paddle.distributed.launch --master ${MASTER} --nnodes 1 --nproc_per_node ${GPUS} --rank 0 --ips ${TRAINER_INSTANCES} --run_mode=collective"
${TRAINING_PYTHON} --log_dir ${OUTPUT_DIR}/paddle_distributed_logs \
    paddlemix/examples/GOT_OCR_2_0/train_GOT.py \
    --do_train \
    --model_name_or_path "stepfun-ai/GOT-OCR2_0" \
    --output_dir ${OUTPUT_DIR} \
    --logging_dir ${OUTPUT_DIR}/logs \
    --meta_path paddlemix/examples/GOT_OCR_2_0/configs/demo_dataset.json \
    --overwrite_output_dir True \
    --dataloader_num_workers 8 \
    --bf16 True \
    --fp16 False \
    --fp16_opt_level "O2" \
    --num_train_epochs 1 \
    --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
    --gradient_accumulation_steps ${GRADIENT_ACC} \
    --freeze_vision_tower False \
    --use_im_start_end True \
    --max_seq_length 8192 \
    --recompute False \
    --max_grad_norm 1.0 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 200 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.001 \
    --optim "adamw" \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "visualdl" \
    --tensor_parallel_degree=${tensor_parallel_degree} \
    --sharding_parallel_degree=${sharding_parallel_degree} \
    --pipeline_parallel_degree=1 \
    --sep_parallel_degree=1 \
    --sharding="stage1" \
    2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
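The derived parallelism settings follow directly from the environment variables at the top of the script. A worked example under the default values (illustrative only; the variable names are the script's own):

```bash
# Defaults: GPUS=8, BATCH_SIZE=32, PER_DEVICE_BATCH_SIZE=1, tensor_parallel_degree=1
#   GRADIENT_ACC             = 32 / 1 / 8 = 4   # gradient-accumulation steps per update
#   sharding_parallel_degree = 8 / 1      = 8   # optimizer state sharded across all GPUs
# Effective global batch size = GPUS * PER_DEVICE_BATCH_SIZE * GRADIENT_ACC = 32
GPUS=8 BATCH_SIZE=32 PER_DEVICE_BATCH_SIZE=1 sh paddlemix/examples/GOT_OCR_2_0/run_train.sh
```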
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/train_GOT.py ADDED
@@ -0,0 +1,243 @@
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import paddle
import paddle.distributed as dist
from paddlenlp.trainer import PdArgumentParser, TrainingArguments, set_seed
from paddlenlp.trainer.trainer import Trainer
from paddlenlp.trainer.trainer_utils import get_last_checkpoint
from paddlenlp.transformers import QWenTokenizer

from paddlemix.datasets.got_dataset import make_supervised_data_module
from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM
from paddlemix.models.GOT.utils.utils import smart_tokenizer_and_embedding_resize

logger = logging.getLogger(__name__)


def print_trainable_params(model: paddle.nn.Layer) -> None:
    trainable_params, all_param = 0, 0
    for k, param in model.named_parameters():
        num_params = param.size
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if not param.stop_gradient:
            trainable_params += num_params
    print(
        "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
            trainable_params, all_param, 100 * trainable_params / all_param
        )
    )


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="stepfun-ai/GOT-OCR2_0")
    use_cache: bool = field(default=False)
    vision_tower: Optional[str] = field(default="openai/clip-vit-large-patch14")
    freeze_vision_tower: bool = field(default=False)
    freeze_lm_model: bool = field(default=False)
    pretrained_stage1_model: Optional[str] = field(default=None)  # mlp &/ vision tower
    vision_select_layer: Optional[int] = field(default=-1)  # default to the last layer
    use_im_start_end: bool = field(default=False)


@dataclass
class DataArguments:
    datasets: str = field(default=None, metadata={"help": "combinations of the training data."})
    meta_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the meta file of datasets."},
    )
    sep_image_conv_front: bool = False
    image_token_len: int = 256
    image_aspect_ratio: str = "square"
    conversation_version: str = "mpt"
    box_limit: int = 0
    max_seq_length: int = 8192


@dataclass
class GOTTrainingArguments(TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    remove_unused_columns: bool = field(default=False)
    force_fsdp: bool = field(default=False)
    interleave: bool = field(default=False)
    with_box: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    lora_enable: bool = False
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"


def train():
    parser = PdArgumentParser((ModelArguments, DataArguments, GOTTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script, and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    # Detecting last checkpoint and eventually continue from last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Load model
    if training_args.fp16_opt_level == "O2":
        if training_args.fp16:
            dtype = "float16"
        elif training_args.bf16 and paddle.amp.is_bfloat16_supported():
            dtype = "bfloat16"
        else:
            raise ValueError("Please specify dtype: --fp16 or --bf16")
    else:
        dtype = "float32"

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load pretrained model, tokenizer, and image processor
    tokenizer_path = model_args.model_name_or_path
    print(f"Loading Tokenizer: {tokenizer_path}")

    tokenizer = QWenTokenizer.from_pretrained(
        model_args.model_name_or_path, padding_side="right", model_max_length=training_args.model_max_length
    )
    print("tokenizer", tokenizer)

    model = GOTQwenForCausalLM.from_pretrained(model_args.model_name_or_path, dtype=dtype)

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token="<|endoftext|>"),
        tokenizer=tokenizer,
        model=model,
    )

    vision_tower_dict = model.get_model().initialize_vision_modules(
        vision_tower=model_args.vision_tower,
        pretrained_stage1_model=model_args.pretrained_stage1_model,
        freeze_vision_tower=model_args.freeze_vision_tower,
        use_im_start_end=model_args.use_im_start_end,
        vision_select_layer=model_args.vision_select_layer,
        dtype=dtype,
    )

    model.initialize_vision_tokenizer(
        tokenizer=tokenizer,
        freeze_lm_model=model_args.freeze_lm_model,
        pretrained_stage1_model=model_args.pretrained_stage1_model,
    )

    data_args.image_token_len = 256
    data_args.image_processor = vision_tower_dict["image_processor"]
    data_args.image_processor_high = vision_tower_dict["image_processor_high"]
    data_args.use_im_start_end = model_args.use_im_start_end

    def _freeze_params(module):
        # Paddle convention: a parameter is frozen by setting stop_gradient = True.
        for param in module.parameters():
            param.stop_gradient = True

    # mixed relation, to be fixed
    if model_args.freeze_lm_model:
        _freeze_params(model.get_model().mm_projector)
        _freeze_params(model.get_model().mm_projector_vary)
        _freeze_params(model.get_input_embeddings())

    if model_args.freeze_vision_tower:
        _freeze_params(model.qwen2.vision_tower_high)

    print_trainable_params(model)
    # trainable params: 464959488 || all params: 560528640 || trainable%: 82.9502  # stage3
    # trainable params: 560528640 || all params: 560528640 || trainable%: 100      # stage2
    params_grad = [p.numel() for n, p in model.named_parameters() if not p.stop_gradient]
    print(f"Number of Mapping Trainable Parameters: {int(sum(params_grad)) / (1 << 20):.2f} M")

    # print trainable parameters
    if dist.get_rank() == 0:
        for name, param in model.named_parameters():
            if not param.stop_gradient:
                logger.info(name)

    # set seed for paddle dataloaders
    set_seed(training_args.seed)

    data_module = make_supervised_data_module(
        interleave=training_args.interleave, with_box=training_args.with_box, tokenizer=tokenizer, data_args=data_args
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        **data_module,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        try:
            metrics["train_samples"] = len(data_module["train_dataset"])
        except Exception:
            metrics["train_samples"] = -1

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()


if __name__ == "__main__":
    train()
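`_freeze_params` above relies on Paddle's `stop_gradient` flag rather than a `requires_grad` attribute. A standalone sketch of that convention (illustrative only, not part of the training script):

```python
import paddle

# Freezing in Paddle: set stop_gradient = True on each parameter,
# exactly as _freeze_params does in train_GOT.py.
layer = paddle.nn.Linear(4, 4)
for p in layer.parameters():
    p.stop_gradient = True

# No parameter receives gradients now; print_trainable_params would
# report 0 trainable parameters for this layer.
print(all(p.stop_gradient for p in layer.parameters()))  # True
```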
VLMEvalKit_old/PaddleMIX/paddlemix/examples/ppdocbee/app.py ADDED
@@ -0,0 +1,350 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import hashlib
import os
import os.path
import sys
import tempfile
import time
from datetime import datetime

import gradio as gr
import numpy as np
import paddle
from PIL import Image

# Select the GPU device to use
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Model configuration
model_path = "PaddleMIX/PPDocBee-2B-1129"
dtype = "bfloat16"  # change to float16 on V100

# Global variables
model = None
processor = None

min_pixels = 256 * 28 * 28  # minimum number of pixels
max_pixels = 48 * 48 * 28 * 28  # maximum number of pixels

SERVER_NAME = "localhost"
SERVER_PORT = 8080


def check_and_install_paddlemix():
    try:
        from paddlemix.models.qwen2_vl.modeling_qwen2_vl import (
            Qwen2VLForConditionalGeneration,
        )

        print("Required Qwen2VL model successfully imported")
    except ImportError:
        print("Required Qwen2VL model is not available; please install PaddleMIX first")
        sys.exit(1)


# Check the required model before continuing
check_and_install_paddlemix()


from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer
from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
from paddlemix.processors.qwen2_vl_processing import (
    Qwen2VLImageProcessor,
    Qwen2VLProcessor,
    process_vision_info,
)

# Example questions and demo images
EXAMPLES = [
    [
        "维修保养、其他注意事项的注意点中,电池需为什么型号的?",
        "paddlemix/demo_images/shuomingshu_20.png",
    ],
    [
        "产品期限是多久?",
        "paddlemix/demo_images/shuomingshu_39.png",
    ],
]


class ImageCache:
    """Image cache manager"""

    def __init__(self):
        """Initialize the image cache"""
        self.temp_dir = tempfile.mkdtemp()
        self.current_image = None
        self.is_example = False  # whether the current image is an example image
        print(f"Created temporary directory for image cache: {self.temp_dir}")

    def cleanup_previous(self):
        """Clean up the previously cached image"""
        if self.current_image and os.path.exists(self.current_image) and not self.is_example:
            try:
                os.unlink(self.current_image)
                print(f"Cleaned up previous image: {self.current_image}")
            except Exception as e:
                print(f"Error cleaning up previous image: {e}")

    def cache_image(self, image_path, is_example=False):
        """
        Cache an image and return the cached path
        Args:
            image_path: path of the image file
            is_example: whether the image is an example image
        Returns:
            Path of the cached image
        """
        if not image_path:
            return None

        try:
            # If this example image is already in use, return it directly
            if is_example and self.current_image == image_path and self.is_example:
                return self.current_image

            # Build a safe file name
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            file_hash = hashlib.md5(str(time.time()).encode()).hexdigest()[:8]
            _, ext = os.path.splitext(image_path)
            if not ext:
                ext = ".jpg"  # default extension
            new_filename = f"image_{timestamp}_{file_hash}{ext}"

            # New path inside the temporary directory
            new_path = os.path.join(self.temp_dir, new_filename) if not is_example else image_path

            if not is_example:
                # Process the uploaded image file
                with Image.open(image_path) as img:
                    # Convert to RGB if necessary
                    if img.mode != "RGB":
                        img = img.convert("RGB")
                    img.save(new_path)

            # Clean up the previous image before updating the current one
            self.cleanup_previous()

            self.current_image = new_path
            self.is_example = is_example

            return new_path

        except Exception as e:
            print(f"Error caching image: {e}")
            return image_path


# Global image cache manager
image_cache = ImageCache()


def load_model():
    """Load the model with memory optimizations"""
    global model, processor

    if model is None:
        # Load the model and processor
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path,
            dtype=dtype,
        )
        image_processor = Qwen2VLImageProcessor()
        tokenizer = MIXQwen2Tokenizer.from_pretrained(model_path)
        processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)

        # Switch to evaluation mode
        model.eval()
        del tokenizer
    return model, processor


def clear_cache():
    """Clear the GPU cache"""
    if paddle.device.cuda.memory_allocated() > 0:
        paddle.device.cuda.empty_cache()
        import gc

        gc.collect()


def multimodal_understanding(image, question, seed=42, top_p=0.95, temperature=0.1):
    """
    Main multimodal understanding function
    Args:
        image: input image
        question: question text
        seed: random seed
        top_p: sampling parameter
        temperature: temperature parameter
    Yields:
        Processing status and results
    """
    # Input validation
    if not image:
        yield "⚠️ 请上传图片后再开始对话。"
        return
    if not question or question.strip() == "":
        yield "⚠️ 请输入您的问题后再开始对话。"
        return

    try:
        start_time = time.time()
        yield "🔄 正在处理您的请求,请稍候..."

        # Timeout check
        if time.time() - start_time > 200:
            yield "⏳ 系统当前用户繁多,请等待10分钟后再次尝试。感谢您的理解!"
            return

        clear_cache()

        # Set random seeds
        paddle.seed(seed)
        np.random.seed(seed)

        # Handle image caching
        is_example = any(image == example[1] for example in EXAMPLES)
        cached_image = image_cache.cache_image(image, is_example=is_example)
        if not cached_image:
            # This is a generator, so the message must be yielded, not returned.
            yield "图片处理失败,请检查图片格式是否正确。"
            return

        # Build the prompt text
        prompts = question + "\n请用图片中完整出现的内容回答,可以是单词、短语或句子,针对问题回答尽可能详细和完整,并保持格式、单位、符号和标点都与图片中的文字内容完全一致。"

        # Build the messages
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": cached_image,
                    },
                    {"type": "text", "text": prompts},
                ],
            }
        ]

        yield "模型正在分析图片内容..."

        # Process visual inputs
        image_inputs, video_inputs = process_vision_info(messages)
        image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>"
        text = f"<|im_start|>system\n你是一个非常棒的多模态理解的AI助手。<|im_end|>\n<|im_start|>user\n{image_pad_token}{prompts}<|im_end|>\n<|im_start|>assistant\n"

        # Generate the answer
        with paddle.no_grad():
            inputs = processor(
                text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pd"
            )

            yield "正在生成回答..."

            generated_ids = model.generate(
                **inputs,
                max_new_tokens=1024,
                top_p=top_p,
                temperature=temperature,
                num_beams=1,
                do_sample=True,
                use_cache=True,
            )

            output_text = processor.batch_decode(
                generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

            # Free memory
            del inputs, generated_ids
            clear_cache()

            yield output_text

    except Exception as e:
        error_message = f"处理过程中出现错误: {str(e)}\n请重试或在评论区留下你的问题。"
        # Yield (not return) so the error message actually reaches the UI.
        yield error_message


def process_example(question, image):
    """Wrapper for handling example images"""
    cached_path = image_cache.cache_image(image, is_example=True)
    return multimodal_understanding(cached_path, question)


def handle_image_upload(image):
    """Handle image uploads"""
    if image is None:
        return None
    try:
        cached_path = image_cache.cache_image(image, is_example=False)
        return cached_path
    except Exception as e:
        print(f"Error handling image upload: {e}")
        return None


# Gradio interface configuration
with gr.Blocks() as demo:
    gr.Markdown(
        value="""
        # 🤖 PP-DocBee(2B): Multimodal Document Understanding Demo

        📚 原始模型来自 [PaddleMIX](https://github.com/PaddlePaddle/PaddleMIX) (🌟 一个基于飞桨PaddlePaddle框架构建的多模态大模型套件)
        """
    )
    with gr.Row():
        image_input = gr.Image(type="filepath", label="📷 Upload Image or Input URL")
        with gr.Column():
            question_input = gr.Textbox(label="💭 Question", placeholder="Enter your question here...")
            und_seed_input = gr.Number(label="🎲 Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="📊 Top P")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="🌡️ Temperature")

    image_input.upload(fn=handle_image_upload, inputs=[image_input], outputs=[image_input])

    understanding_button = gr.Button("💬 Chat", variant="primary")
    understanding_output = gr.Textbox(label="🤖 Response", interactive=False)

    gr.Examples(
        examples=EXAMPLES,
        inputs=[question_input, image_input],
        outputs=understanding_output,
        fn=process_example,
        cache_examples=True,
        run_on_click=True,
    )

    # Load the model
    clear_cache()
    model, processor = load_model()
    clear_cache()

    understanding_button.click(
        fn=multimodal_understanding,
        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
        outputs=understanding_output,
        api_name="chat",
    )

if __name__ == "__main__":
    # Create the request queue
    demo.queue()
    demo.launch(server_name=SERVER_NAME, server_port=SERVER_PORT, share=True, ssr_mode=False, max_threads=1)  # limit concurrent requests
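Because the Chat button is registered with `api_name="chat"`, the endpoint can also be driven programmatically. A hedged sketch using `gradio_client` (the positional argument order mirrors the `inputs` list above; the exact encoding expected for the image argument can vary across gradio_client versions):

```python
from gradio_client import Client

client = Client("http://localhost:8080/")
# Positional args mirror [image_input, question_input, und_seed_input, top_p, temperature].
answer = client.predict(
    "paddlemix/demo_images/shuomingshu_39.png",  # image (filepath)
    "产品期限是多久?",                            # question
    42,    # seed
    0.95,  # top_p
    0.1,   # temperature
    api_name="/chat",
)
print(answer)
```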
VLMEvalKit_old/PaddleMIX/paddlemix/models/GOT/utils/conversation.py ADDED
@@ -0,0 +1,400 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from enum import Enum, auto
from typing import List


class SeparatorStyle(Enum):
    """Different separator styles."""

    SINGLE = auto()
    TWO = auto()
    MPT = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""

    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "<|im_end|>"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep + "\n"
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.MPT:
            if self.system:
                ret = self.system + self.sep
            else:
                ret = ""
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO

                    from PIL import Image

                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":

                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(pil_img.mode, (width, width), background_color)
                                # result.paste(pil_img, (0, (width - height) // 2))
                                result.paste(pil_img)
                                return result
                            else:
                                result = Image.new(pil_img.mode, (height, height), background_color)
                                # result.paste(pil_img, ((height - width) // 2, 0))
                                result.paste(pil_img)
                                return result

                        image = expand2square(image)
                    elif image_process_mode == "Crop":
                        max_hw, min_hw = max(image.size), min(image.size)
                        aspect_ratio = max_hw / min_hw
                        max_len, min_len = 800, 400
                        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                        longest_edge = int(shortest_edge * aspect_ratio)
                        W, H = image.size
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        image = image.resize((W, H))
                    elif image_process_mode == "Resize":
                        image = image.resize((224, 224))
                    else:
                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")

                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.convert("RGB").save(buffered, format="JPEG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO

                    msg, image, image_process_mode = msg
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                    msg = msg.replace("<image>", img_str)
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
        )

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Give three tips for staying healthy."),
        (
            "Assistant",
            "Sure, here are three tips for staying healthy:\n"
            "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
            "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
            "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
            "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
            "activities at least two days per week.\n"
            "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
            "vegetables, whole grains, lean proteins, and healthy fats can help support "
            "your overall health. Try to limit your intake of processed and high-sugar foods, "
            "and aim to drink plenty of water throughout the day.\n"
            "3. Get enough sleep: Getting enough quality sleep is essential for your physical "
            "and mental health. Adults should aim for seven to nine hours of sleep per night. "
            "Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
            "help improve the quality of your sleep.",
        ),
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_v1_2 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        (
            "Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
        ),
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1_1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)


conv_mpt = Conversation(
    system="""<|im_start|>system
You should follow the instructions carefully and explain your answers in detail.""",
    # system = None,
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_mpt_eval = Conversation(
    system="",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_mpt_text = Conversation(
    system="""<|im_start|>system
- You are a helpful assistant chatbot trained by MosaicML.
- You answer questions.
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_bair_v1 = Conversation(
    system="BEGINNING OF CONVERSATION:",
    roles=("USER", "GPT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)


simple_conv = Conversation(
    system="",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)


simple_conv_multimodal = Conversation(
    system="You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
    "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "Follow the instructions carefully and explain your answers in detail.",
    # system="",
    roles=("Human", "Assistant"),
    messages=(("Human", "Hi!"), ("Assistant", "Hi there! How can I help you today?\n")),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)


simple_conv_mpt_multimodal = Conversation(
    system="""<|im_start|>system
- You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology.
- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
- You should follow the instructions carefully and explain your answers in detail.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)


simple_conv_legacy = Conversation(
    system="You are GOT, a large language model trained by Foundation Model Group, Megvii Technology."
    "You are designed to assist human with a variety of tasks using natural language."
    "Follow the instructions carefully.",
    roles=("Human", "Assistant"),
    messages=(("Human", "Hi!\n\n### Response:"), ("Assistant", "Hi there! How can I help you today?\n")),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)


conv_llava_v1 = Conversation(
    system="You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
    "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "Follow the instructions carefully and explain your answers in detail.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

default_conversation = conv_mpt
conv_templates = {
    "default": simple_conv_multimodal,
    "simple": simple_conv,
    "simple_legacy": simple_conv_legacy,
    "multimodal": simple_conv,
    "mpt_multimodal": simple_conv_mpt_multimodal,
    "llava_v1": conv_llava_v1,
    "mpt_eval": conv_mpt_eval,
    # fastchat
    "v1": conv_vicuna_v1_1,
    "bair_v1": conv_bair_v1,
    "vicuna_v1_1": conv_vicuna_v1_1,
    "mpt": conv_mpt,
    "mpt_text": conv_mpt_text,
}
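A small usage sketch (hypothetical, for illustration) of the MPT-style template that `default_conversation` points to, showing how a GOT-OCR prompt is assembled:

```python
conv = conv_templates["mpt"].copy()
conv.append_message(conv.roles[0], "<image>\nOCR: ")
conv.append_message(conv.roles[1], None)

# Produces the system block, then "<|im_start|>user\n<image>\nOCR: <|im_end|>",
# followed by the open "<|im_start|>assistant\n" turn left for generation.
prompt = conv.get_prompt()
print(prompt)
```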
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py ADDED
@@ -0,0 +1,13 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py ADDED
@@ -0,0 +1,103 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" from https://github.com/keithito/tacotron """

import re
from unidecode import unidecode
from phonemizer import phonemize

__all__ = [
    "basic_cleaners",
    "transliteration_cleaners",
    "english_cleaners",
    "english_cleaners2"
]

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_abbreviations(text)
    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
    phonemes = collapse_whitespace(phonemes)
    return phonemes


def english_cleaners2(text):
    '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_abbreviations(text)
    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
    phonemes = collapse_whitespace(phonemes)
    return phonemes
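The espeak-backed cleaners need the `phonemizer` backend installed, but the deterministic helpers can be exercised on their own. A quick illustrative sketch:

```python
print(basic_cleaners("Hello   World!"))
# -> "hello world!"  (lowercased, whitespace collapsed)

print(expand_abbreviations("Dr. Smith met Mr. Lee"))
# -> "doctor Smith met mister Lee"  (case-insensitive match, lowercase replacement)
```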
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py ADDED
@@ -0,0 +1,28 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Defines the set of symbols used in text input to the model.
'''
_pad = '_'
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"


# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Special symbol ids
SPACE_ID = symbols.index(" ")
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py ADDED
@@ -0,0 +1,62 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" from https://github.com/keithito/tacotron """
+
+from .cleaners import *
+from .symbols import symbols
+
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+cleaner = english_cleaners2
+
+def text_to_sequence(text, cleaner_names):
+    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+        text: string to convert to a sequence
+        cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+        List of integers corresponding to the symbols in the text
+    '''
+    sequence = []
+
+    clean_text = _clean_text(text, cleaner_names)
+    for symbol in clean_text:
+        symbol_id = _symbol_to_id[symbol]
+        sequence += [symbol_id]
+    return sequence
+
+def cleaned_text_to_sequence(cleaned_text):
+    '''Converts a string of already-cleaned text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+        cleaned_text: string to convert to a sequence
+    Returns:
+        List of integers corresponding to the symbols in the text
+    '''
+    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+    return sequence
+
+def sequence_to_text(sequence):
+    '''Converts a sequence of IDs back to a string'''
+    result = ''
+    for symbol_id in sequence:
+        s = _id_to_symbol[symbol_id]
+        result += s
+    return result
+
+def _clean_text(text, cleaner_names):
+    # Note: cleaner_names is unused here; the module-level `cleaner` is always applied.
+    text = cleaner(text)
+    return text
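A quick round-trip sketch of the symbol/ID mapping above. It rebuilds a reduced symbol table inline (IPA letters omitted) rather than importing the package, so no install paths are assumed:

    _pad = '_'
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    symbols = [_pad] + list(_punctuation) + list(_letters)
    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
    _id_to_symbol = {i: s for i, s in enumerate(symbols)}

    seq = [_symbol_to_id[s] for s in "hello world!"]   # cleaned_text_to_sequence
    text = "".join(_id_to_symbol[i] for i in seq)      # sequence_to_text
    assert text == "hello world!"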
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/unet/attention.py ADDED
@@ -0,0 +1,199 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+from ppdiffusers.models.attention import GEGLU
+from einops import rearrange, repeat
+from ..diffusionwrapper import default
+
+def Normalize(in_channels):
+    return nn.GroupNorm(
+        num_groups=32, num_channels=in_channels, epsilon=1e-6
+    )
+
+class FeedForward(nn.Layer):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        project_in = (
+            nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
+            if not glu
+            else GEGLU(dim, inner_dim)
+        )
+
+        self.net = nn.Sequential(
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class CrossAttention(nn.Layer):
+    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+
+        self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias_attr=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias_attr=False)
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+
+    def forward(self, x, context=None, mask=None):
+        h = self.heads
+
+        q = self.to_q(x)
+        context = default(context, x)
+
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
+
+        sim = paddle.einsum("b i d, b j d -> b i j", q, k) * self.scale
+
+        if mask is not None:
+            mask = rearrange(mask, "b ... -> b (...)")
+            max_neg_value = -paddle.finfo(sim.dtype).max
+            mask = repeat(mask, "b j -> (b h) () j", h=h)
+            tmp = paddle.full(sim.shape, max_neg_value, sim.dtype)
+            sim = paddle.where(~(mask == 1), tmp, sim)
+
+        # attention, what we cannot get enough of
+        attn = nn.functional.softmax(sim, axis=-1)
+        out = paddle.einsum("b i j, b j d -> b i d", attn, v)
+        out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
+        return self.to_out(out)
+
+
+class LinearAttention(nn.Layer):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2D(dim, hidden_dim * 3, 1, bias_attr=False)
+        self.to_out = nn.Conv2D(hidden_dim, dim, 1)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(
+            qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
+        )
+        k = nn.functional.softmax(k, axis=-1)
+        context = paddle.einsum("bhdn,bhen->bhde", k, v)
+        out = paddle.einsum("bhde,bhdn->bhen", context, q)
+        out = rearrange(
+            out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
+        )
+        return self.to_out(out)
+
+class BasicTransformerBlock(nn.Layer):
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+    ):
+        super().__init__()
+        self.attn1 = CrossAttention(
+            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
+        )  # is a self-attention
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = CrossAttention(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+        )  # is self-attn if context is none
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+
+    def forward(self, x, context=None, mask=None):
+        x = self.attn1(self.norm1(x)) + x
+        x = self.attn2(self.norm2(x), context=context, mask=mask) + x
+        x = self.ff(self.norm3(x)) + x
+        return x
+
+class SpatialTransformer(nn.Layer):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        n_heads,
+        d_head,
+        depth=1,
+        dropout=0.0,
+        context_dim=None,
+    ):
+        super().__init__()
+
+        context_dim = context_dim
+
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)
+
+        self.proj_in = nn.Conv2D(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+        )
+
+        self.transformer_blocks = nn.LayerList(
+            [
+                BasicTransformerBlock(
+                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
+                )
+                for d in range(depth)
+            ]
+        )
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Constant(value=0.0)
+        )
+        self.proj_out = nn.Conv2D(inner_dim, in_channels, kernel_size=1, stride=1, padding=0, weight_attr=weight_attr)
+
+    def forward(self, x, context=None, mask=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, c, h, w = x.shape
+        x_in = x
+        x = self.norm(x)
+        x = self.proj_in(x)
+        x = rearrange(x, "b c h w -> b (h w) c")
+        for block in self.transformer_blocks:
+            x = block(x, context=context, mask=mask)
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        x = self.proj_out(x)
+        return x + x_in
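For reference, a minimal shape sketch of the attention math inside CrossAttention.forward, with batch and heads already folded together; all sizes below are made up for illustration:

    import paddle

    q = paddle.randn([2 * 8, 10, 64])  # (b*h, n_q, d) after rearrange
    k = paddle.randn([2 * 8, 16, 64])  # (b*h, n_k, d)
    v = paddle.randn([2 * 8, 16, 64])
    sim = paddle.einsum("b i d, b j d -> b i j", q, k) * 64 ** -0.5
    attn = paddle.nn.functional.softmax(sim, axis=-1)
    out = paddle.einsum("b i j, b j d -> b i d", attn, v)  # (b*h, 10, 64)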
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_augmentation.py ADDED
@@ -0,0 +1,46 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from utils.hparams import hparams
+
+
+class BaseAugmentation:
+    """
+    Base class for data augmentation.
+    All methods of this class should be thread-safe.
+    1. *process_item*:
+        Apply augmentation to one piece of data.
+    """
+
+    def __init__(self, data_dirs: list, augmentation_args: dict):
+        self.raw_data_dirs = data_dirs
+        self.augmentation_args = augmentation_args
+        self.timestep = hparams["hop_size"] / hparams["audio_sample_rate"]
+
+    def process_item(self, item: dict, **kwargs) -> dict:
+        raise NotImplementedError()
+
+
+def require_same_keys(func):
+    def run(*args, **kwargs):
+        item: dict = args[1]
+        res: dict = func(*args, **kwargs)
+        assert set(item.keys()) == set(
+            res.keys()
+        ), f"""Item keys mismatch after augmentation.
+Before: {sorted(item.keys())}
+After: {sorted(res.keys())}"""
+        return res
+
+    return run
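A minimal subclass sketch showing how the decorator is meant to wrap process_item. The class name is illustrative and the import path is an assumption based on this diff's file location; the base class also needs hparams populated before instantiation:

    from paddlemix.models.diffsinger.basics.base_augmentation import (  # assumed path
        BaseAugmentation, require_same_keys)

    class NoOpAugmentation(BaseAugmentation):  # hypothetical example, not from the repo
        @require_same_keys
        def process_item(self, item: dict, **kwargs) -> dict:
            return dict(item)  # same key set in and out, so the decorator's assert passes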
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_binarizer.py ADDED
@@ -0,0 +1,330 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import pathlib
+import pickle
+import random
+import shutil
+import warnings
+from copy import deepcopy
+
+import numpy as np
+import paddle
+from tqdm import tqdm
+from utils.hparams import hparams
+from utils.indexed_datasets import IndexedDatasetBuilder
+from utils.multiprocess_utils import chunked_multiprocess_run
+from utils.phoneme_utils import build_phoneme_list, locate_dictionary
+from utils.plot import distribution_to_figure
+from utils.text_encoder import TokenTextEncoder
+
+
+class BinarizationError(Exception):
+    pass
+
+
+class BaseBinarizer:
+    """
+    Base class for data processing.
+    1. *process* and *process_data_split*:
+        process entire data, generate the train-test split (support parallel processing);
+    2. *process_item*:
+        process singe piece of data;
+    3. *get_pitch*:
+        infer the pitch using some algorithm;
+    4. *get_align*:
+        get the alignment using 'mel2ph' format (see https://arxiv.org/abs/1905.09263).
+    5. phoneme encoder, voice encoder, etc.
+
+    Subclasses should define:
+    1. *load_metadata*:
+        how to read multiple datasets from files;
+    2. *train_item_names*, *valid_item_names*, *test_item_names*:
+        how to split the dataset;
+    3. load_ph_set:
+        the phoneme set.
+    """
+
+    def __init__(self, data_dir=None, data_attrs=None):
+        if data_dir is None:
+            data_dir = hparams["raw_data_dir"]
+        if not isinstance(data_dir, list):
+            data_dir = [data_dir]
+        self.raw_data_dirs = [pathlib.Path(d) for d in data_dir]
+        self.binary_data_dir = pathlib.Path(hparams["binary_data_dir"])
+        self.data_attrs = [] if data_attrs is None else data_attrs
+        self.binarization_args = hparams["binarization_args"]
+        self.augmentation_args = hparams.get("augmentation_args", {})
+        self.device = str("cuda" if paddle.device.cuda.device_count() >= 1 else "cpu").replace("cuda", "gpu")
+        self.spk_map = None
+        self.spk_ids = hparams["spk_ids"]
+        self.speakers = hparams["speakers"]
+        self.build_spk_map()
+        self.items = {}
+        self.item_names: list = None
+        self._train_item_names: list = None
+        self._valid_item_names: list = None
+        self.phone_encoder = TokenTextEncoder(vocab_list=build_phoneme_list())
+        self.timestep = hparams["hop_size"] / hparams["audio_sample_rate"]
+
+    def build_spk_map(self):
+        assert isinstance(self.speakers, list), "Speakers must be a list"
+        assert len(self.speakers) == len(
+            self.raw_data_dirs
+        ), "Number of raw data dirs must equal number of speaker names!"
+        if len(self.spk_ids) == 0:
+            self.spk_ids = list(range(len(self.raw_data_dirs)))
+        else:
+            assert len(self.spk_ids) == len(
+                self.raw_data_dirs
+            ), "Length of explicitly given spk_ids must equal the number of raw datasets."
+        assert (
+            max(self.spk_ids) < hparams["num_spk"]
+        ), f"Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk."
+        self.spk_map = {}
+        for spk_name, spk_id in zip(self.speakers, self.spk_ids):
+            if spk_name in self.spk_map and self.spk_map[spk_name] != spk_id:
+                raise ValueError(
+                    f"Invalid speaker ID assignment. Name '{spk_name}' is assigned with different speaker IDs: {self.spk_map[spk_name]} and {spk_id}."
+                )
+            self.spk_map[spk_name] = spk_id
+        print("| spk_map: ", self.spk_map)
+
+    def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id):
+        raise NotImplementedError()
+
+    def split_train_valid_set(self, item_names):
+        """
+        Split the dataset into training set and validation set.
+        :return: train_item_names, valid_item_names
+        """
+        prefixes = {str(pr): (1) for pr in hparams["test_prefixes"]}
+        valid_item_names = {}
+        for prefix in deepcopy(prefixes):
+            if prefix in item_names:
+                valid_item_names[prefix] = 1
+                prefixes.pop(prefix)
+        for prefix in deepcopy(prefixes):
+            matched = False
+            for name in item_names:
+                if name.split(":")[-1] == prefix:
+                    valid_item_names[name] = 1
+                    matched = True
+            if matched:
+                prefixes.pop(prefix)
+        for prefix in deepcopy(prefixes):
+            matched = False
+            for name in item_names:
+                if name.startswith(prefix):
+                    valid_item_names[name] = 1
+                    matched = True
+            if matched:
+                prefixes.pop(prefix)
+        for prefix in deepcopy(prefixes):
+            matched = False
+            for name in item_names:
+                if name.split(":")[-1].startswith(prefix):
+                    valid_item_names[name] = 1
+                    matched = True
+            if matched:
+                prefixes.pop(prefix)
+        if len(prefixes) != 0:
+            warnings.warn(
+                f"The following rules in test_prefixes have no matching names in the dataset: {', '.join(prefixes.keys())}",
+                category=UserWarning,
+            )
+            warnings.filterwarnings("default")
+        valid_item_names = list(valid_item_names.keys())
+        assert len(valid_item_names) > 0, "Validation set is empty!"
+        train_item_names = [x for x in item_names if x not in set(valid_item_names)]
+        assert len(train_item_names) > 0, "Training set is empty!"
+        return train_item_names, valid_item_names
+
+    @property
+    def train_item_names(self):
+        return self._train_item_names
+
+    @property
+    def valid_item_names(self):
+        return self._valid_item_names
+
+    def meta_data_iterator(self, prefix):
+        if prefix == "train":
+            item_names = self.train_item_names
+        else:
+            item_names = self.valid_item_names
+        for item_name in item_names:
+            meta_data = self.items[item_name]
+            yield item_name, meta_data
+
+    def process(self):
+        for ds_id, spk_id, data_dir in zip(range(len(self.raw_data_dirs)), self.spk_ids, self.raw_data_dirs):
+            self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk_id=spk_id)
+        self.item_names = sorted(list(self.items.keys()))
+        self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names)
+        if self.binarization_args["shuffle"]:
+            random.shuffle(self.item_names)
+        self.binary_data_dir.mkdir(parents=True, exist_ok=True)
+        spk_map_fn = self.binary_data_dir / "spk_map.json"
+        with open(spk_map_fn, "w", encoding="utf-8") as f:
+            json.dump(self.spk_map, f)
+        shutil.copy(locate_dictionary(), self.binary_data_dir / "dictionary.txt")
+        self.check_coverage()
+        try:
+            self.process_dataset("valid")
+            self.process_dataset(
+                "train",
+                num_workers=int(self.binarization_args["num_workers"]),
+                apply_augmentation=any(args["enabled"] for args in self.augmentation_args.values()),
+            )
+        except KeyboardInterrupt:
+            exit(-1)
+
+    def check_coverage(self):
+        ph_required = set(build_phoneme_list())
+        phoneme_map = {}
+        for ph in ph_required:
+            phoneme_map[ph] = 0
+        ph_occurred = []
+        for item_name in self.items:
+            ph_occurred += self.items[item_name]["ph_seq"]
+            if len(ph_occurred) == 0:
+                raise BinarizationError(f"Empty tokens in {item_name}.")
+        for ph in ph_occurred:
+            if ph not in ph_required:
+                continue
+            phoneme_map[ph] += 1
+        ph_occurred = set(ph_occurred)
+        print("===== Phoneme Distribution Summary =====")
+        for i, key in enumerate(sorted(phoneme_map.keys())):
+            if i == len(ph_required) - 1:
+                end = "\n"
+            elif i % 10 == 9:
+                end = ",\n"
+            else:
+                end = ", "
+            print(f"'{key}': {phoneme_map[key]}", end=end)
+        x = sorted(phoneme_map.keys())
+        values = [phoneme_map[k] for k in x]
+        plt = distribution_to_figure(
+            title="Phoneme Distribution Summary",
+            x_label="Phoneme",
+            y_label="Number of occurrences",
+            items=x,
+            values=values,
+        )
+        filename = self.binary_data_dir / "phoneme_distribution.jpg"
+        plt.savefig(fname=filename, bbox_inches="tight", pad_inches=0.25)
+        print(f"| save summary to '{filename}'")
+        if ph_occurred != ph_required:
+            unrecognizable_phones = ph_occurred.difference(ph_required)
+            missing_phones = ph_required.difference(ph_occurred)
+            raise BinarizationError(
+                f"""transcriptions and dictionary mismatch.
+ (+) {sorted(unrecognizable_phones)}
+ (-) {sorted(missing_phones)}"""
+            )
+
+    def process_dataset(self, prefix, num_workers=0, apply_augmentation=False):
+        args = []
+        builder = IndexedDatasetBuilder(self.binary_data_dir, prefix=prefix, allowed_attr=self.data_attrs)
+        total_sec = {k: (0.0) for k in self.spk_map}
+        total_raw_sec = {k: (0.0) for k in self.spk_map}
+        extra_info = {"names": {}, "spk_ids": {}, "spk_names": {}, "lengths": {}}
+        max_no = -1
+        for item_name, meta_data in self.meta_data_iterator(prefix):
+            args.append([item_name, meta_data, self.binarization_args])
+        aug_map = self.arrange_data_augmentation(self.meta_data_iterator(prefix)) if apply_augmentation else {}
+
+        def postprocess(_item):
+            nonlocal total_sec, total_raw_sec, extra_info, max_no
+            if _item is None:
+                return
+            item_no = builder.add_item(_item)
+            max_no = max(max_no, item_no)
+            for k, v in _item.items():
+                if isinstance(v, np.ndarray):
+                    if k not in extra_info:
+                        extra_info[k] = {}
+                    extra_info[k][item_no] = tuple(v.shape)[0]
+            extra_info["names"][item_no] = _item["name"].split(":", 1)[-1]
+            extra_info["spk_ids"][item_no] = _item["spk_id"]
+            extra_info["spk_names"][item_no] = _item["spk_name"]
+            extra_info["lengths"][item_no] = _item["length"]
+            total_raw_sec[_item["spk_name"]] += _item["seconds"]
+            total_sec[_item["spk_name"]] += _item["seconds"]
+            for task in aug_map.get(_item["name"], []):
+                aug_item = task["func"](_item, **task["kwargs"])
+                aug_item_no = builder.add_item(aug_item)
+                max_no = max(max_no, aug_item_no)
+                for k, v in aug_item.items():
+                    if isinstance(v, np.ndarray):
+                        if k not in extra_info:
+                            extra_info[k] = {}
+                        extra_info[k][aug_item_no] = tuple(v.shape)[0]
+                extra_info["names"][aug_item_no] = aug_item["name"].split(":", 1)[-1]
+                extra_info["spk_ids"][aug_item_no] = aug_item["spk_id"]
+                extra_info["spk_names"][aug_item_no] = aug_item["spk_name"]
+                extra_info["lengths"][aug_item_no] = aug_item["length"]
+                total_sec[aug_item["spk_name"]] += aug_item["seconds"]
+
+        try:
+            if num_workers > 0:
+                for item in tqdm(
+                    chunked_multiprocess_run(self.process_item, args, num_workers=num_workers),
+                    total=len(list(self.meta_data_iterator(prefix))),
+                ):
+                    postprocess(item)
+            else:
+                for a in tqdm(args):
+                    item = self.process_item(*a)
+                    postprocess(item)
+            for k in extra_info:
+                assert set(extra_info[k]) == set(range(max_no + 1)), f"Item numbering is not consecutive."
+                extra_info[k] = list(map(lambda x: x[1], sorted(extra_info[k].items(), key=lambda x: x[0])))
+        except KeyboardInterrupt:
+            builder.finalize()
+            raise
+        builder.finalize()
+        if prefix == "train":
+            extra_info.pop("names")
+            extra_info.pop("spk_names")
+        with open(self.binary_data_dir / f"{prefix}.meta", "wb") as f:
+            pickle.dump(extra_info, f)
+        if apply_augmentation:
+            print(f"| {prefix} total duration (before augmentation): {sum(total_raw_sec.values()):.2f}s")
+            print(
+                f"| {prefix} respective duration (before augmentation): "
+                + ", ".join(f"{k}={v:.2f}s" for k, v in total_raw_sec.items())
+            )
+            print(
+                f"| {prefix} total duration (after augmentation): {sum(total_sec.values()):.2f}s ({sum(total_sec.values()) / sum(total_raw_sec.values()):.2f}x)"
+            )
+            print(
+                f"| {prefix} respective duration (after augmentation): "
+                + ", ".join(f"{k}={v:.2f}s" for k, v in total_sec.items())
+            )
+        else:
+            print(f"| {prefix} total duration: {sum(total_raw_sec.values()):.2f}s")
+            print(f"| {prefix} respective duration: " + ", ".join(f"{k}={v:.2f}s" for k, v in total_raw_sec.items()))
+
+    def arrange_data_augmentation(self, data_iterator):
+        """
+        Code for all types of data augmentation should be added here.
+        """
+        raise NotImplementedError()
+
+    def process_item(self, item_name, meta_data, binarization_args):
+        raise NotImplementedError()
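The speaker-map pairing in build_spk_map allows a name to repeat only if it keeps the same ID; a condensed, pure-Python sketch of that invariant with made-up names:

    speakers, spk_ids = ["alice", "bob", "alice"], [0, 1, 0]
    spk_map = {}
    for name, sid in zip(speakers, spk_ids):
        assert spk_map.get(name, sid) == sid, "one name must keep one ID"
        spk_map[name] = sid
    print(spk_map)  # {'alice': 0, 'bob': 1}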
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_exporter.py ADDED
@@ -0,0 +1,72 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from pathlib import Path
+from typing import Union
+
+import paddle
+from utils.hparams import hparams
+
+
+class BaseExporter:
+    def __init__(
+        self, device: Union[str, (paddle.CPUPlace, paddle.CUDAPlace, str)] = None, cache_dir: Path = None, **kwargs
+    ):
+        self.device = (
+            device
+            if device is not None
+            else str("cuda" if paddle.device.cuda.device_count() >= 1 else "cpu").replace("cuda", "gpu")
+        )
+        self.cache_dir: Path = (
+            cache_dir.resolve() if cache_dir is not None else Path(__file__).parent.parent / "deployment" / "cache"
+        )
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    def build_spk_map(self) -> dict:
+        if hparams["use_spk_id"]:
+            with open(Path(hparams["work_dir"]) / "spk_map.json", "r", encoding="utf8") as f:
+                spk_map = json.load(f)
+            assert isinstance(spk_map, dict) and len(spk_map) > 0, "Invalid or empty speaker map!"
+            assert len(spk_map) == len(set(spk_map.values())), "Duplicate speaker id in speaker map!"
+            return spk_map
+        else:
+            return {}
+
+    def build_model(self) -> paddle.nn.Layer:
+        """
+        Creates an instance of nn.Module and loads its state dict on the target device.
+        """
+        raise NotImplementedError()
+
+    def export_model(self, path: Path):
+        """
+        Exports the model to ONNX format.
+        :param path: the target model path
+        """
+        raise NotImplementedError()
+
+    def export_attachments(self, path: Path):
+        """
+        Exports related files and configs (e.g. the dictionary) to the target directory.
+        :param path: the target directory
+        """
+        raise NotImplementedError()
+
+    def export(self, path: Path):
+        """
+        Exports all the artifacts to the target directory.
+        :param path: the target directory
+        """
+        raise NotImplementedError()
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_svs_infer.py ADDED
@@ -0,0 +1,149 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Tuple
+
+import numpy as np
+import paddle
+
+from paddlemix.models.diffsinger.utils import hparams
+from paddlemix.models.diffsinger.utils.infer_utils import resample_align_curve
+
+
+class BaseSVSInfer:
+    """
+    Base class for SVS inference models.
+    Subclasses should define:
+    1. *build_model*:
+        how to build the model;
+    2. *run_model*:
+        how to run the model (typically, generate a mel-spectrogram and
+        pass it to the pre-built vocoder);
+    3. *preprocess_input*:
+        how to preprocess user input.
+    4. *infer_once*
+        infer from raw inputs to the final outputs
+    """
+
+    def __init__(self, device=None):
+        if device is None:
+            device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
+        self.device = device
+        self.timestep = hparams["hop_size"] / hparams["audio_sample_rate"]
+        self.spk_map = {}
+        self.model: paddle.nn.Layer = None
+
+    def build_model(self, ckpt_steps=None) -> paddle.nn.Layer:
+        raise NotImplementedError()
+
+    def load_speaker_mix(
+        self, param_src: dict, summary_dst: dict, mix_mode: str = "frame", mix_length: int = None
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """
+
+        :param param_src: param dict
+        :param summary_dst: summary dict
+        :param mix_mode: 'token' or 'frame'
+        :param mix_length: total tokens or frames to mix
+        :return: spk_mix_id [B=1, 1, N], spk_mix_value [B=1, T, N]
+        """
+        assert mix_mode == "token" or mix_mode == "frame"
+        param_key = "spk_mix" if mix_mode == "frame" else "ph_spk_mix"
+        summary_solo_key = "spk" if mix_mode == "frame" else "ph_spk"
+        spk_mix_map = param_src.get(param_key)
+        dynamic = False
+        if spk_mix_map is None:
+            for name in self.spk_map.keys():
+                spk_mix_map = {name: 1.0}
+                break
+        else:
+            for name in spk_mix_map:
+                assert name in self.spk_map, f"Speaker '{name}' not found."
+        if len(spk_mix_map) == 1:
+            summary_dst[summary_solo_key] = list(spk_mix_map.keys())[0]
+        elif any([isinstance(val, str) for val in spk_mix_map.values()]):
+            print_mix = "|".join(spk_mix_map.keys())
+            summary_dst[param_key] = f"dynamic({print_mix})"
+            dynamic = True
+        else:
+            print_mix = "|".join([f"{n}:{'%.3f' % spk_mix_map[n]}" for n in spk_mix_map])
+            summary_dst[param_key] = f"static({print_mix})"
+        spk_mix_id_list = []
+        spk_mix_value_list = []
+        if dynamic:
+            for name, values in spk_mix_map.items():
+                spk_mix_id_list.append(self.spk_map[name])
+                if isinstance(values, str):
+                    if mix_mode == "token":
+                        cur_spk_mix_value = values.split()
+                        assert (
+                            len(cur_spk_mix_value) == mix_length
+                        ), "Speaker mix checks failed. In dynamic token-level mix, number of proportion values must equal number of tokens."
+                        cur_spk_mix_value = paddle.to_tensor(data=np.array(cur_spk_mix_value, "float32")).to(
+                            self.device
+                        )[None]
+                    else:
+                        cur_spk_mix_value = paddle.to_tensor(
+                            data=resample_align_curve(
+                                np.array(values.split(), "float32"),
+                                original_timestep=float(param_src["spk_mix_timestep"]),
+                                target_timestep=self.timestep,
+                                align_length=mix_length,
+                            )
+                        ).to(self.device)[None]
+                    assert paddle.all(
+                        x=cur_spk_mix_value >= 0.0
+                    ), f"""Speaker mix checks failed.
+Proportions of speaker '{name}' on some {mix_mode}s are negative."""
+                else:
+                    assert (
+                        values >= 0.0
+                    ), f"""Speaker mix checks failed.
+Proportion of speaker '{name}' is negative."""
+                    cur_spk_mix_value = paddle.full(shape=(1, mix_length), fill_value=values, dtype="float32")
+                spk_mix_value_list.append(cur_spk_mix_value)
+            spk_mix_id = paddle.to_tensor(data=spk_mix_id_list, dtype="int64").to(self.device)[None, None]
+            spk_mix_value = paddle.stack(x=spk_mix_value_list, axis=2)
+            spk_mix_value_sum = paddle.sum(x=spk_mix_value, axis=2, keepdim=True)
+            assert paddle.all(
+                x=spk_mix_value_sum > 0.0
+            ), f"""Speaker mix checks failed.
+Proportions of speaker mix on some frames sum to zero."""
+            spk_mix_value /= spk_mix_value_sum
+        else:
+            for name, value in spk_mix_map.items():
+                spk_mix_id_list.append(self.spk_map[name])
+                assert (
+                    value >= 0.0
+                ), f"""Speaker mix checks failed.
+Proportion of speaker '{name}' is negative."""
+                spk_mix_value_list.append(value)
+            spk_mix_id = paddle.to_tensor(data=spk_mix_id_list, dtype="int64").to(self.device)[None, None]
+            spk_mix_value = paddle.to_tensor(data=spk_mix_value_list, dtype="float32").to(self.device)[None, None]
+            spk_mix_value_sum = spk_mix_value.sum()
+            assert (
+                spk_mix_value_sum > 0.0
+            ), f"""Speaker mix checks failed.
+Proportions of speaker mix sum to zero."""
+            spk_mix_value /= spk_mix_value_sum
+        return spk_mix_id, spk_mix_value
+
+    def preprocess_input(self, param: dict, idx=0) -> Dict[str, paddle.Tensor]:
+        raise NotImplementedError()
+
+    def forward_model(self, sample: Dict[str, paddle.Tensor]):
+        raise NotImplementedError()
+
+    def run_inference(self, params, **kwargs):
+        raise NotImplementedError()
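The static branch of load_speaker_mix boils down to normalizing the given proportions so they sum to one; a sketch with two hypothetical speakers and made-up weights:

    import paddle

    spk_mix_value = paddle.to_tensor([[[0.3, 0.9]]], dtype="float32")  # [B=1, 1, N]
    spk_mix_value /= spk_mix_value.sum()  # proportions now sum to 1.0
    print(spk_mix_value.numpy())          # [[[0.25 0.75]]]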
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_vocoder.py ADDED
@@ -0,0 +1,37 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class BaseVocoder:
+    def to_device(self, device):
+        """
+
+        :param device: torch.device or str
+        """
+        raise NotImplementedError()
+
+    def get_device(self):
+        """
+
+        :return: device: torch.device or str
+        """
+        raise NotImplementedError()
+
+    def spec2wav(self, mel, **kwargs):
+        """
+
+        :param mel: [T, 80]
+        :return: wav: [T']
+        """
+        raise NotImplementedError()
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/aux_decoder/convnext.py ADDED
@@ -0,0 +1,103 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import Optional
+
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+
+
+class ConvNeXtBlock(paddle.nn.Layer):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+    """
+
+    def __init__(
+        self, dim: int, intermediate_dim: int, layer_scale_init_value: Optional[float] = None, drop_out: float = 0.0
+    ):
+        super().__init__()
+        self.dwconv = paddle.nn.Conv1D(in_channels=dim, out_channels=dim, kernel_size=7, padding=3, groups=dim)
+        self.norm = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-06)
+        self.pwconv1 = paddle.nn.Linear(in_features=dim, out_features=intermediate_dim)
+        self.act = paddle.nn.GELU()
+        self.pwconv2 = paddle.nn.Linear(in_features=intermediate_dim, out_features=dim)
+        self.gamma = (
+            paddle.base.framework.EagerParamBase.from_tensor(
+                tensor=layer_scale_init_value * paddle.ones(shape=dim), trainable=True
+            )
+            if layer_scale_init_value > 0
+            else None
+        )
+        self.drop_path = paddle.nn.Identity()
+        self.dropout = paddle.nn.Dropout(p=drop_out) if drop_out > 0.0 else paddle.nn.Identity()
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.dropout(x)
+        x = residual + self.drop_path(x)
+        return x
+
+
+class ConvNeXtDecoder(paddle.nn.Layer):
+    def __init__(self, in_dims, out_dims, /, *, num_channels=512, num_layers=6, kernel_size=7, dropout_rate=0.1):
+        super().__init__()
+        self.inconv = paddle.nn.Conv1D(
+            in_channels=in_dims,
+            out_channels=num_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+        )
+        self.conv = paddle.nn.LayerList(
+            sublayers=(
+                ConvNeXtBlock(
+                    dim=num_channels,
+                    intermediate_dim=num_channels * 4,
+                    layer_scale_init_value=1e-06,
+                    drop_out=dropout_rate,
+                )
+                for _ in range(num_layers)
+            )
+        )
+        self.outconv = paddle.nn.Conv1D(
+            in_channels=num_channels,
+            out_channels=out_dims,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+        )
+
+    def forward(self, x, infer=False):
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.inconv(x)
+        for conv in self.conv:
+            x = conv(x)
+        x = self.outconv(x)
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        return x
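A shape-check sketch for ConvNeXtDecoder, assuming the paddlemix package (and its paddle_aux helper) is importable; all sizes below are made up:

    import paddle
    from paddlemix.models.diffsinger.modules.aux_decoder.convnext import ConvNeXtDecoder

    decoder = ConvNeXtDecoder(256, 128, num_channels=512, num_layers=2)
    x = paddle.randn([4, 100, 256])  # [B, T, in_dims]
    y = decoder(x)                   # [B, T, out_dims] -> [4, 100, 128]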
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/__init__.py ADDED
@@ -0,0 +1,26 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddlemix.models.diffsinger.modules.backbones.lynxnet import LYNXNet
+from paddlemix.models.diffsinger.modules.backbones.wavenet import WaveNet
+from paddlemix.models.diffsinger.utils import filter_kwargs
+
+BACKBONES = {"wavenet": WaveNet, "lynxnet": LYNXNet}
+
+
+def build_backbone(out_dims: int, num_feats: int, backbone_type: str, backbone_args: dict) -> paddle.nn.Layer:
+    backbone = BACKBONES[backbone_type]
+    kwargs = filter_kwargs(backbone_args, backbone)
+    return BACKBONES[backbone_type](out_dims, num_feats, **kwargs)
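A usage sketch for build_backbone. Both backbones read hparams["hidden_size"] at construction time, so hparams must be populated first; setting it by hand here is an assumption made only for illustration (normally the config loader fills it):

    from paddlemix.models.diffsinger.modules.backbones import build_backbone
    from paddlemix.models.diffsinger.utils.hparams import hparams

    hparams["hidden_size"] = 256  # assumed; normally set by the config loader
    net = build_backbone(
        out_dims=128, num_feats=1, backbone_type="wavenet",
        backbone_args={"num_layers": 20, "num_channels": 256},
    )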
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/lynxnet.py ADDED
@@ -0,0 +1,188 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+
+from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPosEmb
+from paddlemix.models.diffsinger.utils.hparams import hparams
+
+
+class SwiGLU(paddle.nn.Layer):
+    def __init__(self, dim=-1):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        out, gate = paddle_aux.split(x=x, num_or_sections=x.shape[self.dim] // 2, axis=self.dim)
+        return out * paddle.nn.functional.silu(x=gate)
+
+
+class Transpose(paddle.nn.Layer):
+    def __init__(self, dims):
+        super().__init__()
+        assert len(dims) == 2, "dims must be a tuple of two dimensions"
+        self.dims = dims
+
+    def forward(self, x):
+        # return x.transpose(*self.dims)
+        # return x.transpose(perm=list(self.dims))  # or tuple(self.dims)
+        return x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, *self.dims))
+
+
+class LYNXConvModule(paddle.nn.Layer):
+    @staticmethod
+    def calc_same_padding(kernel_size):
+        pad = kernel_size // 2
+        return pad, pad - (kernel_size + 1) % 2
+
+    def __init__(self, dim, expansion_factor, kernel_size=31, activation="PReLU", dropout=0.0):
+        super().__init__()
+        inner_dim = dim * expansion_factor
+        activation_classes = {
+            "SiLU": paddle.nn.Silu,
+            "ReLU": paddle.nn.ReLU,
+            "PReLU": lambda: paddle.nn.PReLU(num_parameters=inner_dim),
+        }
+        activation = activation if activation is not None else "PReLU"
+        if activation not in activation_classes:
+            raise ValueError(f"{activation} is not a valid activation")
+        _activation = activation_classes[activation]()
+        padding = self.calc_same_padding(kernel_size)
+        if float(dropout) > 0.0:
+            _dropout = paddle.nn.Dropout(p=dropout)
+        else:
+            _dropout = paddle.nn.Identity()
+        self.net = paddle.nn.Sequential(
+            paddle.nn.LayerNorm(normalized_shape=dim),
+            Transpose((1, 2)),
+            paddle.nn.Conv1D(in_channels=dim, out_channels=inner_dim * 2, kernel_size=1),
+            SwiGLU(dim=1),
+            paddle.nn.Conv1D(
+                in_channels=inner_dim,
+                out_channels=inner_dim,
+                kernel_size=kernel_size,
+                padding=padding[0],
+                groups=inner_dim,
+            ),
+            _activation,
+            paddle.nn.Conv1D(in_channels=inner_dim, out_channels=dim, kernel_size=1),
+            Transpose((1, 2)),
+            _dropout,
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class LYNXNetResidualLayer(paddle.nn.Layer):
+    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation="PReLU", dropout=0.0):
+        super().__init__()
+        self.diffusion_projection = paddle.nn.Conv1D(in_channels=dim, out_channels=dim, kernel_size=1)
+        self.conditioner_projection = paddle.nn.Conv1D(in_channels=dim_cond, out_channels=dim, kernel_size=1)
+        self.convmodule = LYNXConvModule(
+            dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout
+        )
+
+    def forward(self, x, conditioner, diffusion_step):
+        res_x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        x = self.convmodule(x)
+        x = x + res_x
+        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
+        return x
+
+
+class LYNXNet(paddle.nn.Layer):
+    def __init__(
+        self,
+        in_dims,
+        n_feats,
+        *,
+        num_layers=6,
+        num_channels=512,
+        expansion_factor=2,
+        kernel_size=31,
+        activation="PReLU",
+        dropout=0.0
+    ):
+        """
+        LYNXNet(Linear Gated Depthwise Separable Convolution Network)
+        TIPS: You can control the style of the generated results by modifying the 'activation',
+            - 'PReLU'(default) : Similar to WaveNet
+            - 'SiLU' : Voice will be more pronounced, not recommended for use under DDPM
+            - 'ReLU' : Contrary to 'SiLU', Voice will be weakened
+        """
+        super().__init__()
+        self.in_dims = in_dims
+        self.n_feats = n_feats
+        self.input_projection = paddle.nn.Conv1D(
+            in_channels=in_dims * n_feats, out_channels=num_channels, kernel_size=1
+        )
+        self.diffusion_embedding = paddle.nn.Sequential(
+            SinusoidalPosEmb(num_channels),
+            paddle.nn.Linear(in_features=num_channels, out_features=num_channels * 4),
+            paddle.nn.GELU(),
+            paddle.nn.Linear(in_features=num_channels * 4, out_features=num_channels),
+        )
+        self.residual_layers = paddle.nn.LayerList(
+            sublayers=[
+                LYNXNetResidualLayer(
+                    dim_cond=hparams["hidden_size"],
+                    dim=num_channels,
+                    expansion_factor=expansion_factor,
+                    kernel_size=kernel_size,
+                    activation=activation,
+                    dropout=dropout,
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.norm = paddle.nn.LayerNorm(normalized_shape=num_channels)
+        self.output_projection = paddle.nn.Conv1D(
+            in_channels=num_channels, out_channels=in_dims * n_feats, kernel_size=1
+        )
+        init_Constant = paddle.nn.initializer.Constant(value=0.0)
+        init_Constant(self.output_projection.weight)
+
+    def forward(self, spec, diffusion_step, cond):
+        """
+        :param spec: [B, F, M, T]
+        :param diffusion_step: [B, 1]
+        :param cond: [B, H, T]
+        :return:
+        """
+        if self.n_feats == 1:
+            x = spec[:, 0]
+        else:
+            x = spec.flatten(start_axis=1, stop_axis=2)
+        x = self.input_projection(x)
+        x = paddle.nn.functional.gelu(x=x)
+        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(axis=-1)
+        for layer in self.residual_layers:
+            x = layer(x, cond, diffusion_step)
+        x = self.norm(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))).transpose(
+            perm=paddle_aux.transpose_aux_func(
+                self.norm(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))).ndim, 1, 2
+            )
+        )
+        x = self.output_projection(x)
+        if self.n_feats == 1:
+            x = x[:, None, :, :]
+        else:
+            x = x.reshape(-1, self.n_feats, self.in_dims, tuple(x.shape)[2])
+        return x
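The SwiGLU gate above halves the channel dimension and gates one half with SiLU of the other; a standalone sketch of the same operation using plain paddle.split (shapes are made up):

    import paddle

    x = paddle.randn([2, 8, 10])            # [B, C, T], C must be even
    out, gate = paddle.split(x, 2, axis=1)  # two [B, 4, T] halves
    y = out * paddle.nn.functional.silu(gate)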
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/wavenet.py ADDED
@@ -0,0 +1,120 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import sys
+from math import sqrt
+
+import paddle
+from paddlemix.models.diffsinger.utils import paddle_aux
+
+from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPosEmb
+from paddlemix.models.diffsinger.utils.hparams import hparams
+
+
+class Conv1d(paddle.nn.Conv1D):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        init_KaimingNormal = paddle.nn.initializer.KaimingNormal(nonlinearity="leaky_relu")
+        init_KaimingNormal(self.weight)
+
+
+class ResidualBlock(paddle.nn.Layer):
+    def __init__(self, encoder_hidden, residual_channels, dilation):
+        super().__init__()
+        self.residual_channels = residual_channels
+        self.dilated_conv = paddle.nn.Conv1D(
+            in_channels=residual_channels,
+            out_channels=2 * residual_channels,
+            kernel_size=3,
+            padding=dilation,
+            dilation=dilation,
+        )
+        self.diffusion_projection = paddle.nn.Linear(in_features=residual_channels, out_features=residual_channels)
+        self.conditioner_projection = paddle.nn.Conv1D(
+            in_channels=encoder_hidden, out_channels=2 * residual_channels, kernel_size=1
+        )
+        self.output_projection = paddle.nn.Conv1D(
+            in_channels=residual_channels, out_channels=2 * residual_channels, kernel_size=1
+        )
+
+    def forward(self, x, conditioner, diffusion_step):
+        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(axis=-1)
+        conditioner = self.conditioner_projection(conditioner)
+        y = x + diffusion_step
+        y = self.dilated_conv(y) + conditioner
+        gate, filter = paddle_aux.split(x=y, num_or_sections=[self.residual_channels, self.residual_channels], axis=1)
+        y = paddle.nn.functional.sigmoid(x=gate) * paddle.nn.functional.tanh(x=filter)
+        y = self.output_projection(y)
+        residual, skip = paddle_aux.split(
+            x=y, num_or_sections=[self.residual_channels, self.residual_channels], axis=1
+        )
+        return (x + residual) / math.sqrt(2.0), skip
+
+
+class WaveNet(paddle.nn.Layer):
+    def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
+        super().__init__()
+        self.in_dims = in_dims
+        self.n_feats = n_feats
+        self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
+        self.diffusion_embedding = SinusoidalPosEmb(num_channels)
+        self.mlp = paddle.nn.Sequential(
+            paddle.nn.Linear(in_features=num_channels, out_features=num_channels * 4),
+            paddle.nn.Mish(),
+            paddle.nn.Linear(in_features=num_channels * 4, out_features=num_channels),
+        )
+        self.residual_layers = paddle.nn.LayerList(
+            sublayers=[
+                ResidualBlock(
+                    encoder_hidden=hparams["hidden_size"],
+                    residual_channels=num_channels,
+                    dilation=2 ** (i % dilation_cycle_length),
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.skip_projection = Conv1d(num_channels, num_channels, 1)
+        self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1)
+        init_Constant = paddle.nn.initializer.Constant(value=0.0)
+        init_Constant(self.output_projection.weight)
+
+    def forward(self, spec, diffusion_step, cond):
+        """
+        :param spec: [B, F, M, T]
+        :param diffusion_step: [B, 1]
+        :param cond: [B, H, T]
+        :return:
+        """
+        if self.n_feats == 1:
+            x = spec.squeeze(axis=1)
+        else:
+            x = spec.flatten(start_axis=1, stop_axis=2)
+        x = self.input_projection(x)
+        x = paddle.nn.functional.relu(x=x)
+        diffusion_step = self.diffusion_embedding(diffusion_step)
+        diffusion_step = self.mlp(diffusion_step)
+        skip = []
+        for layer in self.residual_layers:
+            x, skip_connection = layer(x, cond, diffusion_step)
+            skip.append(skip_connection)
+        x = paddle.sum(x=paddle.stack(x=skip), axis=0) / sqrt(len(self.residual_layers))
+        x = self.skip_projection(x)
+        x = paddle.nn.functional.relu(x=x)
+        x = self.output_projection(x)
+        if self.n_feats == 1:
+            x = x[:, None, :, :]
+        else:
+            x = x.reshape(-1, self.n_feats, self.in_dims, tuple(x.shape)[2])
+        return x
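A forward-shape sketch for WaveNet under stated assumptions: hparams must carry hidden_size before construction (set by hand here purely for illustration), and all sizes below are made up:

    import paddle
    from paddlemix.models.diffsinger.modules.backbones.wavenet import WaveNet
    from paddlemix.models.diffsinger.utils.hparams import hparams

    hparams["hidden_size"] = 256           # assumed; normally set by the config loader
    net = WaveNet(128, 1, num_layers=4, num_channels=64)
    spec = paddle.randn([2, 1, 128, 100])  # [B, F=1, M, T]
    step = paddle.full([2], 5.0)           # per-item diffusion step, shape [B]
    cond = paddle.randn([2, 256, 100])     # [B, H, T], H == hidden_size
    out = net(spec, step, cond)            # [B, 1, M, T]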
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/common_layers.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import math
18
+ import sys
19
+
20
+ import paddle
21
+ from paddlemix.models.diffsinger.utils import paddle_aux
22
+ from paddle.nn import GELU, LayerNorm
23
+ from paddle.nn import MultiHeadAttention as MultiheadAttention
24
+ from paddle.nn import ReLU
25
+ from paddle.nn import Silu as SiLU
26
+
27
+ import paddlemix.models.diffsinger.utils as utils
28
+
29
+
30
+ class NormalInitEmbedding(paddle.nn.Embedding):
31
+ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: (int | None) = None, *args, **kwargs):
32
+ super().__init__(num_embeddings, embedding_dim, *args, padding_idx=padding_idx, **kwargs)
33
+ init_Normal = paddle.nn.initializer.Normal(mean=0, std=self._embedding_dim**-0.5)
34
+ init_Normal(self.weight)
35
+ if padding_idx is not None:
36
+ init_Constant = paddle.nn.initializer.Constant(value=0)
37
+             init_Constant(self.weight[padding_idx])
+
+
+ class XavierUniformInitLinear(paddle.nn.Linear):
+     def __init__(self, in_features: int, out_features: int, *args, bias: bool = True, **kwargs):
+         super().__init__(in_features, out_features, *args, bias_attr=bias, **kwargs)
+         init_XavierUniform = paddle.nn.initializer.XavierUniform()
+         init_XavierUniform(self.weight)
+         if bias:
+             init_Constant = paddle.nn.initializer.Constant(value=0.0)
+             init_Constant(self.bias)
+
+
+ class SinusoidalPositionalEmbedding(paddle.nn.Layer):
+     """This module produces sinusoidal positional embeddings of any length.
+
+     Padding symbols are ignored.
+     """
+
+     def __init__(self, embedding_dim, padding_idx, init_size=1024):
+         super().__init__()
+         self.embedding_dim = embedding_dim
+         self.padding_idx = padding_idx
+         self.weights = SinusoidalPositionalEmbedding.get_embedding(init_size, embedding_dim, padding_idx)
+         self.register_buffer(name="_float_tensor", tensor=paddle.empty(shape=[1], dtype="float32"))
+
+     @staticmethod
+     def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
+         """Build sinusoidal embeddings.
+
+         This matches the implementation in tensor2tensor, but differs slightly
+         from the description in Section 3.5 of "Attention Is All You Need".
+         """
+         half_dim = embedding_dim // 2
+         emb = math.log(10000) / (half_dim - 1)
+         emb = paddle.exp(x=paddle.arange(dtype="float32", end=half_dim) * -emb)
+         emb = paddle.arange(dtype="float32", end=num_embeddings).unsqueeze(axis=1) * emb.unsqueeze(axis=0)
+         emb = paddle.concat(x=[paddle.sin(x=emb), paddle.cos(x=emb)], axis=1).view(num_embeddings, -1)
+         if embedding_dim % 2 == 1:
+             emb = paddle.concat(x=[emb, paddle.zeros(shape=[num_embeddings, 1])], axis=1)
+         if padding_idx is not None:
+             emb[padding_idx, :] = 0
+         return emb
+
+     def forward(self, x, incremental_state=None, timestep=None, positions=None):
+         """Input is expected to be of size [bsz x seqlen]."""
+         bsz, seq_len = tuple(x.shape)[:2]
+         max_pos = self.padding_idx + 1 + seq_len
+         if self.weights is None or max_pos > self.weights.shape[0]:
+             self.weights = SinusoidalPositionalEmbedding.get_embedding(max_pos, self.embedding_dim, self.padding_idx)
+         self.weights = self.weights.to(self._float_tensor)
+         if incremental_state is not None:
+             pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
+             return self.weights[self.padding_idx + pos, :].expand(shape=[bsz, 1, -1])
+         positions = utils.make_positions(x, self.padding_idx) if positions is None else positions
+         return self.weights.index_select(axis=0, index=positions.view(-1)).view(bsz, seq_len, -1).detach()
+
+     @staticmethod
+     def max_positions():
+         """Maximum number of supported positions."""
+         return int(100000.0)
+
+
+ class TransformerFFNLayer(paddle.nn.Layer):
+     def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0.0, act="gelu"):
+         super().__init__()
+         self.kernel_size = kernel_size
+         self.dropout = dropout
+         self.act = act
+         self.ffn_1 = paddle.nn.Conv1D(
+             in_channels=hidden_size, out_channels=filter_size, kernel_size=kernel_size, padding=kernel_size // 2
+         )
+         if self.act == "relu":
+             self.act_fn = paddle.nn.ReLU()
+         elif self.act == "gelu":
+             self.act_fn = paddle.nn.GELU()
+         elif self.act == "swish":
+             self.act_fn = paddle.nn.Silu()
+         self.ffn_2 = XavierUniformInitLinear(filter_size, hidden_size)
+
+     def forward(self, x):
+         x = self.ffn_1(x.transpose(perm=[1, 2, 0])).transpose(perm=[2, 0, 1])
+         x = x * self.kernel_size**-0.5
+         x = self.act_fn(x)
+         x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+         x = self.ffn_2(x)
+         return x
+
+
+ class EncSALayer(paddle.nn.Layer):
+     def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act="gelu"):
+         super().__init__()
+         self.dropout = dropout
+         self.layer_norm1 = paddle.nn.LayerNorm(normalized_shape=c)
+         self.self_attn = MultiheadAttention(
+             c,
+             num_heads,
+             dropout=attention_dropout,
+             bias_attr=False,
+         )
+         self.layer_norm2 = paddle.nn.LayerNorm(normalized_shape=c)
+         self.ffn = TransformerFFNLayer(c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act)
+
+     def forward(self, x, encoder_padding_mask=None, **kwargs):
+         layer_norm_training = kwargs.get("layer_norm_training", None)
+         if layer_norm_training is not None:
+             self.layer_norm1.training = layer_norm_training
+             self.layer_norm2.training = layer_norm_training
+         residual = x
+         x = self.layer_norm1(x)
+         x = self.self_attn(
+             query=x,
+             key=x,
+             value=x,
+             attn_mask=paddle.any(encoder_padding_mask, -1),  # key_padding_mask=encoder_padding_mask
+         )
+         x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+         x = residual + x
+         x = (
+             x
+             * (1 - encoder_padding_mask.astype(dtype="float32")).transpose(
+                 perm=paddle_aux.transpose_aux_func((1 - encoder_padding_mask.astype(dtype="float32")).ndim, 0, 1)
+             )[..., None]
+         )
+         residual = x
+         x = self.layer_norm2(x)
+         x = self.ffn(x)
+         x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
+         x = residual + x
+         x = (
+             x
+             * (1 - encoder_padding_mask.astype(dtype="float32")).transpose(
+                 perm=paddle_aux.transpose_aux_func((1 - encoder_padding_mask.astype(dtype="float32")).ndim, 0, 1)
+             )[..., None]
+         )
+         return x
+
+
+ class SinusoidalPosEmb(paddle.nn.Layer):
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+
+     def forward(self, x):
+         half_dim = self.dim // 2
+         emb = math.log(10000) / (half_dim - 1)
+         emb = paddle.exp(x=paddle.arange(end=half_dim) * -emb)
+         emb = x[:, None] * emb[None, :]
+         emb = paddle.concat(x=(emb.sin(), emb.cos()), axis=-1)
+         return emb
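The table built by `SinusoidalPositionalEmbedding.get_embedding` above packs the sine half into the first `half_dim` columns and the cosine half into the rest, zeroing the padding row. A minimal sketch of what it returns, assuming PaddleMIX is installed and importable:

import paddle
from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPositionalEmbedding

table = SinusoidalPositionalEmbedding.get_embedding(num_embeddings=8, embedding_dim=6, padding_idx=0)
print(table.shape)                  # [8, 6]: sin in columns 0..2, cos in columns 3..5
print(float(table[0].abs().sum()))  # 0.0 -- the padding row is zeroed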
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/espnet_positional_embedding.py ADDED
@@ -0,0 +1,129 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ import sys
+
+ import paddle
+
+ from paddlemix.models.diffsinger.utils import paddle_aux
+
+
+ class PositionalEncoding(paddle.nn.Layer):
+     """Positional encoding.
+     Args:
+         d_model (int): Embedding dimension.
+         dropout_rate (float): Dropout rate.
+         max_len (int): Maximum input length.
+         reverse (bool): Whether to reverse the input position.
+     """
+
+     def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
+         """Construct a PositionalEncoding object."""
+         super(PositionalEncoding, self).__init__()
+         self.d_model = d_model
+         self.reverse = reverse
+         self.xscale = math.sqrt(self.d_model)
+         self.dropout = paddle.nn.Dropout(p=dropout_rate)
+         self.pe = None
+         self.extend_pe(paddle.to_tensor(data=0.0).expand(shape=[1, max_len]))
+
+     def extend_pe(self, x):
+         """Reset the positional encodings."""
+         if self.pe is not None:
+             if self.pe.shape[1] >= x.shape[1]:
+                 if self.pe.dtype != x.dtype or self.pe.place != x.place:
+                     self.pe = self.pe.to(dtype=x.dtype, device=x.place)
+                 return
+         if self.reverse:
+             position = paddle.arange(start=x.shape[1] - 1, end=-1, step=-1.0, dtype="float32").unsqueeze(axis=1)
+         else:
+             position = paddle.arange(start=0, end=x.shape[1], dtype="float32").unsqueeze(axis=1)
+         div_term = paddle.exp(
+             x=paddle.arange(start=0, end=self.d_model, step=2, dtype="float32") * -(math.log(10000.0) / self.d_model)
+         )
+         pe = (
+             paddle.stack(x=[paddle.sin(x=position * div_term), paddle.cos(x=position * div_term)], axis=2)
+             .view(-1, self.d_model)
+             .unsqueeze(axis=0)
+         )
+         self.pe = pe.to(device=x.place, dtype=x.dtype)
+
+     def forward(self, x: paddle.Tensor):
+         """Add positional encoding.
+         Args:
+             x (paddle.Tensor): Input tensor (batch, time, `*`).
+         Returns:
+             paddle.Tensor: Encoded tensor (batch, time, `*`).
+         """
+         self.extend_pe(x)
+         x = x * self.xscale + self.pe[:, : x.shape[1]]
+         return self.dropout(x)
+
+
+ class ScaledPositionalEncoding(PositionalEncoding):
+     """Scaled positional encoding module.
+     See Sec. 3.2 https://arxiv.org/abs/1809.08895
+     Args:
+         d_model (int): Embedding dimension.
+         dropout_rate (float): Dropout rate.
+         max_len (int): Maximum input length.
+     """
+
+     def __init__(self, d_model, dropout_rate, max_len=5000):
+         """Initialize class."""
+         super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
+         self.alpha = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.to_tensor(data=1.0))
+
+     def reset_parameters(self):
+         """Reset parameters."""
+         self.alpha.data = paddle.to_tensor(data=1.0)
+
+     def forward(self, x):
+         """Add positional encoding.
+         Args:
+             x (paddle.Tensor): Input tensor (batch, time, `*`).
+         Returns:
+             paddle.Tensor: Encoded tensor (batch, time, `*`).
+         """
+         self.extend_pe(x)
+         x = x + self.alpha * self.pe[:, : x.shape[1]]
+         return self.dropout(x)
+
+
+ class RelPositionalEncoding(PositionalEncoding):
+     """Relative positional encoding module.
+     See: Appendix B in https://arxiv.org/abs/1901.02860
+     Args:
+         d_model (int): Embedding dimension.
+         dropout_rate (float): Dropout rate.
+         max_len (int): Maximum input length.
+     """
+
+     def __init__(self, d_model, dropout_rate, max_len=5000):
+         """Initialize class."""
+         super().__init__(d_model, dropout_rate, max_len, reverse=True)
+
+     def forward(self, x):
+         """Compute positional encoding.
+         Args:
+             x (paddle.Tensor): Input tensor (batch, time, `*`).
+         Returns:
+             paddle.Tensor: Encoded tensor (batch, time, `*`).
+             paddle.Tensor: Positional embedding tensor (1, time, `*`).
+         """
+         self.extend_pe(x)
+         x = x * self.xscale
+         pos_emb = self.pe[:, : x.shape[1]]
+         return self.dropout(x) + self.dropout(pos_emb)
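`PositionalEncoding` above scales the input by sqrt(d_model) before adding the table, while `RelPositionalEncoding` applies dropout to the scaled input and the positional term separately. A quick sketch of the absolute variant, assuming PaddleMIX is importable (dropout set to 0 so the call is deterministic):

import paddle
from paddlemix.models.diffsinger.modules.commons.espnet_positional_embedding import PositionalEncoding

pos_enc = PositionalEncoding(d_model=4, dropout_rate=0.0, max_len=16)
x = paddle.zeros([1, 10, 4])  # (batch, time, d_model)
y = pos_enc(x)                # x * sqrt(d_model) + pe[:, :10]
print(y.shape)                # [1, 10, 4]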
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/compat.py ADDED
@@ -0,0 +1,35 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ def get_backbone_type(root_config: dict, nested_config: dict = None):
+     if nested_config is None:
+         nested_config = root_config
+     return nested_config.get(
+         "backbone_type", root_config.get("backbone_type", root_config.get("diff_decoder_type", "wavenet"))
+     )
+
+
+ def get_backbone_args(config: dict, backbone_type: str):
+     args = config.get("backbone_args")
+     if args is not None:
+         return args
+     elif backbone_type == "wavenet":
+         return {
+             "num_layers": config.get("residual_layers"),
+             "num_channels": config.get("residual_channels"),
+             "dilation_cycle_length": config.get("dilation_cycle_length"),
+         }
+     else:
+         return None
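The two helpers implement a fallback chain for older configs: `backbone_type` is read from the nested config, then the root config, then the legacy `diff_decoder_type` key, defaulting to `wavenet`; `backbone_args` similarly falls back to the legacy per-key WaveNet options. A small sketch with a hypothetical legacy config:

root = {"diff_decoder_type": "wavenet", "residual_layers": 20,
        "residual_channels": 256, "dilation_cycle_length": 4}
bt = get_backbone_type(root)        # -> "wavenet" (via the legacy diff_decoder_type key)
args = get_backbone_args(root, bt)  # -> {"num_layers": 20, "num_channels": 256, "dilation_cycle_length": 4}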
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/__init__.py ADDED
@@ -0,0 +1,16 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from .ddpm import GaussianDiffusion, MultiVarianceDiffusion, PitchDiffusion
+ from .reflow import MultiVarianceRectifiedFlow, PitchRectifiedFlow, RectifiedFlow
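This facade lets downstream code import either generator family from one place, e.g.:

from paddlemix.models.diffsinger.modules.core import GaussianDiffusion, RectifiedFlow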
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/ddpm.py ADDED
@@ -0,0 +1,521 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ from collections import deque
+ from functools import partial
+ from typing import List, Tuple
+
+ import numpy as np
+
+ import paddle
+ from tqdm import tqdm
+
+ from paddlemix.models.diffsinger.modules.backbones import build_backbone
+ from paddlemix.models.diffsinger.utils import paddle_aux
+ from paddlemix.models.diffsinger.utils.hparams import hparams
+
+
+ def extract(a, t, x_shape):
+     b, *_ = tuple(t.shape)
+     out = a.take_along_axis(axis=-1, indices=t, broadcast=False)
+     return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+ def noise_like(shape, device, repeat=False):
+     repeat_noise = lambda: paddle.randn(shape=(1, *shape[1:])).tile(
+         repeat_times=[shape[0], *((1,) * (len(shape) - 1))]
+     )
+     noise = lambda: paddle.randn(shape=shape)
+     return repeat_noise() if repeat else noise()
+
+
+ def linear_beta_schedule(timesteps, max_beta=0.01):
+     """
+     linear schedule
+     """
+     betas = np.linspace(0.0001, max_beta, timesteps)
+     return betas
+
+
+ def cosine_beta_schedule(timesteps, s=0.008):
+     """
+     cosine schedule
+     as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
+     """
+     steps = timesteps + 1
+     x = np.linspace(0, steps, steps)
+     alphas_cumprod = np.cos((x / steps + s) / (1 + s) * np.pi * 0.5) ** 2
+     alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+     betas = 1 - alphas_cumprod[1:] / alphas_cumprod[:-1]
+     return np.clip(betas, a_min=0, a_max=0.999)
+
+
+ beta_schedule = {"cosine": cosine_beta_schedule, "linear": linear_beta_schedule}
+
+
+ class GaussianDiffusion(paddle.nn.Layer):
+     def __init__(
+         self,
+         out_dims,
+         num_feats=1,
+         timesteps=1000,
+         k_step=1000,
+         backbone_type=None,
+         backbone_args=None,
+         betas=None,
+         spec_min=None,
+         spec_max=None,
+     ):
+         super().__init__()
+         self.denoise_fn: paddle.nn.Layer = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
+         self.out_dims = out_dims
+         self.num_feats = num_feats
+         if betas is not None:
+             betas = betas.detach().cpu().numpy() if isinstance(betas, paddle.Tensor) else betas
+         else:
+             betas = beta_schedule[hparams["schedule_type"]](timesteps)
+         alphas = 1.0 - betas
+         alphas_cumprod = np.cumprod(alphas, axis=0)
+         alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
+         self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
+         if self.use_shallow_diffusion:
+             assert k_step <= timesteps, "K_step should not be larger than timesteps."
+         self.timesteps = timesteps
+         self.k_step = k_step if self.use_shallow_diffusion else timesteps
+         self.noise_list = deque(maxlen=4)
+         to_tensor = partial(paddle.to_tensor, dtype="float32")
+         self.register_buffer(name="betas", tensor=to_tensor(betas))
+         self.register_buffer(name="alphas_cumprod", tensor=to_tensor(alphas_cumprod))
+         self.register_buffer(name="alphas_cumprod_prev", tensor=to_tensor(alphas_cumprod_prev))
+         self.register_buffer(name="sqrt_alphas_cumprod", tensor=to_tensor(np.sqrt(alphas_cumprod)))
+         self.register_buffer(name="sqrt_one_minus_alphas_cumprod", tensor=to_tensor(np.sqrt(1.0 - alphas_cumprod)))
+         self.register_buffer(name="log_one_minus_alphas_cumprod", tensor=to_tensor(np.log(1.0 - alphas_cumprod)))
+         self.register_buffer(name="sqrt_recip_alphas_cumprod", tensor=to_tensor(np.sqrt(1.0 / alphas_cumprod)))
+         self.register_buffer(name="sqrt_recipm1_alphas_cumprod", tensor=to_tensor(np.sqrt(1.0 / alphas_cumprod - 1)))
+         posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+         self.register_buffer(name="posterior_variance", tensor=to_tensor(posterior_variance))
+         self.register_buffer(
+             name="posterior_log_variance_clipped", tensor=to_tensor(np.log(np.maximum(posterior_variance, 1e-20)))
+         )
+         self.register_buffer(
+             name="posterior_mean_coef1", tensor=to_tensor(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod))
+         )
+         self.register_buffer(
+             name="posterior_mean_coef2",
+             tensor=to_tensor((1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)),
+         )
+         spec_min = paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].transpose(
+             perm=paddle_aux.transpose_aux_func(
+                 paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+             )
+         )
+         spec_max = paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].transpose(
+             perm=paddle_aux.transpose_aux_func(
+                 paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+             )
+         )
+         self.register_buffer(name="spec_min", tensor=spec_min)
+         self.register_buffer(name="spec_max", tensor=spec_max)
+         self.time_scale_factor = self.timesteps
+         self.t_start = 1 - self.k_step / self.timesteps
+         factors = paddle.to_tensor(
+             data=[i for i in range(1, self.timesteps + 1) if self.timesteps % i == 0], dtype="int64"
+         )
+         self.register_buffer(name="timestep_factors", tensor=factors, persistable=False)
+
+     def q_mean_variance(self, x_start, t):
+         mean = extract(self.sqrt_alphas_cumprod, t, tuple(x_start.shape)) * x_start
+         variance = extract(1.0 - self.alphas_cumprod, t, tuple(x_start.shape))
+         log_variance = extract(self.log_one_minus_alphas_cumprod, t, tuple(x_start.shape))
+         return mean, variance, log_variance
+
+     def predict_start_from_noise(self, x_t, t, noise):
+         return (
+             extract(self.sqrt_recip_alphas_cumprod, t, tuple(x_t.shape)) * x_t
+             - extract(self.sqrt_recipm1_alphas_cumprod, t, tuple(x_t.shape)) * noise
+         )
+
+     def q_posterior(self, x_start, x_t, t):
+         posterior_mean = (
+             extract(self.posterior_mean_coef1, t, tuple(x_t.shape)) * x_start
+             + extract(self.posterior_mean_coef2, t, tuple(x_t.shape)) * x_t
+         )
+         posterior_variance = extract(self.posterior_variance, t, tuple(x_t.shape))
+         posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, tuple(x_t.shape))
+         return (posterior_mean, posterior_variance, posterior_log_variance_clipped)
+
+     def p_mean_variance(self, x, t, cond):
+         noise_pred = self.denoise_fn(x, t, cond=cond)
+         x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
+         model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+         return model_mean, posterior_variance, posterior_log_variance
+
+     @paddle.no_grad()
+     def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
+         b, *_, device = *tuple(x.shape), x.place
+         model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond)
+         noise = noise_like(tuple(x.shape), device, repeat_noise)
+         nonzero_mask = (1 - (t == 0).astype(dtype="float32")).reshape(b, *((1,) * (len(tuple(x.shape)) - 1)))
+         return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+
+     @paddle.no_grad()
+     def p_sample_ddim(self, x, t, interval, cond):
+         a_t = extract(self.alphas_cumprod, t, tuple(x.shape))
+         a_prev = extract(self.alphas_cumprod, paddle_aux.max(t - interval, paddle.zeros_like(x=t)), tuple(x.shape))
+         noise_pred = self.denoise_fn(x, t, cond=cond)
+         x_prev = a_prev.sqrt() * (
+             x / a_t.sqrt() + (((1 - a_prev) / a_prev).sqrt() - ((1 - a_t) / a_t).sqrt()) * noise_pred
+         )
+         return x_prev
+
+     @paddle.no_grad()
+     def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
+         """
+         Use the PLMS method from
+         [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
+         """
+
+         def get_x_pred(x, noise_t, t):
+             a_t = extract(self.alphas_cumprod, t, tuple(x.shape))
+             a_prev = extract(self.alphas_cumprod, paddle_aux.max(t - interval, paddle.zeros_like(x=t)), tuple(x.shape))
+             a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
+             x_delta = (a_prev - a_t) * (
+                 1 / (a_t_sq * (a_t_sq + a_prev_sq)) * x
+                 - 1 / (a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t
+             )
+             x_pred = x + x_delta
+             return x_pred
+
+         noise_list = self.noise_list
+         noise_pred = self.denoise_fn(x, t, cond=cond)
+         if len(noise_list) == 0:
+             x_pred = get_x_pred(x, noise_pred, t)
+             noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
+             noise_pred_prime = (noise_pred + noise_pred_prev) / 2
+         elif len(noise_list) == 1:
+             noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
+         elif len(noise_list) == 2:
+             noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
+         else:
+             noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
+         x_prev = get_x_pred(x, noise_pred_prime, t)
+         noise_list.append(noise_pred)
+         return x_prev
+
+     def q_sample(self, x_start, t, noise):
+         return (
+             extract(self.sqrt_alphas_cumprod, t, tuple(x_start.shape)) * x_start
+             + extract(self.sqrt_one_minus_alphas_cumprod, t, tuple(x_start.shape)) * noise
+         )
+
+     def p_losses(self, x_start, t, cond, noise=None):
+         if noise is None:
+             noise = paddle.randn(shape=x_start.shape, dtype=x_start.dtype)
+         x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+         x_recon = self.denoise_fn(x_noisy, t, cond)
+         return x_recon, noise
+
+     def inference(self, cond, b=1, x_start=None, device=None):
+         depth = hparams.get("K_step_infer", self.k_step)
+         speedup = hparams["diff_speedup"]
+         if speedup > 0:
+             assert depth % speedup == 0, f"Acceleration ratio must be a factor of diffusion depth {depth}."
+         noise = paddle.randn(shape=[b, self.num_feats, self.out_dims, tuple(cond.shape)[2]])
+         if self.use_shallow_diffusion:
+             t_max = min(depth, self.k_step)
+         else:
+             t_max = self.k_step
+         if t_max >= self.timesteps:
+             x = noise
+         elif t_max > 0:
+             assert x_start is not None, "Missing shallow diffusion source."
+             x = self.q_sample(x_start, paddle.full(shape=(b,), fill_value=t_max - 1, dtype="int64"), noise)
+         else:
+             assert x_start is not None, "Missing shallow diffusion source."
+             x = x_start
+         if speedup > 1 and t_max > 0:
+             algorithm = hparams["diff_accelerator"]
+             if algorithm == "dpm-solver":
+                 from inference.dpm_solver_pytorch import (
+                     DPM_Solver,
+                     NoiseScheduleVP,
+                     model_wrapper,
+                 )
+
+                 noise_schedule = NoiseScheduleVP(schedule="discrete", betas=self.betas[:t_max])
+
+                 def my_wrapper(fn):
+                     def wrapped(x, t, **kwargs):
+                         ret = fn(x, t, **kwargs)
+                         self.bar.update(1)
+                         return ret
+
+                     return wrapped
+
+                 model_fn = model_wrapper(
+                     my_wrapper(self.denoise_fn), noise_schedule, model_type="noise", model_kwargs={"cond": cond}
+                 )
+                 dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+                 steps = t_max // hparams["diff_speedup"]
+                 self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams["infer"], leave=False)
+                 x = dpm_solver.sample(x, steps=steps, order=2, skip_type="time_uniform", method="multistep")
+                 self.bar.close()
+             elif algorithm == "unipc":
+                 from inference.uni_pc import NoiseScheduleVP, UniPC, model_wrapper
+
+                 noise_schedule = NoiseScheduleVP(schedule="discrete", betas=self.betas[:t_max])
+
+                 def my_wrapper(fn):
+                     def wrapped(x, t, **kwargs):
+                         ret = fn(x, t, **kwargs)
+                         self.bar.update(1)
+                         return ret
+
+                     return wrapped
+
+                 model_fn = model_wrapper(
+                     my_wrapper(self.denoise_fn), noise_schedule, model_type="noise", model_kwargs={"cond": cond}
+                 )
+                 uni_pc = UniPC(model_fn, noise_schedule, variant="bh2")
+                 steps = t_max // hparams["diff_speedup"]
+                 self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams["infer"], leave=False)
+                 x = uni_pc.sample(x, steps=steps, order=2, skip_type="time_uniform", method="multistep")
+                 self.bar.close()
+             elif algorithm == "pndm":
+                 self.noise_list = deque(maxlen=4)
+                 iteration_interval = speedup
+                 for i in tqdm(
+                     reversed(range(0, t_max, iteration_interval)),
+                     desc="sample time step",
+                     total=t_max // iteration_interval,
+                     disable=not hparams["infer"],
+                     leave=False,
+                 ):
+                     x = self.p_sample_plms(
+                         x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), iteration_interval, cond=cond
+                     )
+             elif algorithm == "ddim":
+                 iteration_interval = speedup
+                 for i in tqdm(
+                     reversed(range(0, t_max, iteration_interval)),
+                     desc="sample time step",
+                     total=t_max // iteration_interval,
+                     disable=not hparams["infer"],
+                     leave=False,
+                 ):
+                     x = self.p_sample_ddim(
+                         x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), iteration_interval, cond=cond
+                     )
+             else:
+                 raise ValueError(f"Unsupported acceleration algorithm for DDPM: {algorithm}.")
+         else:
+             for i in tqdm(
+                 reversed(range(0, t_max)),
+                 desc="sample time step",
+                 total=t_max,
+                 disable=not hparams["infer"],
+                 leave=False,
+             ):
+                 x = self.p_sample(x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), cond)
+         x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 2, 3)).squeeze(axis=1)
+         return x
+
+     def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
+         """
+         conditioning diffusion, use fastspeech2 encoder output as the condition
+         """
+         cond = condition.transpose(perm=paddle_aux.transpose_aux_func(condition.ndim, 1, 2))
+         b, device = tuple(condition.shape)[0], condition.place
+         if not infer:
+             spec = self.norm_spec(gt_spec).transpose(
+                 perm=paddle_aux.transpose_aux_func(self.norm_spec(gt_spec).ndim, -2, -1)
+             )
+             if self.num_feats == 1:
+                 spec = spec[:, None, :, :]
+             t = paddle.randint(low=0, high=self.k_step, shape=(b,)).astype(dtype="int64")
+             x_recon, noise = self.p_losses(spec, t, cond=cond)
+             return x_recon, noise
+         else:
+             if src_spec is not None:
+                 spec = self.norm_spec(src_spec).transpose(
+                     perm=paddle_aux.transpose_aux_func(self.norm_spec(src_spec).ndim, -2, -1)
+                 )
+                 if self.num_feats == 1:
+                     spec = spec[:, None, :, :]
+             else:
+                 spec = None
+             x = self.inference(cond, b=b, x_start=spec, device=device)
+             return self.denorm_spec(x)
+
+     def norm_spec(self, x):
+         return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
+
+     def denorm_spec(self, x):
+         return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+
+
+ class RepetitiveDiffusion(GaussianDiffusion):
+     def __init__(
+         self,
+         vmin: (float | int | list),
+         vmax: (float | int | list),
+         repeat_bins: int,
+         timesteps=1000,
+         k_step=1000,
+         backbone_type=None,
+         backbone_args=None,
+         betas=None,
+     ):
+         assert isinstance(vmin, (float, int)) and isinstance(vmax, (float, int)) or len(vmin) == len(vmax)
+         num_feats = 1 if isinstance(vmin, (float, int)) else len(vmin)
+         spec_min = [vmin] if num_feats == 1 else [[v] for v in vmin]
+         spec_max = [vmax] if num_feats == 1 else [[v] for v in vmax]
+         self.repeat_bins = repeat_bins
+         super().__init__(
+             out_dims=repeat_bins,
+             num_feats=num_feats,
+             timesteps=timesteps,
+             k_step=k_step,
+             backbone_type=backbone_type,
+             backbone_args=backbone_args,
+             betas=betas,
+             spec_min=spec_min,
+             spec_max=spec_max,
+         )
+
+     def norm_spec(self, x):
+         """
+
+         :param x: [B, T] or [B, F, T]
+         :return [B, T, R] or [B, F, T, R]
+         """
+         if self.num_feats == 1:
+             repeats = [1, 1, self.repeat_bins]
+         else:
+             repeats = [1, 1, 1, self.repeat_bins]
+         return super().norm_spec(x.unsqueeze(axis=-1).tile(repeat_times=repeats))
+
+     def denorm_spec(self, x):
+         """
+
+         :param x: [B, T, R] or [B, F, T, R]
+         :return [B, T] or [B, F, T]
+         """
+         return super().denorm_spec(x).mean(axis=-1)
+
+
+ class PitchDiffusion(RepetitiveDiffusion):
+     def __init__(
+         self,
+         vmin: float,
+         vmax: float,
+         cmin: float,
+         cmax: float,
+         repeat_bins,
+         timesteps=1000,
+         k_step=1000,
+         backbone_type=None,
+         backbone_args=None,
+         betas=None,
+     ):
+         self.vmin = vmin
+         self.vmax = vmax
+         self.cmin = cmin
+         self.cmax = cmax
+         super().__init__(
+             vmin=vmin,
+             vmax=vmax,
+             repeat_bins=repeat_bins,
+             timesteps=timesteps,
+             k_step=k_step,
+             backbone_type=backbone_type,
+             backbone_args=backbone_args,
+             betas=betas,
+         )
+
+     def norm_spec(self, x):
+         return super().norm_spec(x.clip(min=self.cmin, max=self.cmax))
+
+     def denorm_spec(self, x):
+         return super().denorm_spec(x).clip(min=self.cmin, max=self.cmax)
+
+
+ class MultiVarianceDiffusion(RepetitiveDiffusion):
+     def __init__(
+         self,
+         ranges: List[Tuple[float, float]],
+         clamps: List[Tuple[float | None, float | None] | None],
+         repeat_bins,
+         timesteps=1000,
+         k_step=1000,
+         backbone_type=None,
+         backbone_args=None,
+         betas=None,
+     ):
+         assert len(ranges) == len(clamps)
+         self.clamps = clamps
+         vmin = [r[0] for r in ranges]
+         vmax = [r[1] for r in ranges]
+         if len(vmin) == 1:
+             vmin = vmin[0]
+         if len(vmax) == 1:
+             vmax = vmax[0]
+         super().__init__(
+             vmin=vmin,
+             vmax=vmax,
+             repeat_bins=repeat_bins,
+             timesteps=timesteps,
+             k_step=k_step,
+             backbone_type=backbone_type,
+             backbone_args=backbone_args,
+             betas=betas,
+         )
+
+     def clamp_spec(self, xs: (list | tuple)):
+         clamped = []
+         for x, c in zip(xs, self.clamps):
+             if c is None:
+                 clamped.append(x)
+                 continue
+             clamped.append(x.clip(min=c[0], max=c[1]))
+         return clamped
+
+     def norm_spec(self, xs: (list | tuple)):
+         """
+
+         :param xs: sequence of [B, T]
+         :return: [B, F, T] => super().norm_spec(xs) => [B, F, T, R]
+         """
+         assert len(xs) == self.num_feats
+         clamped = self.clamp_spec(xs)
+         xs = paddle.stack(x=clamped, axis=1)
+         if self.num_feats == 1:
+             xs = xs.squeeze(axis=1)
+         return super().norm_spec(xs)
+
+     def denorm_spec(self, xs):
+         """
+
+         :param xs: [B, T, R] or [B, F, T, R] => super().denorm_spec(xs) => [B, T] or [B, F, T]
+         :return: sequence of [B, T]
+         """
+         xs = super().denorm_spec(xs)
+         if self.num_feats == 1:
+             xs = [xs]
+         else:
+             xs = xs.unbind(axis=1)
+         assert len(xs) == self.num_feats
+         return self.clamp_spec(xs)
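`q_sample` above is the closed-form forward process x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, where alpha_bar is the cumulative product of (1 - beta). A NumPy sketch of the same identity under the linear schedule defined at the top of the file:

import numpy as np

betas = np.linspace(0.0001, 0.01, 1000)  # linear_beta_schedule(1000)
alpha_bar = np.cumprod(1.0 - betas)
t = 500
x0 = np.ones(4)                          # stand-in for a normalized spectrogram slice
eps = np.random.randn(4)
x_t = np.sqrt(alpha_bar[t]) * x0 + np.sqrt(1.0 - alpha_bar[t]) * eps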
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/reflow.py ADDED
@@ -0,0 +1,311 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import sys
+ from typing import List, Tuple
+ import paddle
+
+ from tqdm import tqdm
+ from paddlemix.models.diffsinger.modules.backbones import build_backbone
+ from paddlemix.models.diffsinger.utils.hparams import hparams
+ from paddlemix.models.diffsinger.utils import paddle_aux
+
+ class RectifiedFlow(paddle.nn.Layer):
+     def __init__(
+         self,
+         out_dims,
+         num_feats=1,
+         t_start=0.0,
+         time_scale_factor=1000,
+         backbone_type=None,
+         backbone_args=None,
+         spec_min=None,
+         spec_max=None,
+     ):
+         super().__init__()
+         self.velocity_fn: paddle.nn.Layer = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
+         self.out_dims = out_dims
+         self.num_feats = num_feats
+         self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
+         if self.use_shallow_diffusion:
+             assert 0.0 <= t_start <= 1.0, "T_start should be in [0, 1]."
+         else:
+             t_start = 0.0
+         self.t_start = t_start
+         self.time_scale_factor = time_scale_factor
+         spec_min = paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].transpose(
+             perm=paddle_aux.transpose_aux_func(
+                 paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+             )
+         )
+         spec_max = paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].transpose(
+             perm=paddle_aux.transpose_aux_func(
+                 paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].ndim, -3, -2
+             )
+         )
+         self.register_buffer(name="spec_min", tensor=spec_min, persistable=False)
+         self.register_buffer(name="spec_max", tensor=spec_max, persistable=False)
+
+     def p_losses(self, x_end, t, cond):
+         x_start = paddle.randn(shape=x_end.shape, dtype=x_end.dtype)
+         x_t = x_start + t[:, None, None, None] * (x_end - x_start)
+         v_pred = self.velocity_fn(x_t, t * self.time_scale_factor, cond)
+         return v_pred, x_end - x_start
+
+     def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
+         cond = condition.transpose(perm=paddle_aux.transpose_aux_func(condition.ndim, 1, 2))
+         b, device = tuple(condition.shape)[0], condition.place
+         if not infer:
+             spec = self.norm_spec(gt_spec).transpose(
+                 perm=paddle_aux.transpose_aux_func(self.norm_spec(gt_spec).ndim, -2, -1)
+             )
+             if self.num_feats == 1:
+                 spec = spec[:, None, :, :]
+             t = self.t_start + (1.0 - self.t_start) * paddle.rand(shape=(b,))
+             v_pred, v_gt = self.p_losses(spec, t, cond=cond)
+             return v_pred, v_gt, t
+         else:
+             if src_spec is not None:
+                 spec = self.norm_spec(src_spec).transpose(
+                     perm=paddle_aux.transpose_aux_func(self.norm_spec(src_spec).ndim, -2, -1)
+                 )
+                 if self.num_feats == 1:
+                     spec = spec[:, None, :, :]
+             else:
+                 spec = None
+             x = self.inference(cond, b=b, x_end=spec, device=device)
+             return self.denorm_spec(x)
+
+     @paddle.no_grad()
+     def sample_euler(self, x, t, dt, cond):
+         x += self.velocity_fn(x, self.time_scale_factor * t, cond) * dt
+         t += dt
+         return x, t
+
+     @paddle.no_grad()
+     def sample_rk2(self, x, t, dt, cond):
+         k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
+         k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+         x += k_2 * dt
+         t += dt
+         return x, t
+
+     @paddle.no_grad()
+     def sample_rk4(self, x, t, dt, cond):
+         k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
+         k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+         k_3 = self.velocity_fn(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+         k_4 = self.velocity_fn(x + k_3 * dt, self.time_scale_factor * (t + dt), cond)
+         x += (k_1 + 2 * k_2 + 2 * k_3 + k_4) * dt / 6
+         t += dt
+         return x, t
+
+     @paddle.no_grad()
+     def sample_rk5(self, x, t, dt, cond):
+         k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
+         k_2 = self.velocity_fn(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond)
+         k_3 = self.velocity_fn(x + 0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond)
+         k_4 = self.velocity_fn(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
+         k_5 = self.velocity_fn(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond)
+         k_6 = self.velocity_fn(
+             x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, self.time_scale_factor * (t + dt), cond
+         )
+         x += (7 * k_1 + 32 * k_3 + 12 * k_4 + 32 * k_5 + 7 * k_6) * dt / 90
+         t += dt
+         return x, t
+
+     @paddle.no_grad()
+     def inference(self, cond, b=1, x_end=None, device=None):
+         noise = paddle.randn(shape=[b, self.num_feats, self.out_dims, tuple(cond.shape)[2]])
+         t_start = hparams.get("T_start_infer", self.t_start)
+         if self.use_shallow_diffusion and t_start > 0:
+             assert x_end is not None, "Missing shallow diffusion source."
+             if t_start >= 1.0:
+                 t_start = 1.0
+                 x = x_end
+             else:
+                 x = t_start * x_end + (1 - t_start) * noise
+         else:
+             t_start = 0.0
+             x = noise
+         algorithm = hparams["sampling_algorithm"]
+         infer_step = hparams["sampling_steps"]
+         if t_start < 1:
+             dt = (1.0 - t_start) / max(1, infer_step)
+             algorithm_fn = {
+                 "euler": self.sample_euler,
+                 "rk2": self.sample_rk2,
+                 "rk4": self.sample_rk4,
+                 "rk5": self.sample_rk5,
+             }.get(algorithm)
+             if algorithm_fn is None:
+                 raise ValueError(f"Unsupported algorithm for Rectified Flow: {algorithm}.")
+             dts = paddle.to_tensor(data=[dt]).to(x)
+             for i in tqdm(
+                 range(infer_step), desc="sample time step", total=infer_step, disable=not hparams["infer"], leave=False
+             ):
+                 x, _ = algorithm_fn(x, t_start + i * dts, dt, cond)
+         x = x.astype(dtype="float32")
+         x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 2, 3)).squeeze(axis=1)
+         return x
+
+     def norm_spec(self, x):
+         return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
+
+     def denorm_spec(self, x):
+         return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+
+
+ class RepetitiveRectifiedFlow(RectifiedFlow):
+     def __init__(
+         self,
+         vmin: (float | int | list),
+         vmax: (float | int | list),
+         repeat_bins: int,
+         time_scale_factor=1000,
+         backbone_type=None,
+         backbone_args=None,
+     ):
+         assert isinstance(vmin, (float, int)) and isinstance(vmax, (float, int)) or len(vmin) == len(vmax)
+         num_feats = 1 if isinstance(vmin, (float, int)) else len(vmin)
+         spec_min = [vmin] if num_feats == 1 else [[v] for v in vmin]
+         spec_max = [vmax] if num_feats == 1 else [[v] for v in vmax]
+         self.repeat_bins = repeat_bins
+         super().__init__(
+             out_dims=repeat_bins,
+             num_feats=num_feats,
+             time_scale_factor=time_scale_factor,
+             backbone_type=backbone_type,
+             backbone_args=backbone_args,
+             spec_min=spec_min,
+             spec_max=spec_max,
+         )
+
+     def norm_spec(self, x):
+         """
+
+         :param x: [B, T] or [B, F, T]
+         :return [B, T, R] or [B, F, T, R]
+         """
+         if self.num_feats == 1:
+             repeats = [1, 1, self.repeat_bins]
+         else:
+             repeats = [1, 1, 1, self.repeat_bins]
+         return super().norm_spec(x.unsqueeze(axis=-1).tile(repeat_times=repeats))
+
+     def denorm_spec(self, x):
+         """
+
+         :param x: [B, T, R] or [B, F, T, R]
+         :return [B, T] or [B, F, T]
+         """
+         return super().denorm_spec(x).mean(axis=-1)
+
+
+ class PitchRectifiedFlow(RepetitiveRectifiedFlow):
+     def __init__(
+         self,
+         vmin: float,
+         vmax: float,
+         cmin: float,
+         cmax: float,
+         repeat_bins,
+         time_scale_factor=1000,
+         backbone_type=None,
+         backbone_args=None,
+     ):
+         self.vmin = vmin
+         self.vmax = vmax
+         self.cmin = cmin
+         self.cmax = cmax
+         super().__init__(
+             vmin=vmin,
+             vmax=vmax,
+             repeat_bins=repeat_bins,
+             time_scale_factor=time_scale_factor,
+             backbone_type=backbone_type,
+             backbone_args=backbone_args,
+         )
+
+     def norm_spec(self, x):
+         return super().norm_spec(x.clip(min=self.cmin, max=self.cmax))
+
+     def denorm_spec(self, x):
+         return super().denorm_spec(x).clip(min=self.cmin, max=self.cmax)
+
+
+ class MultiVarianceRectifiedFlow(RepetitiveRectifiedFlow):
+     def __init__(
+         self,
+         ranges: List[Tuple[float, float]],
+         clamps: List[Tuple[float | None, float | None] | None],
+         repeat_bins,
+         time_scale_factor=1000,
+         backbone_type=None,
+         backbone_args=None,
+     ):
+         assert len(ranges) == len(clamps)
+         self.clamps = clamps
+         vmin = [r[0] for r in ranges]
+         vmax = [r[1] for r in ranges]
+         if len(vmin) == 1:
+             vmin = vmin[0]
+         if len(vmax) == 1:
+             vmax = vmax[0]
+         super().__init__(
+             vmin=vmin,
+             vmax=vmax,
+             repeat_bins=repeat_bins,
+             time_scale_factor=time_scale_factor,
+             backbone_type=backbone_type,
+             backbone_args=backbone_args,
+         )
+
+     def clamp_spec(self, xs: (list | tuple)):
+         clamped = []
+         for x, c in zip(xs, self.clamps):
+             if c is None:
+                 clamped.append(x)
+                 continue
+             clamped.append(x.clip(min=c[0], max=c[1]))
+         return clamped
+
+     def norm_spec(self, xs: (list | tuple)):
+         """
+
+         :param xs: sequence of [B, T]
+         :return: [B, F, T] => super().norm_spec(xs) => [B, F, T, R]
+         """
+         assert len(xs) == self.num_feats
+         clamped = self.clamp_spec(xs)
+         xs = paddle.stack(x=clamped, axis=1)
+         if self.num_feats == 1:
+             xs = xs.squeeze(axis=1)
+         return super().norm_spec(xs)
+
+     def denorm_spec(self, xs):
+         """
+
+         :param xs: [B, T, R] or [B, F, T, R] => super().denorm_spec(xs) => [B, T] or [B, F, T]
+         :return: sequence of [B, T]
+         """
+         xs = super().denorm_spec(xs)
+         if self.num_feats == 1:
+             xs = [xs]
+         else:
+             xs = xs.unbind(axis=1)
+         assert len(xs) == self.num_feats
+         return self.clamp_spec(xs)
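The samplers above integrate dx/dt = v(x, t) from t = 0 (noise) to t = 1 (data) with a fixed step size; `sample_euler` is the first-order member of that family. A toy sketch of the same update with a hypothetical contracting velocity field standing in for the learned backbone:

import paddle

def velocity(x, t):  # stand-in for self.velocity_fn
    return -x        # hypothetical field that pulls x toward 0

x = paddle.randn([1, 4])
steps = 10
dt = 1.0 / steps
t = 0.0
for _ in range(steps):
    x = x + velocity(x, t) * dt  # Euler step, as in sample_euler
    t += dt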
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/acoustic_encoder.py ADDED
@@ -0,0 +1,110 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import paddle
+
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
+     NormalInitEmbedding as Embedding,
+ )
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
+     XavierUniformInitLinear as Linear,
+ )
+ from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
+     FastSpeech2Encoder,
+     mel2ph_to_dur,
+ )
+ from paddlemix.models.diffsinger.utils.hparams import hparams
+ from paddlemix.models.diffsinger.utils.text_encoder import PAD_INDEX
+
+
+ class FastSpeech2Acoustic(paddle.nn.Layer):
+     def __init__(self, vocab_size):
+         super().__init__()
+         self.txt_embed = Embedding(vocab_size, hparams["hidden_size"], PAD_INDEX)
+         self.dur_embed = Linear(1, hparams["hidden_size"])
+         self.encoder = FastSpeech2Encoder(
+             hidden_size=hparams["hidden_size"],
+             num_layers=hparams["enc_layers"],
+             ffn_kernel_size=hparams["enc_ffn_kernel_size"],
+             ffn_act=hparams["ffn_act"],
+             dropout=hparams["dropout"],
+             num_heads=hparams["num_heads"],
+             use_pos_embed=hparams["use_pos_embed"],
+             rel_pos=hparams["rel_pos"],
+         )
+         self.pitch_embed = Linear(1, hparams["hidden_size"])
+         self.variance_embed_list = []
+         self.use_energy_embed = hparams.get("use_energy_embed", False)
+         self.use_breathiness_embed = hparams.get("use_breathiness_embed", False)
+         self.use_voicing_embed = hparams.get("use_voicing_embed", False)
+         self.use_tension_embed = hparams.get("use_tension_embed", False)
+         if self.use_energy_embed:
+             self.variance_embed_list.append("energy")
+         if self.use_breathiness_embed:
+             self.variance_embed_list.append("breathiness")
+         if self.use_voicing_embed:
+             self.variance_embed_list.append("voicing")
+         if self.use_tension_embed:
+             self.variance_embed_list.append("tension")
+         self.use_variance_embeds = len(self.variance_embed_list) > 0
+         if self.use_variance_embeds:
+             self.variance_embeds = paddle.nn.LayerDict(
+                 sublayers={v_name: Linear(1, hparams["hidden_size"]) for v_name in self.variance_embed_list}
+             )
+         self.use_key_shift_embed = hparams.get("use_key_shift_embed", False)
+         if self.use_key_shift_embed:
+             self.key_shift_embed = Linear(1, hparams["hidden_size"])
+         self.use_speed_embed = hparams.get("use_speed_embed", False)
+         if self.use_speed_embed:
+             self.speed_embed = Linear(1, hparams["hidden_size"])
+         self.use_spk_id = hparams["use_spk_id"]
+         if self.use_spk_id:
+             self.spk_embed = Embedding(hparams["num_spk"], hparams["hidden_size"])
+
+     def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances):
+         if self.use_variance_embeds:
+             variance_embeds = paddle.stack(
+                 x=[self.variance_embeds[v_name](variances[v_name][:, :, None]) for v_name in self.variance_embed_list],
+                 axis=-1,
+             ).sum(axis=-1)
+             condition += variance_embeds
+         if self.use_key_shift_embed:
+             key_shift_embed = self.key_shift_embed(key_shift[:, :, None])
+             condition += key_shift_embed
+         if self.use_speed_embed:
+             speed_embed = self.speed_embed(speed[:, :, None])
+             condition += speed_embed
+         return condition
+
+     def forward(self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, spk_embed_id=None, **kwargs):
+         txt_embed = self.txt_embed(txt_tokens)
+         # dur = mel2ph_to_dur(mel2ph, tuple(txt_tokens.shape)[1]).float()
+         dur = paddle.cast(mel2ph_to_dur(mel2ph, tuple(txt_tokens.shape)[1]), dtype="float32")
+         dur_embed = self.dur_embed(dur[:, :, None])
+         encoder_out = self.encoder(txt_embed, dur_embed, txt_tokens == 0)
+         encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
+         mel2ph_ = mel2ph[..., None].tile(repeat_times=[1, 1, tuple(encoder_out.shape)[-1]])
+         condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=mel2ph_, broadcast=False)
+         if self.use_spk_id:
+             spk_mix_embed = kwargs.get("spk_mix_embed")
+             if spk_mix_embed is not None:
+                 spk_embed = spk_mix_embed
+             else:
+                 spk_embed = self.spk_embed(spk_embed_id)[:, None, :]
+             condition += spk_embed
+         f0_mel = (1 + f0 / 700).log()
+         pitch_embed = self.pitch_embed(f0_mel[:, :, None])
+         condition += pitch_embed
+         condition = self.forward_variance_embedding(condition, key_shift=key_shift, speed=speed, **kwargs)
+         return condition
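The `mel2ph` gather in `forward` stretches phoneme-level encoder states to frame level: a zero row is padded in at index 0 so that `mel2ph == 0` selects an all-zero state, and every frame copies the row its index points to. A small sketch mirroring those two calls (toy shapes, same pad/take_along_axis arguments as above):

import paddle

encoder_out = paddle.arange(6, dtype="float32").reshape([1, 3, 2])  # (B, T_ph, H)
encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
mel2ph = paddle.to_tensor([[1, 1, 2, 3, 0]])                        # (B, T_mel)
mel2ph_ = mel2ph[..., None].tile(repeat_times=[1, 1, encoder_out.shape[-1]])
condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=mel2ph_, broadcast=False)
print(condition.shape)  # [1, 5, 2]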
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/param_adaptor.py ADDED
@@ -0,0 +1,88 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import sys
+
+ import paddle
+
+ import paddlemix.models.diffsinger.modules.compat as compat
+ from paddlemix.models.diffsinger.modules.core.ddpm import MultiVarianceDiffusion
+ from paddlemix.models.diffsinger.utils import filter_kwargs
+ from paddlemix.models.diffsinger.utils.hparams import hparams
+
+ VARIANCE_CHECKLIST = ["energy", "breathiness", "voicing", "tension"]
+
+
+ class ParameterAdaptorModule(paddle.nn.Layer):
+     def __init__(self):
+         super().__init__()
+         self.variance_prediction_list = []
+         self.predict_energy = hparams.get("predict_energy", False)
+         self.predict_breathiness = hparams.get("predict_breathiness", False)
+         self.predict_voicing = hparams.get("predict_voicing", False)
+         self.predict_tension = hparams.get("predict_tension", False)
+         if self.predict_energy:
+             self.variance_prediction_list.append("energy")
+         if self.predict_breathiness:
+             self.variance_prediction_list.append("breathiness")
+         if self.predict_voicing:
+             self.variance_prediction_list.append("voicing")
+         if self.predict_tension:
+             self.variance_prediction_list.append("tension")
+         self.predict_variances = len(self.variance_prediction_list) > 0
+
+     def build_adaptor(self, cls=MultiVarianceDiffusion):
+         ranges = []
+         clamps = []
+         if self.predict_energy:
+             ranges.append((hparams["energy_db_min"], hparams["energy_db_max"]))
+             clamps.append((hparams["energy_db_min"], 0.0))
+         if self.predict_breathiness:
+             ranges.append((hparams["breathiness_db_min"], hparams["breathiness_db_max"]))
+             clamps.append((hparams["breathiness_db_min"], 0.0))
+         if self.predict_voicing:
+             ranges.append((hparams["voicing_db_min"], hparams["voicing_db_max"]))
+             clamps.append((hparams["voicing_db_min"], 0.0))
+         if self.predict_tension:
+             ranges.append((hparams["tension_logit_min"], hparams["tension_logit_max"]))
+             clamps.append((hparams["tension_logit_min"], hparams["tension_logit_max"]))
+         variances_hparams = hparams["variances_prediction_args"]
+         total_repeat_bins = variances_hparams["total_repeat_bins"]
+         assert (
+             total_repeat_bins % len(self.variance_prediction_list) == 0
+         ), f"Total number of repeat bins must be divisible by number of variance parameters ({len(self.variance_prediction_list)})."
+         repeat_bins = total_repeat_bins // len(self.variance_prediction_list)
+         backbone_type = compat.get_backbone_type(hparams, nested_config=variances_hparams)
+         backbone_args = compat.get_backbone_args(variances_hparams, backbone_type=backbone_type)
+         kwargs = filter_kwargs(
+             {
+                 "ranges": ranges,
+                 "clamps": clamps,
+                 "repeat_bins": repeat_bins,
+                 "timesteps": hparams.get("timesteps"),
+                 "time_scale_factor": hparams.get("time_scale_factor"),
+                 "backbone_type": backbone_type,
+                 "backbone_args": backbone_args,
+             },
+             cls,
+         )
+         return cls(**kwargs)
+
+     def collect_variance_inputs(self, **kwargs) -> list:
+         return [kwargs.get(name) for name in self.variance_prediction_list]
+
+     def collect_variance_outputs(self, variances: (list | tuple)) -> dict:
+         return {name: pred for name, pred in zip(self.variance_prediction_list, variances)}
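`build_adaptor` splits `total_repeat_bins` evenly across whichever variance curves are enabled, so the per-curve bin budget is implicit in the config. A sketch of that bookkeeping with a hypothetical two-curve setup:

variance_prediction_list = ["energy", "breathiness"]  # hypothetical config
total_repeat_bins = 48
assert total_repeat_bins % len(variance_prediction_list) == 0
repeat_bins = total_repeat_bins // len(variance_prediction_list)  # 24 bins per curve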
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/tts_modules.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import sys
17
+
18
+ import paddle
19
+
20
+ from paddlemix.models.diffsinger.utils import paddle_aux
21
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
22
+ EncSALayer,
23
+ SinusoidalPositionalEmbedding,
24
+ )
25
+ from paddlemix.models.diffsinger.modules.commons.espnet_positional_embedding import (
26
+ RelPositionalEncoding,
27
+ )
28
+
29
+ DEFAULT_MAX_SOURCE_POSITIONS = 2000
30
+ DEFAULT_MAX_TARGET_POSITIONS = 2000
31
+
32
+
33
+ class TransformerEncoderLayer(paddle.nn.Layer):
34
+ def __init__(self, hidden_size, dropout, kernel_size=None, act="gelu", num_heads=2):
35
+ super().__init__()
36
+ self.op = EncSALayer(
37
+ hidden_size,
38
+ num_heads,
39
+ dropout=dropout,
40
+ attention_dropout=0.0,
41
+ relu_dropout=dropout,
42
+ kernel_size=kernel_size,
43
+ act=act,
44
+ )
45
+
46
+ def forward(self, x, **kwargs):
47
+ return self.op(x, **kwargs)
48
+
49
+
50
+ class LayerNorm(paddle.nn.LayerNorm):
51
+ """Layer normalization module.
52
+ :param int nout: output dim size
53
+ :param int dim: dimension to be normalized
54
+ """
55
+
56
+ def __init__(self, nout, dim=-1):
57
+ """Construct an LayerNorm object."""
58
+ super(LayerNorm, self).__init__(nout, eps=1e-12)
59
+ self.dim = dim
60
+
61
+ def forward(self, x):
62
+ """Apply layer normalization.
63
+ :param torch.Tensor x: input tensor
64
+ :return: layer normalized tensor
65
+ :rtype torch.Tensor
66
+ """
67
+ if self.dim == -1:
68
+ return super(LayerNorm, self).forward(x)
69
+ return (
70
+ super(LayerNorm, self)
71
+ .forward(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, -1)))
72
+ .transpose(
73
+ perm=paddle_aux.transpose_aux_func(
74
+ super(LayerNorm, self)
75
+ .forward(x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, -1)))
76
+ .ndim,
77
+ 1,
78
+ -1,
79
+ )
80
+ )
81
+ )
82
+
83
+
84
+ class DurationPredictor(paddle.nn.Layer):
85
+ """Duration predictor module.
86
+ This is a module of duration predictor described in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
87
+ The duration predictor predicts a duration of each frame in log domain from the hidden embeddings of encoder.
88
+ .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
89
+ https://arxiv.org/pdf/1905.09263.pdf
90
+ Note:
91
+ The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`,
92
+ the outputs are calculated in log domain but in `inference`, those are calculated in linear domain.
93
+ """
94
+
95
+ def __init__(
96
+ self, in_dims, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, dur_loss_type="mse"
97
+ ):
98
+ """Initialize duration predictor module.
99
+ Args:
100
+ in_dims (int): Input dimension.
101
+ n_layers (int, optional): Number of convolutional layers.
102
+ n_chans (int, optional): Number of channels of convolutional layers.
103
+ kernel_size (int, optional): Kernel size of convolutional layers.
104
+ dropout_rate (float, optional): Dropout rate.
105
+ offset (float, optional): Offset value to avoid nan in log domain.
+ dur_loss_type (str, optional): Loss type for duration prediction ("mse" or "huber").
106
+ """
107
+ super(DurationPredictor, self).__init__()
108
+ self.offset = offset
109
+ self.conv = paddle.nn.LayerList()
110
+ self.kernel_size = kernel_size
111
+ for idx in range(n_layers):
112
+ in_chans = in_dims if idx == 0 else n_chans
113
+ self.conv.append(
114
+ paddle.nn.Sequential(
115
+ paddle.nn.Identity(),
116
+ paddle.nn.Conv1D(
117
+ in_channels=in_chans,
118
+ out_channels=n_chans,
119
+ kernel_size=kernel_size,
120
+ stride=1,
121
+ padding=kernel_size // 2,
122
+ ),
123
+ paddle.nn.ReLU(),
124
+ LayerNorm(n_chans, dim=1),
125
+ paddle.nn.Dropout(p=dropout_rate),
126
+ )
127
+ )
128
+ self.loss_type = dur_loss_type
129
+ if self.loss_type in ["mse", "huber"]:
130
+ self.out_dims = 1
131
+ else:
132
+ raise NotImplementedError()
133
+ self.linear = paddle.nn.Linear(in_features=n_chans, out_features=self.out_dims)
134
+
135
+ def out2dur(self, xs):
136
+ if self.loss_type in ["mse", "huber"]:
137
+ dur = xs.squeeze(axis=-1).exp() - self.offset
138
+ else:
139
+ raise NotImplementedError()
140
+ return dur
141
+
142
+ def forward(self, xs, x_masks=None, infer=True):
143
+ """Calculate forward propagation.
144
+ Args:
145
+ xs (Tensor): Batch of input sequences (B, Tmax, idim).
146
+ x_masks (BoolTensor, optional): Batch of masks indicating padded part (B, Tmax).
147
+ infer (bool): Whether to run in inference mode (negative predictions are clipped to zero).
148
+ Returns:
149
+ (train) FloatTensor, (infer) LongTensor: Batch of predicted durations in linear domain (B, Tmax).
150
+ """
151
+ xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
+ # build the non-padding mask up front; fall back to all-ones when no mask is given
+ if x_masks is not None:
+ masks = 1 - x_masks.astype(dtype="float32")
+ else:
+ masks = paddle.ones(shape=[tuple(xs.shape)[0], tuple(xs.shape)[2]], dtype="float32")
+ masks_ = masks[:, None, :]
+ for f in self.conv:
+ xs = f(xs)
+ xs = xs * masks_
+ xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1)))
+ xs = xs * masks[:, :, None]
160
+ dur_pred = self.out2dur(xs)
161
+ if infer:
162
+ dur_pred = dur_pred.clip(min=0.0)
163
+ return dur_pred
164
+
165
+
166
+ class VariancePredictor(paddle.nn.Layer):
167
+ def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, dropout_rate=0.1):
168
+ """Initialize variance predictor module.
169
+ Args:
+ vmin (float): Minimum of the predicted value range.
+ vmax (float): Maximum of the predicted value range.
+ in_dims (int): Input dimension.
171
+ n_layers (int, optional): Number of convolutional layers.
172
+ n_chans (int, optional): Number of channels of convolutional layers.
173
+ kernel_size (int, optional): Kernel size of convolutional layers.
174
+ dropout_rate (float, optional): Dropout rate.
175
+ """
176
+ super(VariancePredictor, self).__init__()
177
+ self.vmin = vmin
178
+ self.vmax = vmax
179
+ self.conv = paddle.nn.LayerList()
180
+ self.kernel_size = kernel_size
181
+ for idx in range(n_layers):
182
+ in_chans = in_dims if idx == 0 else n_chans
183
+ self.conv.append(
184
+ paddle.nn.Sequential(
185
+ paddle.nn.Conv1D(
186
+ in_channels=in_chans,
187
+ out_channels=n_chans,
188
+ kernel_size=kernel_size,
189
+ stride=1,
190
+ padding=kernel_size // 2,
191
+ ),
192
+ paddle.nn.ReLU(),
193
+ LayerNorm(n_chans, dim=1),
194
+ paddle.nn.Dropout(p=dropout_rate),
195
+ )
196
+ )
197
+ self.linear = paddle.nn.Linear(in_features=n_chans, out_features=1)
198
+ self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096)
199
+ self.pos_embed_alpha = paddle.base.framework.EagerParamBase.from_tensor(
200
+ tensor=paddle.to_tensor(data=[1], dtype="float32")
201
+ )
202
+
203
+ def out2value(self, xs):
204
+ return (xs + 1) / 2 * (self.vmax - self.vmin) + self.vmin
205
+
206
+ def forward(self, xs, infer=True):
207
+ """
208
+ :param xs: [B, T, H]
209
+ :param infer: whether to run in inference mode (outputs are mapped back to [vmin, vmax])
210
+ :return: [B, T]
211
+ """
212
+ positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
213
+ xs = xs + positions
214
+ xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
215
+ for f in self.conv:
216
+ xs = f(xs)
217
+ xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))).squeeze(axis=-1)
218
+ if infer:
219
+ xs = self.out2value(xs)
220
+ return xs
221
+
222
+
223
+ class PitchPredictor(paddle.nn.Layer):
224
+ def __init__(
225
+ self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, dropout_rate=0.1
226
+ ):
227
+ """Initialize pitch predictor module.
228
+ Args:
+ vmin (float): Minimum pitch value.
+ vmax (float): Maximum pitch value.
+ num_bins (int): Number of pitch bins between vmin and vmax.
+ deviation (float): Deviation of the pitch bins, in the same units as vmin/vmax.
+ in_dims (int): Input dimension.
230
+ n_layers (int, optional): Number of convolutional layers.
231
+ n_chans (int, optional): Number of channels of convolutional layers.
232
+ kernel_size (int, optional): Kernel size of convolutional layers.
233
+ dropout_rate (float, optional): Dropout rate.
234
+ """
235
+ super(PitchPredictor, self).__init__()
236
+ self.vmin = vmin
237
+ self.vmax = vmax
238
+ self.interval = (vmax - vmin) / (num_bins - 1)
239
+ self.sigma = deviation / self.interval
240
+ self.register_buffer(name="x", tensor=paddle.arange(end=num_bins).astype(dtype="float32").reshape(1, 1, -1))
241
+ self.base_pitch_embed = paddle.nn.Linear(in_features=1, out_features=in_dims)
242
+ self.conv = paddle.nn.LayerList()
243
+ self.kernel_size = kernel_size
244
+ for idx in range(n_layers):
245
+ in_chans = in_dims if idx == 0 else n_chans
246
+ self.conv.append(
247
+ paddle.nn.Sequential(
248
+ paddle.nn.Conv1D(
249
+ in_channels=in_chans,
250
+ out_channels=n_chans,
251
+ kernel_size=kernel_size,
252
+ stride=1,
253
+ padding=kernel_size // 2,
254
+ ),
255
+ paddle.nn.ReLU(),
256
+ LayerNorm(n_chans, dim=1),
257
+ paddle.nn.Dropout(p=dropout_rate),
258
+ )
259
+ )
260
+ self.linear = paddle.nn.Linear(in_features=n_chans, out_features=num_bins)
261
+ self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096)
262
+ self.pos_embed_alpha = paddle.base.framework.EagerParamBase.from_tensor(
263
+ tensor=paddle.to_tensor(data=[1], dtype="float32")
264
+ )
265
+
266
+ def bins_to_values(self, bins):
267
+ return bins * self.interval + self.vmin
268
+
269
+ def out2pitch(self, logits):
+ # sigmoid turns the raw logits into per-bin weights; a weighted average then picks the bin
+ probs = logits.sigmoid()
+ bins = paddle.sum(x=self.x * probs, axis=2) / paddle.sum(x=probs, axis=2)
272
+ pitch = self.bins_to_values(bins)
273
+ return pitch
274
+
275
+ def forward(self, xs, base):
276
+ """
277
+ :param xs: [B, T, H]
278
+ :param base: [B, T]
279
+ :return: [B, T, N]
280
+ """
281
+ xs = xs + self.base_pitch_embed(base[..., None])
282
+ positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
283
+ xs = xs + positions
284
+ xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
285
+ for f in self.conv:
286
+ xs = f(xs)
287
+ xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1)))
288
+ return self.out2pitch(xs) + base, xs
289
+
290
+
291
+ class RhythmRegulator(paddle.nn.Layer):
292
+ def __init__(self, eps=1e-05):
293
+ super().__init__()
294
+ self.eps = eps
295
+
296
+ def forward(self, ph_dur, ph2word, word_dur):
297
+ """
298
+ Example (no batch dim version):
299
+ 1. ph_dur = [4,2,3,2]
300
+ 2. word_dur = [3,4,2], ph2word = [1,2,2,3]
301
+ 3. word_dur_in = [4,5,2]
302
+ 4. alpha_w = [0.75,0.8,1], alpha_ph = [0.75,0.8,0.8,1]
303
+ 5. ph_dur_out = [3,1.6,2.4,2]
304
+ :param ph_dur: [B, T_ph]
305
+ :param ph2word: [B, T_ph]
306
+ :param word_dur: [B, T_w]
307
+ """
308
+ ph_dur = ph_dur.astype(dtype="float32") * (ph2word > 0)
309
+ word_dur = word_dur.astype(dtype="float32")
310
+ word_dur_in = paddle.zeros(
311
+ shape=[tuple(ph_dur.shape)[0], ph2word.max() + 1], dtype=ph_dur.dtype
312
+ ).put_along_axis(axis=1, indices=ph2word, values=ph_dur, reduce="add")[:, 1:]
313
+ alpha_w = word_dur / word_dur_in.clip(min=self.eps)
314
+ alpha_ph = paddle.take_along_axis(
315
+ arr=paddle.nn.functional.pad(x=alpha_w, pad=[1, 0], pad_from_left_axis=False),
316
+ axis=1,
317
+ indices=ph2word,
318
+ broadcast=False,
319
+ )
320
+ ph_dur_out = ph_dur * alpha_ph
321
+ return ph_dur_out.round().astype(dtype="int64")
322
+
323
+
324
+ class LengthRegulator(paddle.nn.Layer):
325
+ def forward(self, dur, dur_padding=None, alpha=None):
326
+ """
327
+ Example (no batch dim version):
328
+ 1. dur = [2,2,3]
329
+ 2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4]
330
+ 3. token_mask = [[1,1,0,0,0,0,0],
331
+ [0,0,1,1,0,0,0],
332
+ [0,0,0,0,1,1,1]]
333
+ 4. token_idx * token_mask = [[1,1,0,0,0,0,0],
334
+ [0,0,2,2,0,0,0],
335
+ [0,0,0,0,3,3,3]]
336
+ 5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3]
337
+
338
+ :param dur: Batch of durations of each frame (B, T_txt)
339
+ :param dur_padding: Batch of padding of each frame (B, T_txt)
340
+ :param alpha: duration rescale coefficient
341
+ :return:
342
+ mel2ph (B, T_speech)
343
+ """
344
+ assert alpha is None or alpha > 0
345
+ if alpha is not None:
346
+ dur = paddle.round(dur.astype(dtype="float32") * alpha).astype(dtype="int64")
347
+ if dur_padding is not None:
348
+ dur = dur * (1 - dur_padding.astype(dtype="int64"))
349
+ token_idx = paddle.arange(start=1, end=tuple(dur.shape)[1] + 1)[None, :, None].to(dur.place)
350
+ dur_cumsum = paddle.cumsum(x=dur, axis=1)
351
+ # shift cumulative durations right by one (equivalent to padding with a leading zero)
353
+ dur_cumsum_prev = paddle.concat([paddle.zeros_like(dur_cumsum[:, :1]), dur_cumsum[:, :-1]], axis=1)
354
+
355
+ pos_idx = paddle.arange(end=dur.sum(axis=-1).max())[None, None].to(dur.place)
356
+ token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None])
357
+ mel2ph = (token_idx * token_mask.astype(dtype="int64")).sum(axis=1)
358
+ return mel2ph
359
+
360
+
361
+ class StretchRegulator(paddle.nn.Layer):
362
+ def forward(self, mel2ph, dur=None):
363
+ """
364
+ Example (no batch dim version):
365
+ 1. dur = [2,4,3]
366
+ 2. mel2ph = [1,1,2,2,2,2,3,3,3]
367
+ 3. mel2dur = [2,2,4,4,4,4,3,3,3]
368
+ 4. bound_mask = [0,1,0,0,0,1,0,0,1]
369
+ 5. 1 - bound_mask * mel2dur = [1,-1,1,1,1,-3,1,1,-2] => pad => [0,1,-1,1,1,1,-3,1,1]
370
+ 6. stretch_denorm = [0,1,0,1,2,3,0,1,2]
371
+
372
+ :param dur: Batch of durations of each frame (B, T_txt)
373
+ :param mel2ph: Batch of mel2ph (B, T_speech)
374
+ :return:
375
+ stretch (B, T_speech)
376
+ """
377
+ if dur is None:
378
+ dur = mel2ph_to_dur(mel2ph, mel2ph.max())
379
+ dur = paddle.nn.functional.pad(x=dur, pad=[1, 0], value=1, pad_from_left_axis=False)
380
+ mel2dur = paddle.take_along_axis(arr=dur, axis=1, indices=mel2ph, broadcast=False)
381
+ bound_mask = paddle.greater_than(x=mel2ph[:, 1:], y=paddle.to_tensor(mel2ph[:, :-1]))
382
+ bound_mask = paddle.nn.functional.pad(
383
+ x=bound_mask, pad=[0, 1], mode="constant", value=True, pad_from_left_axis=False
384
+ )
385
+ stretch_delta = 1 - bound_mask * mel2dur
386
+ stretch_delta = paddle.nn.functional.pad(
387
+ x=stretch_delta, pad=[1, -1], mode="constant", value=0, pad_from_left_axis=False
388
+ )
389
+ stretch_denorm = paddle.cumsum(x=stretch_delta, axis=1)
390
+ stretch = stretch_denorm / mel2dur
391
+ return stretch * (mel2ph > 0)
392
+
393
+
394
+ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
395
+ B, _ = tuple(mel2ph.shape)
396
+ dur = paddle.zeros(shape=[B, T_txt + 1], dtype=mel2ph.dtype).put_along_axis(
397
+ axis=1, indices=mel2ph, values=paddle.ones_like(x=mel2ph), reduce="add"
398
+ )
399
+ dur = dur[:, 1:]
400
+ if max_dur is not None:
401
+ dur = dur.clip(max=max_dur)
402
+ return dur
403
+
404
+
405
+ class FastSpeech2Encoder(paddle.nn.Layer):
406
+ def __init__(
407
+ self,
408
+ hidden_size,
409
+ num_layers,
410
+ ffn_kernel_size=9,
411
+ ffn_act="gelu",
412
+ dropout=None,
413
+ num_heads=2,
414
+ use_pos_embed=True,
415
+ rel_pos=True,
416
+ ):
417
+ super().__init__()
418
+ self.num_layers = num_layers
419
+ embed_dim = self.hidden_size = hidden_size
420
+ self.dropout = dropout
421
+ self.use_pos_embed = use_pos_embed
422
+ self.layers = paddle.nn.LayerList(
423
+ sublayers=[
424
+ TransformerEncoderLayer(
425
+ self.hidden_size, self.dropout, kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads
426
+ )
427
+ for _ in range(self.num_layers)
428
+ ]
429
+ )
430
+ self.layer_norm = paddle.nn.LayerNorm(normalized_shape=embed_dim)
431
+ self.embed_scale = math.sqrt(hidden_size)
432
+ self.padding_idx = 0
433
+ self.rel_pos = rel_pos
434
+ if self.rel_pos:
435
+ self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0)
436
+ else:
437
+ self.embed_positions = SinusoidalPositionalEmbedding(
438
+ hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS
439
+ )
440
+
441
+ def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None):
442
+ x = self.embed_scale * main_embed
443
+ if extra_embed is not None:
444
+ x = x + extra_embed
445
+ if self.use_pos_embed:
446
+ if self.rel_pos:
447
+ x = self.embed_positions(x)
448
+ else:
449
+ positions = self.embed_positions(~padding_mask)
450
+ x = x + positions
451
+ x = paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)
452
+ return x
453
+
454
+ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_hiddens=False):
455
+ x = self.forward_embedding(main_embed, extra_embed, padding_mask=padding_mask)
456
+ nonpadding_mask_TB = (
457
+ 1
458
+ - padding_mask.transpose(perm=paddle_aux.transpose_aux_func(padding_mask.ndim, 0, 1)).astype(
459
+ dtype="float32"
460
+ )[:, :, None]
461
+ )
462
+ x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 0, 1)) * nonpadding_mask_TB
463
+ hiddens = []
464
+ for layer in self.layers:
465
+ x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
466
+ hiddens.append(x)
467
+ x = self.layer_norm(x) * nonpadding_mask_TB
468
+ if return_hiddens:
469
+ x = paddle.stack(x=hiddens, axis=0)
470
+ x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
471
+ else:
472
+ x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 0, 1))
473
+ return x
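
The docstring walkthroughs above (LengthRegulator and mel2ph_to_dur) can be checked directly; a minimal round-trip sketch with the docstring's values, using only the definitions in this file:

    import paddle
    from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import LengthRegulator, mel2ph_to_dur

    lr = LengthRegulator()
    dur = paddle.to_tensor([[2, 2, 3]], dtype="int64")  # per-token durations in frames (docstring example)
    mel2ph = lr(dur)                                    # -> [[1, 1, 2, 2, 3, 3, 3]]
    dur_back = mel2ph_to_dur(mel2ph, T_txt=3)           # inverse mapping -> [[2, 2, 3]]
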
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/variance_encoder.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
17
+ import paddle
18
+
19
+ from paddlemix.models.diffsinger.utils import paddle_aux
20
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
21
+ NormalInitEmbedding as Embedding,
22
+ )
23
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
24
+ XavierUniformInitLinear as Linear,
25
+ )
26
+ from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
27
+ DurationPredictor,
28
+ FastSpeech2Encoder,
29
+ )
30
+ from paddlemix.models.diffsinger.utils.hparams import hparams
31
+ from paddlemix.models.diffsinger.utils.text_encoder import PAD_INDEX
32
+
33
+
34
+ class FastSpeech2Variance(paddle.nn.Layer):
35
+ def __init__(self, vocab_size):
36
+ super().__init__()
37
+ self.predict_dur = hparams["predict_dur"]
38
+ self.linguistic_mode = "word" if hparams["predict_dur"] else "phoneme"
39
+ self.txt_embed = Embedding(vocab_size, hparams["hidden_size"], PAD_INDEX)
40
+ if self.predict_dur:
41
+ self.onset_embed = Embedding(2, hparams["hidden_size"])
42
+ self.word_dur_embed = Linear(1, hparams["hidden_size"])
43
+ else:
44
+ self.ph_dur_embed = Linear(1, hparams["hidden_size"])
45
+ self.encoder = FastSpeech2Encoder(
46
+ hidden_size=hparams["hidden_size"],
47
+ num_layers=hparams["enc_layers"],
48
+ ffn_kernel_size=hparams["enc_ffn_kernel_size"],
49
+ ffn_act=hparams["ffn_act"],
50
+ dropout=hparams["dropout"],
51
+ num_heads=hparams["num_heads"],
52
+ use_pos_embed=hparams["use_pos_embed"],
53
+ rel_pos=hparams["rel_pos"],
54
+ )
55
+ dur_hparams = hparams["dur_prediction_args"]
56
+ if self.predict_dur:
57
+ self.midi_embed = Embedding(128, hparams["hidden_size"])
58
+ self.dur_predictor = DurationPredictor(
59
+ in_dims=hparams["hidden_size"],
60
+ n_chans=dur_hparams["hidden_size"],
61
+ n_layers=dur_hparams["num_layers"],
62
+ dropout_rate=dur_hparams["dropout"],
63
+ kernel_size=dur_hparams["kernel_size"],
64
+ offset=dur_hparams["log_offset"],
65
+ dur_loss_type=dur_hparams["loss_type"],
66
+ )
67
+
68
+ def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_embed=None, infer=True):
69
+ """
70
+ :param txt_tokens: (train, infer) [B, T_ph]
71
+ :param midi: (train, infer) [B, T_ph]
72
+ :param ph2word: (train, infer) [B, T_ph]
73
+ :param ph_dur: (train, [infer]) [B, T_ph]
74
+ :param word_dur: (infer) [B, T_w]
75
+ :param spk_embed: (train) [B, T_ph, H]
76
+ :param infer: whether to run in inference mode
77
+ :return: encoder_out, ph_dur_pred
78
+ """
79
+ txt_embed = self.txt_embed(txt_tokens)
80
+ if self.linguistic_mode == "word":
81
+ b = tuple(txt_tokens.shape)[0]
82
+ onset = paddle.diff(x=ph2word, axis=1, prepend=paddle.zeros(shape=[b, 1], dtype=ph2word.dtype)) > 0
83
+ onset_embed = self.onset_embed(onset.astype(dtype="int64"))
84
+ if word_dur is None or not infer:
85
+ word_dur = paddle.zeros(shape=[b, ph2word.max() + 1], dtype=ph_dur.dtype).put_along_axis(
86
+ axis=1, indices=ph2word, values=ph_dur, reduce="add"
87
+ )[:, 1:]
88
+ word_dur = paddle.take_along_axis(
89
+ arr=paddle.nn.functional.pad(x=word_dur, pad=[1, 0], value=0, pad_from_left_axis=False),
90
+ axis=1,
91
+ indices=ph2word,
92
+ broadcast=False,
93
+ )
94
+ word_dur_embed = self.word_dur_embed(word_dur.astype(dtype="float32")[:, :, None])
95
+ encoder_out = self.encoder(txt_embed, onset_embed + word_dur_embed, txt_tokens == 0)
96
+ else:
97
+ ph_dur_embed = self.ph_dur_embed(ph_dur.astype(dtype="float32")[:, :, None])
98
+ encoder_out = self.encoder(txt_embed, ph_dur_embed, txt_tokens == 0)
99
+ if self.predict_dur:
100
+ midi_embed = self.midi_embed(midi)
101
+ dur_cond = encoder_out + midi_embed
102
+ if spk_embed is not None:
103
+ dur_cond += spk_embed
104
+ ph_dur_pred = self.dur_predictor(dur_cond, x_masks=txt_tokens == PAD_INDEX, infer=infer)
105
+ return encoder_out, ph_dur_pred
106
+ else:
107
+ return encoder_out, None
108
+
109
+
110
+ class MelodyEncoder(paddle.nn.Layer):
111
+ def __init__(self, enc_hparams: dict):
112
+ super().__init__()
113
+
114
+ def get_hparam(key):
115
+ return enc_hparams.get(key, hparams.get(key))
116
+
117
+ hidden_size = get_hparam("hidden_size")
118
+ self.note_midi_embed = Linear(1, hidden_size)
119
+ self.note_dur_embed = Linear(1, hidden_size)
120
+ self.use_glide_embed = hparams["use_glide_embed"]
121
+ self.glide_embed_scale = hparams["glide_embed_scale"]
122
+ if self.use_glide_embed:
123
+ self.note_glide_embed = Embedding(len(hparams["glide_types"]) + 1, hidden_size, padding_idx=0)
124
+ self.encoder = FastSpeech2Encoder(
125
+ hidden_size=hidden_size,
126
+ num_layers=get_hparam("enc_layers"),
127
+ ffn_kernel_size=get_hparam("enc_ffn_kernel_size"),
128
+ ffn_act=get_hparam("ffn_act"),
129
+ dropout=get_hparam("dropout"),
130
+ num_heads=get_hparam("num_heads"),
131
+ use_pos_embed=get_hparam("use_pos_embed"),
132
+ rel_pos=get_hparam("rel_pos"),
133
+ )
134
+ self.out_proj = Linear(hidden_size, hparams["hidden_size"])
135
+
136
+ def forward(self, note_midi, note_rest, note_dur, glide=None):
137
+ """
138
+ :param note_midi: float32 [B, T_n], -1: padding
139
+ :param note_rest: bool [B, T_n]
140
+ :param note_dur: int64 [B, T_n]
141
+ :param glide: int64 [B, T_n]
142
+ :return: [B, T_n, H]
143
+ """
144
+ midi_embed = self.note_midi_embed(note_midi[:, :, None]) * ~note_rest[:, :, None]
145
+ dur_embed = self.note_dur_embed(note_dur.astype(dtype="float32")[:, :, None])
146
+ ornament_embed = 0
147
+ if self.use_glide_embed:
148
+ ornament_embed += self.note_glide_embed(glide) * self.glide_embed_scale
149
+ encoder_out = self.encoder(midi_embed, dur_embed + ornament_embed, padding_mask=note_midi < 0)
150
+ encoder_out = self.out_proj(encoder_out)
151
+ return encoder_out
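
As a shape reference for MelodyEncoder.forward, a sketch with illustrative values; `melody_encoder` stands for an already-constructed MelodyEncoder, which assumes the global hparams dict carries the keys read in __init__:

    import paddle

    # melody_encoder = MelodyEncoder(enc_hparams={...})    # assumes populated hparams
    note_midi = paddle.full([1, 4], 60.0)                 # float32 [B, T_n]; -1 marks padding
    note_rest = paddle.zeros([1, 4], dtype="bool")        # bool [B, T_n]
    note_dur = paddle.full([1, 4], 12, dtype="int64")     # int64 [B, T_n], in frames
    out = melody_encoder(note_midi, note_rest, note_dur)  # [B, T_n, hparams["hidden_size"]]
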
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/__init__.py ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pathlib
16
+
17
+ import paddle
18
+ import yaml
19
+
20
+ from .nets import CascadedNet
21
+
22
+
23
+ class DotDict(dict):
24
+ def __getattr__(*args):
25
+ val = dict.get(*args)
26
+ return DotDict(val) if type(val) is dict else val
27
+
28
+ __setattr__ = dict.__setitem__
29
+ __delattr__ = dict.__delitem__
30
+
31
+
32
+ def load_sep_model(model_path, device="cpu"):
33
+ model_path = pathlib.Path(model_path)
34
+ config_file = model_path.with_name("config.yaml")
35
+ with open(config_file, "r") as config:
36
+ args = yaml.safe_load(config)
37
+ args = DotDict(args)
38
+ model = CascadedNet(args.n_fft, args.hop_length, args.n_out, args.n_out_lstm, True, is_mono=args.is_mono)
39
+ model.to(device)
40
+ model.set_state_dict(state_dict=paddle.load(path=str(model_path)))
41
+ model.eval()
42
+ return model
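
A call sketch for load_sep_model; the checkpoint path is hypothetical, and a config.yaml is expected next to the weights file:

    import paddle

    model = load_sep_model("checkpoints/hnsep/model.pt", device="cpu")
    waveform = paddle.randn([1, 1, 44100])  # [B, C, T]; C must match the model's is_mono setting
    with paddle.no_grad():
        harmonic = model.predict_from_audio(waveform)  # same [B, C, T] layout as the input
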
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/layers.py ADDED
@@ -0,0 +1,140 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
+ import paddle
19
+
20
+
21
+ def crop_center(h1, h2):
22
+ h1_shape = tuple(h1.shape)
23
+ h2_shape = tuple(h2.shape)
24
+ if h1_shape[3] == h2_shape[3]:
25
+ return h1
26
+ elif h1_shape[3] < h2_shape[3]:
27
+ raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
28
+ s_time = (h1_shape[3] - h2_shape[3]) // 2
29
+ e_time = s_time + h2_shape[3]
30
+ h1 = h1[:, :, :, s_time:e_time]
31
+ return h1
32
+
33
+
34
+ class Conv2DBNActiv(paddle.nn.Layer):
35
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=paddle.nn.ReLU):
36
+ super(Conv2DBNActiv, self).__init__()
37
+ self.conv = paddle.nn.Sequential(
38
+ paddle.nn.Conv2D(
39
+ in_channels=nin,
40
+ out_channels=nout,
41
+ kernel_size=ksize,
42
+ stride=stride,
43
+ padding=pad,
44
+ dilation=dilation,
45
+ bias_attr=False,
46
+ ),
47
+ paddle.nn.BatchNorm2D(num_features=nout),
48
+ activ(),
49
+ )
50
+
51
+ def forward(self, x):
52
+ return self.conv(x)
53
+
54
+
55
+ class Encoder(paddle.nn.Layer):
56
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=paddle.nn.LeakyReLU):
57
+ super(Encoder, self).__init__()
58
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
59
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
60
+
61
+ def forward(self, x):
62
+ h = self.conv1(x)
63
+ h = self.conv2(h)
64
+ return h
65
+
66
+
67
+ class Decoder(paddle.nn.Layer):
68
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=paddle.nn.ReLU, dropout=False):
69
+ super(Decoder, self).__init__()
70
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
+ self.dropout = paddle.nn.Dropout2D(p=0.1) if dropout else None
72
+
73
+ def forward(self, x, skip=None):
74
+ x = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="bilinear", align_corners=True)
75
+ if skip is not None:
76
+ skip = crop_center(skip, x)
77
+ x = paddle.concat(x=[x, skip], axis=1)
78
+ h = self.conv1(x)
79
+ if self.dropout is not None:
80
+ h = self.dropout(h)
81
+ return h
82
+
83
+
84
+ class Mean(paddle.nn.Layer):
85
+ def __init__(self, dim, keepdims=False):
86
+ super(Mean, self).__init__()
87
+ self.dim = dim
88
+ self.keepdims = keepdims
89
+
90
+ def forward(self, x):
91
+ return x.mean(self.dim, keepdims=self.keepdims)
92
+
93
+
94
+ class ASPPModule(paddle.nn.Layer):
95
+ def __init__(self, nin, nout, dilations=(4, 8, 12), activ=paddle.nn.ReLU, dropout=False):
96
+ super(ASPPModule, self).__init__()
97
+ self.conv1 = paddle.nn.Sequential(Mean(dim=-2, keepdims=True), Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ))
98
+ self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
99
+ self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
100
+ self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
101
+ self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
102
+ self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
103
+ self.dropout = paddle.nn.Dropout2D(p=0.1) if dropout else None
104
+
105
+ def forward(self, x):
106
+ _, _, h, w = tuple(x.shape)
107
+ feat1 = self.conv1(x).tile(repeat_times=[1, 1, h, 1])
108
+ feat2 = self.conv2(x)
109
+ feat3 = self.conv3(x)
110
+ feat4 = self.conv4(x)
111
+ feat5 = self.conv5(x)
112
+ out = paddle.concat(x=(feat1, feat2, feat3, feat4, feat5), axis=1)
113
+ out = self.bottleneck(out)
114
+ if self.dropout is not None:
115
+ out = self.dropout(out)
116
+ return out
117
+
118
+
119
+ class LSTMModule(paddle.nn.Layer):
120
+ def __init__(self, nin_conv, nin_lstm, nout_lstm):
121
+ super(LSTMModule, self).__init__()
122
+ self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
123
+ self.lstm = paddle.nn.LSTM(
124
+ input_size=nin_lstm, hidden_size=nout_lstm // 2, time_major=True, direction="bidirect"
125
+ )
126
+ self.dense = paddle.nn.Sequential(
127
+ paddle.nn.Linear(in_features=nout_lstm, out_features=nin_lstm),
128
+ paddle.nn.BatchNorm1D(num_features=nin_lstm),
129
+ paddle.nn.ReLU(),
130
+ )
131
+
132
+ def forward(self, x):
133
+ N, _, nbins, nframes = tuple(x.shape)
134
+ h = self.conv(x)[:, 0]
135
+ h = h.transpose(perm=[2, 0, 1])
136
+ h, _ = self.lstm(h)
137
+ h = self.dense(h.reshape(-1, tuple(h.shape)[-1]))
138
+ h = h.reshape(nframes, N, 1, nbins)
139
+ h = h.transpose(perm=[1, 2, 3, 0])
140
+ return h
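
crop_center trims the time axis (dim 3) of a skip connection symmetrically so it matches the upsampled feature map in Decoder.forward; a small sketch using the definitions above:

    import paddle

    h1 = paddle.zeros([1, 8, 16, 10])  # skip connection, 10 time steps
    h2 = paddle.zeros([1, 8, 16, 6])   # upsampled features, 6 time steps
    out = crop_center(h1, h2)          # keeps the centered 6 steps: shape [1, 8, 16, 6]
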
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/nets.py ADDED
@@ -0,0 +1,185 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
17
+ import paddle
18
+
19
+ from . import layers
20
+
21
+
22
+ class BaseNet(paddle.nn.Layer):
23
+ def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
24
+ super(BaseNet, self).__init__()
25
+ self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1)
26
+ self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1)
27
+ self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1)
28
+ self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1)
29
+ self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1)
30
+ self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
31
+ self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
32
+ self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
33
+ self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
34
+ self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm)
35
+ self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
36
+
37
+ def forward(self, x):
38
+ e1 = self.enc1(x)
39
+ e2 = self.enc2(e1)
40
+ e3 = self.enc3(e2)
41
+ e4 = self.enc4(e3)
42
+ e5 = self.enc5(e4)
43
+ h = self.aspp(e5)
44
+ h = self.dec4(h, e4)
45
+ h = self.dec3(h, e3)
46
+ h = self.dec2(h, e2)
47
+ h = paddle.concat(x=[h, self.lstm_dec2(h)], axis=1)
48
+ h = self.dec1(h, e1)
49
+ return h
50
+
51
+
52
+ class CascadedNet(paddle.nn.Layer):
53
+ def __init__(self, n_fft, hop_length, nout=32, nout_lstm=128, is_complex=False, is_mono=False):
54
+ super(CascadedNet, self).__init__()
55
+ self.n_fft = n_fft
56
+ self.hop_length = hop_length
57
+ self.is_complex = is_complex
58
+ self.is_mono = is_mono
59
+ self.register_buffer(
60
+ name="window",
61
+ tensor=paddle.audio.functional.get_window("hann", n_fft).astype("float32"),
62
+ persistable=False,
63
+ )
64
+ self.max_bin = n_fft // 2
65
+ self.output_bin = n_fft // 2 + 1
66
+ self.nin_lstm = self.max_bin // 2
67
+ self.offset = 64
68
+ nin = 4 if is_complex else 2
69
+ if is_mono:
70
+ nin = nin // 2
71
+ self.stg1_low_band_net = paddle.nn.Sequential(
72
+ BaseNet(nin, nout // 2, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0)
73
+ )
74
+ self.stg1_high_band_net = BaseNet(nin, nout // 4, self.nin_lstm // 2, nout_lstm // 2)
75
+ self.stg2_low_band_net = paddle.nn.Sequential(
76
+ BaseNet(nout // 4 + nin, nout, self.nin_lstm // 2, nout_lstm),
77
+ layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
78
+ )
79
+ self.stg2_high_band_net = BaseNet(nout // 4 + nin, nout // 2, self.nin_lstm // 2, nout_lstm // 2)
80
+ self.stg3_full_band_net = BaseNet(3 * nout // 4 + nin, nout, self.nin_lstm, nout_lstm)
81
+ self.out = paddle.nn.Conv2D(in_channels=nout, out_channels=nin, kernel_size=1, bias_attr=False)
82
+ self.aux_out = paddle.nn.Conv2D(in_channels=3 * nout // 4, out_channels=nin, kernel_size=1, bias_attr=False)
83
+
84
+ def forward(self, x):
85
+ if self.is_complex:
86
+ x = paddle.concat(x=[x.real(), x.imag()], axis=1)
87
+ x = x[:, :, : self.max_bin]
88
+ bandw = tuple(x.shape)[2] // 2
89
+ l1_in = x[:, :, :bandw]
90
+ h1_in = x[:, :, bandw:]
91
+ l1 = self.stg1_low_band_net(l1_in)
92
+ h1 = self.stg1_high_band_net(h1_in)
93
+ aux1 = paddle.concat(x=[l1, h1], axis=2)
94
+ l2_in = paddle.concat(x=[l1_in, l1], axis=1)
95
+ h2_in = paddle.concat(x=[h1_in, h1], axis=1)
96
+ l2 = self.stg2_low_band_net(l2_in)
97
+ h2 = self.stg2_high_band_net(h2_in)
98
+ aux2 = paddle.concat(x=[l2, h2], axis=2)
99
+ f3_in = paddle.concat(x=[x, aux1, aux2], axis=1)
100
+ f3 = self.stg3_full_band_net(f3_in)
101
+ if self.is_complex:
102
+ mask = self.out(f3)
103
+ if self.is_mono:
104
+ mask = paddle.complex(real=mask[:, :1], imag=mask[:, 1:])
105
+ else:
106
+ mask = paddle.complex(real=mask[:, :2], imag=mask[:, 2:])
107
+ mask = self.bounded_mask(mask)
108
+ else:
109
+ mask = paddle.nn.functional.sigmoid(x=self.out(f3))
110
+ mask = paddle.nn.functional.pad(
111
+ x=mask, pad=(0, 0, 0, self.output_bin - tuple(mask.shape)[2]), mode="replicate", pad_from_left_axis=False
112
+ )
113
+ return mask
114
+
115
+ def bounded_mask(self, mask, eps=1e-08):
116
+ mask_mag = paddle.abs(x=mask)
117
+ mask = paddle.nn.functional.tanh(x=mask_mag) * mask / (mask_mag + eps)
118
+ return mask
119
+
120
+ def predict_mask(self, x):
121
+ mask = self.forward(x)
122
+ if self.offset > 0:
123
+ mask = mask[:, :, :, self.offset : -self.offset]
124
+ assert tuple(mask.shape)[3] > 0
125
+ return mask
126
+
127
+ def predict(self, x):
128
+ mask = self.forward(x)
129
+ pred = x * mask
130
+ if self.offset > 0:
131
+ pred = pred[:, :, :, self.offset : -self.offset]
132
+ assert tuple(pred.shape)[3] > 0
133
+ return pred
134
+
135
+ def audio2spec(self, x, use_pad=False):
136
+ B, C, T = tuple(x.shape)
137
+ x = x.reshape(B * C, T)
138
+ if use_pad:
139
+ n_frames = T // self.hop_length + 1
140
+ T_pad = (32 * ((n_frames - 1) // 32 + 1) - 1) * self.hop_length - T
141
+ nl_pad = T_pad // 2 // self.hop_length
142
+ Tl_pad = nl_pad * self.hop_length
143
+ x = paddle.nn.functional.pad(x=x, pad=(Tl_pad, T_pad - Tl_pad), pad_from_left_axis=False)
144
+ spec = paddle.signal.stft(
145
+ x,
146
+ n_fft=self.n_fft,
147
+ hop_length=self.hop_length,
148
+ return_complex=True,
149
+ window=self.window,
150
+ pad_mode="constant",
151
+ )
152
+ spec = spec.reshape(B, C, tuple(spec.shape)[-2], tuple(spec.shape)[-1])
153
+ return spec
154
+
155
+ def spec2audio(self, x):
156
+ B, C, N, T = tuple(x.shape)
157
+ x = x.reshape(-1, N, T)
158
+ x = paddle.signal.istft(x=x, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window)
159
+ x = x.reshape(B, C, -1)
160
+ return x
161
+
162
+ def predict_from_audio(self, x):
163
+ B, C, T = tuple(x.shape)
164
+ x = x.reshape(B * C, T)
165
+ n_frames = T // self.hop_length + 1
166
+ T_pad = (32 * (n_frames // 32 + 1) - 1) * self.hop_length - T
167
+ nl_pad = T_pad // 2 // self.hop_length
168
+ Tl_pad = nl_pad * self.hop_length
169
+ x = paddle.nn.functional.pad(x=x, pad=(Tl_pad, T_pad - Tl_pad), pad_from_left_axis=False)
170
+ spec = paddle.signal.stft(
171
+ x,
172
+ n_fft=self.n_fft,
173
+ hop_length=self.hop_length,
174
+ return_complex=True,
175
+ window=self.window,
176
+ pad_mode="constant",
177
+ )
178
+ spec = spec.reshape(B, C, tuple(spec.shape)[-2], tuple(spec.shape)[-1])
179
+ mask = self.forward(spec)
180
+ spec_pred = spec * mask
181
+ spec_pred = spec_pred.reshape(B * C, tuple(spec.shape)[-2], tuple(spec.shape)[-1])
182
+ x_pred = paddle.signal.istft(x=spec_pred, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window)
183
+ x_pred = x_pred[:, Tl_pad : Tl_pad + T]
184
+ x_pred = x_pred.reshape(B, C, T)
185
+ return x_pred
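
An end-to-end sketch of the separator; the STFT sizes here are illustrative, not the shipped configuration (which load_sep_model reads from config.yaml):

    import paddle

    net = CascadedNet(n_fft=2048, hop_length=512, nout=32, nout_lstm=128, is_complex=True, is_mono=True)
    net.eval()
    audio = paddle.randn([1, 1, 44100])       # [B, C, T] mono waveform
    with paddle.no_grad():
        pred = net.predict_from_audio(audio)  # harmonic estimate, same [B, C, T] shape
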
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/env.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ class AttrDict(dict):
17
+ """A dictionary with attribute-style access. It maps attribute access to
18
+ the real dictionary."""
19
+
20
+ def __init__(self, *args, **kwargs):
21
+ dict.__init__(self, *args, **kwargs)
22
+
23
+ def __getstate__(self):
24
+ return self.__dict__.items()
25
+
26
+ def __setstate__(self, items):
27
+ for key, val in items:
28
+ self.__dict__[key] = val
29
+
30
+ def __repr__(self):
31
+ return "%s(%s)" % (self.__class__.__name__, dict.__repr__(self))
32
+
33
+ def __setitem__(self, key, value):
34
+ return super(AttrDict, self).__setitem__(key, value)
35
+
36
+ def __getitem__(self, name):
37
+ return super(AttrDict, self).__getitem__(name)
38
+
39
+ def __delitem__(self, name):
40
+ return super(AttrDict, self).__delitem__(name)
41
+
42
+ __getattr__ = __getitem__
43
+ __setattr__ = __setitem__
44
+
45
+ def copy(self):
46
+ return AttrDict(self)
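
AttrDict only mirrors attribute access onto the underlying dict, which is how the vocoder's config.json is consumed in models.py; for example:

    h = AttrDict({"sampling_rate": 44100, "num_mels": 128})
    assert h.sampling_rate == h["sampling_rate"] == 44100
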
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/models.py ADDED
@@ -0,0 +1,380 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import pathlib
18
+ import numpy as np
19
+ import paddle
20
+ import paddle.nn.functional as F
21
+
22
+ from paddlemix.models.diffsinger.utils import paddle_aux
23
+ from paddle.nn.utils import remove_weight_norm
24
+
25
+ from .env import AttrDict
26
+ from .utils import get_padding, init_weights
27
+
28
+ LRELU_SLOPE = 0.1
29
+
30
+
31
+ def load_model(model_path: pathlib.Path):
32
+ config_file = model_path.with_name("config.json")
33
+ with open(config_file) as f:
34
+ data = f.read()
35
+ json_config = json.loads(data)
36
+ h = AttrDict(json_config)
37
+ generator = Generator(h)
38
+ cp_dict = paddle.load(path=str(model_path))
39
+ generator.set_state_dict(state_dict=cp_dict["generator"])
40
+ generator.eval()
41
+ generator.remove_weight_norm()
42
+ del cp_dict
43
+ return generator, h
44
+
45
+
46
+ class ResBlock1(paddle.nn.Layer):
47
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
48
+ super(ResBlock1, self).__init__()
49
+ self.h = h
50
+ self.convs1 = paddle.nn.LayerList(
51
+ sublayers=[
52
+ paddle.nn.utils.weight_norm(
53
+ layer=paddle.nn.Conv1D(
54
+ in_channels=channels,
55
+ out_channels=channels,
56
+ kernel_size=kernel_size,
57
+ stride=1,
58
+ dilation=dilation[0],
59
+ padding=get_padding(kernel_size, dilation[0]),
60
+ )
61
+ ),
62
+ paddle.nn.utils.weight_norm(
63
+ layer=paddle.nn.Conv1D(
64
+ in_channels=channels,
65
+ out_channels=channels,
66
+ kernel_size=kernel_size,
67
+ stride=1,
68
+ dilation=dilation[1],
69
+ padding=get_padding(kernel_size, dilation[1]),
70
+ )
71
+ ),
72
+ paddle.nn.utils.weight_norm(
73
+ layer=paddle.nn.Conv1D(
74
+ in_channels=channels,
75
+ out_channels=channels,
76
+ kernel_size=kernel_size,
77
+ stride=1,
78
+ dilation=dilation[2],
79
+ padding=get_padding(kernel_size, dilation[2]),
80
+ )
81
+ ),
82
+ ]
83
+ )
84
+ self.convs1.apply(init_weights)
85
+ self.convs2 = paddle.nn.LayerList(
86
+ sublayers=[
87
+ paddle.nn.utils.weight_norm(
88
+ layer=paddle.nn.Conv1D(
89
+ in_channels=channels,
90
+ out_channels=channels,
91
+ kernel_size=kernel_size,
92
+ stride=1,
93
+ dilation=1,
94
+ padding=get_padding(kernel_size, 1),
95
+ )
96
+ ),
97
+ paddle.nn.utils.weight_norm(
98
+ layer=paddle.nn.Conv1D(
99
+ in_channels=channels,
100
+ out_channels=channels,
101
+ kernel_size=kernel_size,
102
+ stride=1,
103
+ dilation=1,
104
+ padding=get_padding(kernel_size, 1),
105
+ )
106
+ ),
107
+ paddle.nn.utils.weight_norm(
108
+ layer=paddle.nn.Conv1D(
109
+ in_channels=channels,
110
+ out_channels=channels,
111
+ kernel_size=kernel_size,
112
+ stride=1,
113
+ dilation=1,
114
+ padding=get_padding(kernel_size, 1),
115
+ )
116
+ ),
117
+ ]
118
+ )
119
+ self.convs2.apply(init_weights)
120
+
121
+ def forward(self, x):
122
+ for c1, c2 in zip(self.convs1, self.convs2):
123
+ xt = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
124
+ xt = c1(xt)
125
+ xt = paddle.nn.functional.leaky_relu(x=xt, negative_slope=LRELU_SLOPE)
126
+ xt = c2(xt)
127
+ x = xt + x
128
+ return x
129
+
130
+ def remove_weight_norm(self):
131
+ for l in self.convs1:
132
+ paddle.nn.utils.remove_weight_norm(layer=l)
133
+ for l in self.convs2:
134
+ paddle.nn.utils.remove_weight_norm(layer=l)
135
+
136
+
137
+ class ResBlock2(paddle.nn.Layer):
138
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
139
+ super(ResBlock2, self).__init__()
140
+ self.h = h
141
+ self.convs = paddle.nn.LayerList(
142
+ sublayers=[
143
+ paddle.nn.utils.weight_norm(
144
+ layer=paddle.nn.Conv1D(
145
+ in_channels=channels,
146
+ out_channels=channels,
147
+ kernel_size=kernel_size,
148
+ stride=1,
149
+ dilation=dilation[0],
150
+ padding=get_padding(kernel_size, dilation[0]),
151
+ )
152
+ ),
153
+ paddle.nn.utils.weight_norm(
154
+ layer=paddle.nn.Conv1D(
155
+ in_channels=channels,
156
+ out_channels=channels,
157
+ kernel_size=kernel_size,
158
+ stride=1,
159
+ dilation=dilation[1],
160
+ padding=get_padding(kernel_size, dilation[1]),
161
+ )
162
+ ),
163
+ ]
164
+ )
165
+ self.convs.apply(init_weights)
166
+
167
+ def forward(self, x):
168
+ for c in self.convs:
169
+ xt = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
170
+ xt = c(xt)
171
+ x = xt + x
172
+ return x
173
+
174
+ def remove_weight_norm(self):
175
+ for l in self.convs:
176
+ paddle.nn.utils.remove_weight_norm(layer=l)
177
+
178
+
179
+ class SineGen(paddle.nn.Layer):
180
+ """Definition of sine generator
181
+ SineGen(samp_rate, harmonic_num = 0,
182
+ sine_amp = 0.1, noise_std = 0.003,
183
+ voiced_threshold = 0,
184
+ flag_for_pulse=False)
185
+ samp_rate: sampling rate in Hz
186
+ harmonic_num: number of harmonic overtones (default 0)
187
+ sine_amp: amplitude of sine-waveform (default 0.1)
188
+ noise_std: std of Gaussian noise (default 0.003)
189
+ voiced_threshold: F0 threshold for U/V classification (default 0)
190
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
191
+ Note: when flag_for_pulse is True, the first time step of a voiced
192
+ segment is always sin(np.pi) or cos(0)
193
+ """
194
+
195
+ def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
196
+ super(SineGen, self).__init__()
197
+ self.sine_amp = sine_amp
198
+ self.noise_std = noise_std
199
+ self.harmonic_num = harmonic_num
200
+ self.dim = self.harmonic_num + 1
201
+ self.sampling_rate = samp_rate
202
+ self.voiced_threshold = voiced_threshold
203
+
204
+ def _f02uv(self, f0):
205
+ uv = paddle.ones_like(x=f0)
206
+ uv = uv * (f0 > self.voiced_threshold)
207
+ return uv
208
+
209
+ def _f02sine(self, f0, upp):
210
+ """f0: (batchsize, length, dim)
211
+ where dim indicates fundamental tone and overtones
212
+ """
213
+ # per-sample phase increments within each frame
+ rad = f0 / self.sampling_rate * paddle.arange(start=1, end=upp + 1, dtype="float32")
215
+ rad2 = (
216
+ paddle.mod(
217
+ x=rad[..., -1:].astype(dtype="float32") + 0.5,
218
+ y=paddle.to_tensor(1.0, dtype=(rad[..., -1:].astype(dtype="float32") + 0.5).dtype),
219
+ )
220
+ - 0.5
221
+ )
222
+ rad_acc = rad2.cumsum(axis=1).mod(y=paddle.to_tensor(1.0)).to(f0)
223
+ # equivalent implementation of F.pad(rad_acc, (0, 0, 1, -1)):
+ # shift the accumulated phase right by one frame, inserting a zero at the front
226
+ rad_shifted = paddle.concat([paddle.zeros_like(rad_acc[:, :1]), rad_acc[:, :-1]], axis=1)
227
+ rad += rad_shifted
228
+ rad = rad.reshape(tuple(f0.shape)[0], -1, 1)
229
+ # scale the fundamental phase by the harmonic index (1..dim) for each overtone
+ rad = paddle.multiply(
+ x=rad,
+ y=paddle.to_tensor(paddle.arange(start=1, end=self.dim + 1), dtype="float32").reshape(1, 1, -1),
+ )
237
+
238
+ rand_ini = paddle.rand(shape=[1, 1, self.dim])
239
+ rand_ini[..., 0] = 0
240
+ rad += rand_ini
241
+ sines = paddle.sin(x=2 * np.pi * rad)
242
+ return sines
243
+
244
+ @paddle.no_grad()
245
+ def forward(self, f0, upp):
246
+ """sine_tensor, uv = forward(f0)
247
+ input F0: tensor(batchsize=1, length, dim=1)
248
+ f0 for unvoiced steps should be 0
249
+ output sine_tensor: tensor(batchsize=1, length, dim)
250
+ output uv: tensor(batchsize=1, length, 1)
251
+ """
252
+ f0 = f0.unsqueeze(axis=-1)
253
+ sine_waves = self._f02sine(f0, upp) * self.sine_amp
254
+ uv = (f0 > self.voiced_threshold).astype(dtype="float32")
255
+ uv = F.interpolate(uv.transpose([0, 2, 1]), scale_factor=upp, mode="linear", data_format="NCW").transpose(
256
+ [0, 2, 1]
257
+ )
258
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
259
+ noise = noise_amp * paddle.randn(shape=sine_waves.shape, dtype=sine_waves.dtype)
260
+ sine_waves = sine_waves * uv + noise
261
+ return sine_waves
262
+
263
+
264
+ class SourceModuleHnNSF(paddle.nn.Layer):
265
+ """SourceModule for hn-nsf
266
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
267
+ add_noise_std=0.003, voiced_threshod=0)
268
+ sampling_rate: sampling_rate in Hz
269
+ harmonic_num: number of harmonic above F0 (default: 0)
270
+ sine_amp: amplitude of sine source signal (default: 0.1)
271
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
272
+ note that amplitude of noise in unvoiced is decided
273
+ by sine_amp
274
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
275
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
276
+ F0_sampled (batchsize, length, 1)
277
+ Sine_source (batchsize, length, 1)
278
+ noise_source (batchsize, length 1)
279
+ uv (batchsize, length, 1)
280
+ """
281
+
282
+ def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
283
+ super(SourceModuleHnNSF, self).__init__()
284
+ self.sine_amp = sine_amp
285
+ self.noise_std = add_noise_std
286
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
287
+ self.l_linear = paddle.nn.Linear(in_features=harmonic_num + 1, out_features=1)
288
+ self.l_tanh = paddle.nn.Tanh()
289
+
290
+ def forward(self, x, upp):
291
+ sine_wavs = self.l_sin_gen(x, upp)
292
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
293
+ return sine_merge
294
+
295
+
296
+ class Generator(paddle.nn.Layer):
297
+ def __init__(self, h):
298
+ super(Generator, self).__init__()
299
+ self.h = h
300
+ self.num_kernels = len(h.resblock_kernel_sizes)
301
+ self.num_upsamples = len(h.upsample_rates)
302
+ self.m_source = SourceModuleHnNSF(sampling_rate=h.sampling_rate, harmonic_num=8)
303
+ self.noise_convs = paddle.nn.LayerList()
304
+ self.conv_pre = paddle.nn.utils.weight_norm(
305
+ layer=paddle.nn.Conv1D(
306
+ in_channels=h.num_mels, out_channels=h.upsample_initial_channel, kernel_size=7, stride=1, padding=3
307
+ )
308
+ )
309
+ resblock = ResBlock1 if h.resblock == "1" else ResBlock2
310
+ self.ups = paddle.nn.LayerList()
311
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
312
+ c_cur = h.upsample_initial_channel // 2 ** (i + 1)
313
+ self.ups.append(
314
+ paddle.nn.utils.weight_norm(
315
+ layer=paddle.nn.Conv1DTranspose(
316
+ in_channels=h.upsample_initial_channel // 2**i,
317
+ out_channels=h.upsample_initial_channel // 2 ** (i + 1),
318
+ kernel_size=k,
319
+ stride=u,
320
+ padding=(k - u) // 2,
321
+ )
322
+ )
323
+ )
324
+ if i + 1 < len(h.upsample_rates):
325
+ stride_f0 = int(np.prod(h.upsample_rates[i + 1 :]))
326
+ self.noise_convs.append(
327
+ paddle.nn.Conv1D(
328
+ in_channels=1,
329
+ out_channels=c_cur,
330
+ kernel_size=stride_f0 * 2,
331
+ stride=stride_f0,
332
+ padding=stride_f0 // 2,
333
+ )
334
+ )
335
+ else:
336
+ self.noise_convs.append(paddle.nn.Conv1D(in_channels=1, out_channels=c_cur, kernel_size=1))
337
+ self.resblocks = paddle.nn.LayerList()
338
+ ch = h.upsample_initial_channel
339
+ for i in range(len(self.ups)):
340
+ ch //= 2
341
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
342
+ self.resblocks.append(resblock(h, ch, k, d))
343
+ self.conv_post = paddle.nn.utils.weight_norm(
344
+ layer=paddle.nn.Conv1D(in_channels=ch, out_channels=1, kernel_size=7, stride=1, padding=3)
345
+ )
346
+ self.ups.apply(init_weights)
347
+ self.conv_post.apply(init_weights)
348
+ self.upp = int(np.prod(h.upsample_rates))
349
+
350
+ def forward(self, x, f0):
351
+ # run the harmonic source module once, then move time to the last axis
+ har_source = self.m_source(f0, self.upp)
+ har_source = har_source.transpose(perm=paddle_aux.transpose_aux_func(har_source.ndim, 1, 2))
355
+ x = self.conv_pre(x)
356
+ for i in range(self.num_upsamples):
357
+ x = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
358
+ x = self.ups[i](x)
359
+ x_source = self.noise_convs[i](har_source)
360
+ x = x + x_source
361
+ xs = None
362
+ for j in range(self.num_kernels):
363
+ if xs is None:
364
+ xs = self.resblocks[i * self.num_kernels + j](x)
365
+ else:
366
+ xs += self.resblocks[i * self.num_kernels + j](x)
367
+ x = xs / self.num_kernels
368
+ x = paddle.nn.functional.leaky_relu(x=x)
369
+ x = self.conv_post(x)
370
+ x = paddle.nn.functional.tanh(x=x)
371
+ return x
372
+
373
+ def remove_weight_norm(self):
374
+ print("Removing weight norm...")
375
+ for l in self.ups:
376
+ remove_weight_norm(l)
377
+ for l in self.resblocks:
378
+ l.remove_weight_norm()
379
+ remove_weight_norm(self.conv_pre)
380
+ remove_weight_norm(self.conv_post)
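
A driving sketch for the vocoder; the checkpoint path is hypothetical, and load_model expects a config.json next to the weights:

    import pathlib
    import paddle

    generator, h = load_model(pathlib.Path("checkpoints/nsf_hifigan/model.ckpt"))
    mel = paddle.randn([1, h.num_mels, 100])  # [B, num_mels, T_frames]
    f0 = paddle.full([1, 100], 220.0)         # frame-level F0 in Hz; 0 marks unvoiced frames
    with paddle.no_grad():
        wav = generator(mel, f0)              # [B, 1, T_frames * prod(h.upsample_rates)]
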
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/nvSTFT.py ADDED
@@ -0,0 +1,104 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import paddle
18
+
19
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
20
+ import numpy as np
21
+ from librosa.filters import mel as librosa_mel_fn
22
+
23
+
24
+ def dynamic_range_compression(x, C=1, clip_val=1e-05):
25
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
26
+
27
+
28
+ def dynamic_range_decompression(x, C=1):
29
+ return np.exp(x) / C
30
+
31
+
32
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-05):
33
+ return paddle.log(x=paddle.clip(x=x, min=clip_val) * C)
34
+
35
+
36
+ def dynamic_range_decompression_torch(x, C=1):
37
+ return paddle.exp(x=x) / C
38
+
39
+
40
+ class STFT:
41
+ def __init__(
42
+ self,
43
+ sr=22050,
44
+ n_mels=80,
45
+ n_fft=1024,
46
+ win_size=1024,
47
+ hop_length=256,
48
+ fmin=20,
49
+ fmax=11025,
50
+ clip_val=1e-05,
51
+ device=None,
52
+ ):
53
+ self.target_sr = sr
54
+ self.n_mels = n_mels
55
+ self.n_fft = n_fft
56
+ self.win_size = win_size
57
+ self.hop_length = hop_length
58
+ self.fmin = fmin
59
+ self.fmax = fmax
60
+ self.clip_val = clip_val
61
+ if device is None:
62
+ device = str("cuda" if paddle.device.cuda.device_count() >= 1 else "cpu").replace("cuda", "gpu")
63
+ self.device = device
64
+ mel_basis = librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
65
+ self.mel_basis = paddle.to_tensor(data=mel_basis).astype(dtype="float32").to(device)
66
+
67
+ def get_mel(self, y, keyshift=0, speed=1, center=False):
68
+ factor = 2 ** (keyshift / 12)
69
+ n_fft_new = int(np.round(self.n_fft * factor))
70
+ win_size_new = int(np.round(self.win_size * factor))
71
+ hop_length_new = int(np.round(self.hop_length * speed))
72
+ if paddle.min(x=y) < -1.0:
73
+ print("min value is ", paddle.min(x=y))
74
+ if paddle.max(x=y) > 1.0:
75
+ print("max value is ", paddle.max(x=y))
76
+ window = paddle.audio.functional.get_window("hann", win_size_new).astype("float32").to(self.device)
77
+ y = paddle.nn.functional.pad(
78
+ x=y.unsqueeze(axis=1),
79
+ pad=((win_size_new - hop_length_new) // 2, (win_size_new - hop_length_new + 1) // 2),
80
+ mode="reflect",
81
+ pad_from_left_axis=False,
82
+ )
83
+ y = y.squeeze(axis=1)
84
+ spec = paddle.signal.stft(
85
+ y,
86
+ n_fft_new,
87
+ hop_length=hop_length_new,
88
+ win_length=win_size_new,
89
+ window=window,
90
+ center=center,
91
+ pad_mode="reflect",
92
+ normalized=False,
93
+ onesided=True,
94
+ ).abs()
95
+
96
+ if keyshift != 0:
97
+ size = self.n_fft // 2 + 1
98
+ resize = spec.shape[1]
99
+ if resize < size:
100
+ spec = paddle.nn.functional.pad(x=spec, pad=(0, 0, 0, size - resize), pad_from_left_axis=False)
101
+ spec = spec[:, :size, :] * self.win_size / win_size_new
102
+ spec = paddle.matmul(x=self.mel_basis, y=spec)
103
+ spec = dynamic_range_compression_torch(spec, clip_val=self.clip_val)
104
+ return spec
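
A usage sketch for the mel extractor; the parameter values are illustrative:

    import paddle

    stft = STFT(sr=44100, n_mels=128, n_fft=2048, win_size=2048, hop_length=512, fmin=40, fmax=16000)
    y = paddle.randn([1, 44100])  # [B, T] waveform in [-1, 1]
    mel = stft.get_mel(y)         # [B, n_mels, T_frames], log-compressed
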
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/utils.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import matplotlib
16
+
17
+ matplotlib.use("Agg")
18
+
19
+
20
+ def init_weights(m, mean=0.0, std=0.01):
21
+ classname = m.__class__.__name__
22
+ if classname.find("Conv") != -1:
23
+ m.weight.data.normal_(mean, std)
24
+
25
+
26
+ def get_padding(kernel_size, dilation=1):
27
+ return int((kernel_size * dilation - dilation) / 2)
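
get_padding computes "same" padding for a stride-1 dilated convolution; two worked values:

    get_padding(3)               # (3 * 1 - 1) // 2 = 1
    get_padding(7, dilation=3)   # (7 * 3 - 3) // 2 = 9
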
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/pm.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from paddlemix.models.diffsinger.basics.base_pe import BasePE
+ from paddlemix.models.diffsinger.utils.binarizer_utils import get_pitch_parselmouth
+
+
+ class ParselmouthPE(BasePE):
+     def get_pitch(self, waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False):
+         return get_pitch_parselmouth(
+             waveform,
+             samplerate=samplerate,
+             length=length,
+             hop_size=hop_size,
+             f0_min=f0_min,
+             f0_max=f0_max,
+             speed=speed,
+             interp_uv=interp_uv,
+         )
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/__init__.py ADDED
@@ -0,0 +1,19 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from .constants import *
+ from .inference import RMVPE
+ from .model import E2E0
+ from .spec import MelSpectrogram
+ from .utils import to_local_average_f0, to_viterbi_f0
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/constants.py ADDED
@@ -0,0 +1,21 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ SAMPLE_RATE = 16000
+ N_CLASS = 360
+ N_MELS = 128
+ MEL_FMIN = 30
+ MEL_FMAX = 8000
+ WINDOW_LENGTH = 1024
+ CONST = 1997.379408437619
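
These constants define a 360-class pitch grid in 20-cent steps offset by CONST cents, as used by to_local_average_f0 later in this diff; converting a class index to frequency looks like this (a sketch using the same formula):

# class 0 -> 1997.38 cents -> 10 * 2**(cents / 1200) ≈ 31.7 Hz
cents = 0 * 20 + 1997.379408437619
f0 = 10 * 2 ** (cents / 1200)  # ≈ 31.7 Hz at the bottom of the grid
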
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/deepunet.py ADDED
@@ -0,0 +1,194 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import paddle
+
+ from .constants import N_MELS
+
+
+ class ConvBlockRes(paddle.nn.Layer):
+     def __init__(self, in_channels, out_channels, momentum=0.01):
+         super(ConvBlockRes, self).__init__()
+         self.conv = paddle.nn.Sequential(
+             paddle.nn.Conv2D(
+                 in_channels=in_channels,
+                 out_channels=out_channels,
+                 kernel_size=(3, 3),
+                 stride=(1, 1),
+                 padding=(1, 1),
+                 bias_attr=False,
+             ),
+             paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum),
+             paddle.nn.ReLU(),
+             paddle.nn.Conv2D(
+                 in_channels=out_channels,
+                 out_channels=out_channels,
+                 kernel_size=(3, 3),
+                 stride=(1, 1),
+                 padding=(1, 1),
+                 bias_attr=False,
+             ),
+             paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum),
+             paddle.nn.ReLU(),
+         )
+         if in_channels != out_channels:
+             self.shortcut = paddle.nn.Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1))
+             self.is_shortcut = True
+         else:
+             self.is_shortcut = False
+
+     def forward(self, x):
+         if self.is_shortcut:
+             return self.conv(x) + self.shortcut(x)
+         else:
+             return self.conv(x) + x
+
+
+ class ResEncoderBlock(paddle.nn.Layer):
+     def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
+         super(ResEncoderBlock, self).__init__()
+         self.n_blocks = n_blocks
+         self.conv = paddle.nn.LayerList()
+         self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
+         for i in range(n_blocks - 1):
+             self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
+         self.kernel_size = kernel_size
+         if self.kernel_size is not None:
+             self.pool = paddle.nn.AvgPool2D(kernel_size=kernel_size, exclusive=False)
+
+     def forward(self, x):
+         for i in range(self.n_blocks):
+             x = self.conv[i](x)
+         if self.kernel_size is not None:
+             return x, self.pool(x)
+         else:
+             return x
+
+
+ class ResDecoderBlock(paddle.nn.Layer):
+     def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
+         super(ResDecoderBlock, self).__init__()
+         out_padding = (0, 1) if stride == (1, 2) else (1, 1)
+         self.n_blocks = n_blocks
+         self.conv1 = paddle.nn.Sequential(
+             paddle.nn.Conv2DTranspose(
+                 in_channels=in_channels,
+                 out_channels=out_channels,
+                 kernel_size=(3, 3),
+                 stride=stride,
+                 padding=(1, 1),
+                 output_padding=out_padding,
+                 bias_attr=False,
+             ),
+             paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum),
+             paddle.nn.ReLU(),
+         )
+         self.conv2 = paddle.nn.LayerList()
+         self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
+         for i in range(n_blocks - 1):
+             self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
+
+     def forward(self, x, concat_tensor):
+         x = self.conv1(x)
+         x = paddle.concat(x=(x, concat_tensor), axis=1)
+         for i in range(self.n_blocks):
+             x = self.conv2[i](x)
+         return x
+
+
+ class Encoder(paddle.nn.Layer):
+     def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
+         super(Encoder, self).__init__()
+         self.n_encoders = n_encoders
+         self.bn = paddle.nn.BatchNorm2D(num_features=in_channels, momentum=1 - momentum)
+         self.layers = paddle.nn.LayerList()
+         self.latent_channels = []
+         for i in range(self.n_encoders):
+             self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
+             self.latent_channels.append([out_channels, in_size])
+             in_channels = out_channels
+             out_channels *= 2
+             in_size //= 2
+         self.out_size = in_size
+         self.out_channel = out_channels
+
+     def forward(self, x):
+         concat_tensors = []
+         x = self.bn(x)
+         for i in range(self.n_encoders):
+             _, x = self.layers[i](x)
+             concat_tensors.append(_)
+         return x, concat_tensors
+
+
+ class Intermediate(paddle.nn.Layer):
+     def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
+         super(Intermediate, self).__init__()
+         self.n_inters = n_inters
+         self.layers = paddle.nn.LayerList()
+         self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
+         for i in range(self.n_inters - 1):
+             self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))
+
+     def forward(self, x):
+         for i in range(self.n_inters):
+             x = self.layers[i](x)
+         return x
+
+
+ class Decoder(paddle.nn.Layer):
+     def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
+         super(Decoder, self).__init__()
+         self.layers = paddle.nn.LayerList()
+         self.n_decoders = n_decoders
+         for i in range(self.n_decoders):
+             out_channels = in_channels // 2
+             self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
+             in_channels = out_channels
+
+     def forward(self, x, concat_tensors):
+         for i in range(self.n_decoders):
+             x = self.layers[i](x, concat_tensors[-1 - i])
+         return x
+
+
+ class TimbreFilter(paddle.nn.Layer):
+     def __init__(self, latent_rep_channels):
+         super(TimbreFilter, self).__init__()
+         self.layers = paddle.nn.LayerList()
+         for latent_rep in latent_rep_channels:
+             self.layers.append(ConvBlockRes(latent_rep[0], latent_rep[0]))
+
+     def forward(self, x_tensors):
+         out_tensors = []
+         for i, layer in enumerate(self.layers):
+             out_tensors.append(layer(x_tensors[i]))
+         return out_tensors
+
+
+ class DeepUnet0(paddle.nn.Layer):
+     def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
+         super(DeepUnet0, self).__init__()
+         self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels)
+         self.intermediate = Intermediate(
+             self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks
+         )
+         self.tf = TimbreFilter(self.encoder.latent_channels)
+         self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)
+
+     def forward(self, x):
+         x, concat_tensors = self.encoder(x)
+         x = self.intermediate(x)
+         x = self.decoder(x, concat_tensors)
+         return x
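
A rough shape walk-through of DeepUnet0 under the defaults used by E2E0(4, 1, (2, 2)) later in this diff; the sketch below is inferred from the constructors above, with an assumed frame count:

# Each of the 5 encoder stages doubles channels (1 -> 16 -> 32 -> 64 -> 128 -> 256)
# and halves (frames, mel) via AvgPool2D((2, 2)); the decoder mirrors this back.
import paddle

net = DeepUnet0(kernel_size=(2, 2), n_blocks=4)
x = paddle.randn([1, 1, 64, 128])  # (batch, 1, frames, N_MELS); 64 frames assumed
y = net(x)                         # expected (1, 16, 64, 128): resolution restored
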
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/inference.py ADDED
@@ -0,0 +1,80 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import numpy as np
+ import paddle
+ from torchaudio.transforms import Resample
+
+ from paddlemix.models.diffsinger.basics.base_pe import BasePE
+ from paddlemix.models.diffsinger.utils.infer_utils import resample_align_curve
+ from paddlemix.models.diffsinger.utils.pitch_utils import interp_f0
+
+ from .constants import *
+ from .model import E2E0
+ from .spec import MelSpectrogram
+ from .utils import to_local_average_f0, to_viterbi_f0
+
+
+ class RMVPE(BasePE):
+     def __init__(self, model_path, hop_length=160):
+         self.resample_kernel = {}
+         self.device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
+         self.model = E2E0(4, 1, (2, 2)).eval().to(self.device)
+         ckpt = paddle.load(path=str(model_path))
+         self.model.set_state_dict(state_dict=ckpt["model"])
+         self.mel_extractor = MelSpectrogram(
+             N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX
+         ).to(self.device)
+
+     @paddle.no_grad()
+     def mel2hidden(self, mel):
+         n_frames = tuple(mel.shape)[-1]
+         mel = paddle.nn.functional.pad(
+             x=mel, pad=(0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant", pad_from_left_axis=False
+         )
+         hidden = self.model(mel)
+         return hidden[:, :n_frames]
+
+     def decode(self, hidden, thred=0.03, use_viterbi=False):
+         if use_viterbi:
+             f0 = to_viterbi_f0(hidden, thred=thred)
+         else:
+             f0 = to_local_average_f0(hidden, thred=thred)
+         return f0
+
+     def infer_from_audio(self, audio, sample_rate=16000, thred=0.03, use_viterbi=False):
+         audio = paddle.to_tensor(data=audio).astype(dtype="float32").unsqueeze(axis=0).to(self.device)
+         if sample_rate == 16000:
+             audio_res = audio
+         else:
+             key_str = str(sample_rate)
+             if key_str not in self.resample_kernel:
+                 self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
+                 self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.device)
+             audio_res = self.resample_kernel[key_str](audio)
+         mel = self.mel_extractor(audio_res, center=True)
+         hidden = self.mel2hidden(mel)
+         f0 = self.decode(hidden, thred=thred, use_viterbi=use_viterbi)
+         return f0
+
+     def get_pitch(self, waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False):
+         f0 = self.infer_from_audio(waveform, sample_rate=samplerate)
+         uv = f0 == 0
+         f0, uv = interp_f0(f0, uv)
+         hop_size = int(np.round(hop_size * speed))
+         time_step = hop_size / samplerate
+         f0_res = resample_align_curve(f0, 0.01, time_step, length)
+         uv_res = resample_align_curve(uv.astype(np.float32), 0.01, time_step, length) > 0.5
+         if not interp_uv:
+             f0_res[uv_res] = 0
+         return f0_res, uv_res
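
A minimal usage sketch for the class above; the checkpoint path and audio are placeholders (rmvpe.ckpt is a hypothetical filename), so this only illustrates the call shape:

import numpy as np

pe = RMVPE("checkpoints/rmvpe.ckpt")          # hypothetical checkpoint path
audio = np.zeros(16000, dtype=np.float32)     # 1 s of silence at 16 kHz
f0, uv = pe.get_pitch(audio, samplerate=16000, length=100, hop_size=160)
# 160 / 16000 = 10 ms per frame -> 100 frames for 1 s of audio
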
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/model.py ADDED
@@ -0,0 +1,54 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import paddle
+
+ from paddlemix.models.diffsinger.utils import paddle_aux
+
+ from .constants import *
+ from .deepunet import DeepUnet0
+ from .seq import BiGRU
+
+
+ class E2E0(paddle.nn.Layer):
+     def __init__(
+         self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16
+     ):
+         super(E2E0, self).__init__()
+         self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
+         self.cnn = paddle.nn.Conv2D(in_channels=en_out_channels, out_channels=3, kernel_size=(3, 3), padding=(1, 1))
+         if n_gru:
+             self.fc = paddle.nn.Sequential(
+                 BiGRU(3 * N_MELS, 256, n_gru),
+                 paddle.nn.Linear(in_features=512, out_features=N_CLASS),
+                 paddle.nn.Dropout(p=0.25),
+                 paddle.nn.Sigmoid(),
+             )
+         else:
+             self.fc = paddle.nn.Sequential(
+                 paddle.nn.Linear(in_features=3 * N_MELS, out_features=N_CLASS),
+                 paddle.nn.Dropout(p=0.25),
+                 paddle.nn.Sigmoid(),
+             )
+
+     def forward(self, mel):
+         mel = mel.transpose(perm=paddle_aux.transpose_aux_func(mel.ndim, -1, -2)).unsqueeze(axis=1)
+         x = self.cnn(self.unet(mel))
+         x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2)).flatten(start_axis=-2)
+         x = self.fc(x)
+         return x
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/seq.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import paddle
+
+
+ class BiGRU(paddle.nn.Layer):
+     def __init__(self, input_features, hidden_features, num_layers):
+         super(BiGRU, self).__init__()
+         self.gru = paddle.nn.GRU(
+             input_size=input_features,
+             hidden_size=hidden_features,
+             num_layers=num_layers,
+             time_major=False,
+             direction="bidirect",
+         )
+
+     def forward(self, x):
+         return self.gru(x)[0]
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/spec.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import numpy as np
+ import paddle
+ from librosa.filters import mel
+
+
+ class MelSpectrogram(paddle.nn.Layer):
+     def __init__(
+         self, n_mel_channels, sampling_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-05
+     ):
+         super().__init__()
+         n_fft = win_length if n_fft is None else n_fft
+         self.hann_window = {}
+         mel_basis = mel(sr=sampling_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True)
+         mel_basis = paddle.to_tensor(data=mel_basis).astype(dtype="float32")
+         self.register_buffer(name="mel_basis", tensor=mel_basis)
+         self.n_fft = n_fft
+         self.hop_length = hop_length
+         self.win_length = win_length
+         self.sampling_rate = sampling_rate
+         self.n_mel_channels = n_mel_channels
+         self.clamp = clamp
+
+     def forward(self, audio, keyshift=0, speed=1, center=True):
+         factor = 2 ** (keyshift / 12)
+         n_fft_new = int(np.round(self.n_fft * factor))
+         win_length_new = int(np.round(self.win_length * factor))
+         hop_length_new = int(np.round(self.hop_length * speed))
+         keyshift_key = str(keyshift) + "_" + str(audio.place)
+         if keyshift_key not in self.hann_window:
+             self.hann_window[keyshift_key] = paddle.audio.functional.get_window("hann", win_length_new).to(audio.place)
+         fft = paddle.signal.stft(
+             audio,
+             n_fft=n_fft_new,
+             hop_length=hop_length_new,
+             win_length=win_length_new,
+             window=self.hann_window[keyshift_key],
+             center=center,
+         )
+         magnitude = fft.abs()
+         if keyshift != 0:
+             size = self.n_fft // 2 + 1
+             resize = magnitude.shape[1]
+             if resize < size:
+                 magnitude = paddle.nn.functional.pad(
+                     x=magnitude, pad=(0, 0, 0, size - resize), pad_from_left_axis=False
+                 )
+             magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
+         mel_output = paddle.matmul(x=self.mel_basis, y=magnitude)
+         log_mel_spec = paddle.log(x=paddle.clip(x=mel_output, min=self.clamp))
+         return log_mel_spec
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/utils.py ADDED
@@ -0,0 +1,54 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import librosa
+ import numpy as np
+ import paddle
+
+ from paddlemix.models.diffsinger.utils import paddle_aux  # imported for its torch-style Tensor patches used below
+
+ from .constants import *
+
+
+ def to_local_average_f0(hidden, center=None, thred=0.03):
+     idx = paddle.arange(end=N_CLASS)[None, None, :]
+     idx_cents = idx * 20 + CONST
+     if center is None:
+         center = paddle.argmax(x=hidden, axis=2, keepdim=True)
+     start = paddle.clip(x=center - 4, min=0)
+     end = paddle.clip(x=center + 5, max=N_CLASS)
+     idx_mask = (idx >= start) & (idx < end)
+     weights = hidden * idx_mask
+     product_sum = paddle.sum(x=weights * idx_cents, axis=2)
+     weight_sum = paddle.sum(x=weights, axis=2)
+     cents = product_sum / (weight_sum + (weight_sum == 0))
+     f0 = 10 * 2 ** (cents / 1200)
+     uv = hidden.max(dim=2)[0] < thred
+     f0 = f0 * ~uv
+     return f0.squeeze(axis=0).cpu().numpy()
+
+
+ def to_viterbi_f0(hidden, thred=0.03):
+     if not hasattr(to_viterbi_f0, "transition"):
+         xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
+         transition = np.maximum(30 - abs(xx - yy), 0)
+         transition = transition / transition.sum(axis=1, keepdims=True)
+         to_viterbi_f0.transition = transition
+     prob = hidden.squeeze(axis=0).cpu().numpy()
+     prob = prob.T
+     prob = prob / prob.sum(axis=0)
+     path = librosa.sequence.viterbi(prob, to_viterbi_f0.transition).astype(np.int64)
+     center = paddle.to_tensor(data=path).unsqueeze(axis=0).unsqueeze(axis=-1).to(hidden.place)
+     return to_local_average_f0(hidden, center=center, thred=thred)
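
The Viterbi decoder above builds its transition matrix once and caches it on the function object; each row allows jumps of at most ±29 pitch bins with linearly decaying probability. A standalone sketch of that construction:

import numpy as np

N_CLASS = 360
xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
transition = np.maximum(30 - abs(xx - yy), 0)    # zero beyond +/-29 bins
transition = transition / transition.sum(axis=1, keepdims=True)
assert np.allclose(transition.sum(axis=1), 1.0)  # each row is a distribution
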
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/toplevel.py ADDED
@@ -0,0 +1,323 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Dict
+
+ import paddle
+
+ import paddlemix.models.diffsinger.modules.compat as compat
+ from paddlemix.models.diffsinger.basics.base_module import CategorizedModule
+ from paddlemix.models.diffsinger.modules.aux_decoder import AuxDecoderAdaptor
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
+     NormalInitEmbedding as Embedding,
+ )
+ from paddlemix.models.diffsinger.modules.commons.common_layers import (
+     XavierUniformInitLinear as Linear,
+ )
+ from paddlemix.models.diffsinger.modules.core import (
+     GaussianDiffusion,
+     MultiVarianceDiffusion,
+     MultiVarianceRectifiedFlow,
+     PitchDiffusion,
+     PitchRectifiedFlow,
+     RectifiedFlow,
+ )
+ from paddlemix.models.diffsinger.modules.fastspeech.acoustic_encoder import (
+     FastSpeech2Acoustic,
+ )
+ from paddlemix.models.diffsinger.modules.fastspeech.param_adaptor import (
+     ParameterAdaptorModule,
+ )
+ from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
+     LengthRegulator,
+     RhythmRegulator,
+ )
+ from paddlemix.models.diffsinger.modules.fastspeech.variance_encoder import (
+     FastSpeech2Variance,
+     MelodyEncoder,
+ )
+ from paddlemix.models.diffsinger.utils.hparams import hparams
+
+
+ class ShallowDiffusionOutput:
+     def __init__(self, *, aux_out=None, diff_out=None):
+         self.aux_out = aux_out
+         self.diff_out = diff_out
+
+
+ class DiffSingerAcoustic(CategorizedModule, ParameterAdaptorModule):
+     @property
+     def category(self):
+         return "acoustic"
+
+     def __init__(self, vocab_size, out_dims):
+         CategorizedModule.__init__(self)
+         ParameterAdaptorModule.__init__(self)
+         self.fs2 = FastSpeech2Acoustic(vocab_size=vocab_size)
+         self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
+         self.shallow_args = hparams.get("shallow_diffusion_args", {})
+         if self.use_shallow_diffusion:
+             self.train_aux_decoder = self.shallow_args["train_aux_decoder"]
+             self.train_diffusion = self.shallow_args["train_diffusion"]
+             self.aux_decoder_grad = self.shallow_args["aux_decoder_grad"]
+             self.aux_decoder = AuxDecoderAdaptor(
+                 in_dims=hparams["hidden_size"],
+                 out_dims=out_dims,
+                 num_feats=1,
+                 spec_min=hparams["spec_min"],
+                 spec_max=hparams["spec_max"],
+                 aux_decoder_arch=self.shallow_args["aux_decoder_arch"],
+                 aux_decoder_args=self.shallow_args["aux_decoder_args"],
+             )
+         self.diffusion_type = hparams.get("diffusion_type", "ddpm")
+         self.backbone_type = compat.get_backbone_type(hparams)
+         self.backbone_args = compat.get_backbone_args(hparams, self.backbone_type)
+         if self.diffusion_type == "ddpm":
+             self.diffusion = GaussianDiffusion(
+                 out_dims=out_dims,
+                 num_feats=1,
+                 timesteps=hparams["timesteps"],
+                 k_step=hparams["K_step"],
+                 backbone_type=self.backbone_type,
+                 backbone_args=self.backbone_args,
+                 spec_min=hparams["spec_min"],
+                 spec_max=hparams["spec_max"],
+             )
+         elif self.diffusion_type == "reflow":
+             self.diffusion = RectifiedFlow(
+                 out_dims=out_dims,
+                 num_feats=1,
+                 t_start=hparams["T_start"],
+                 time_scale_factor=hparams["time_scale_factor"],
+                 backbone_type=self.backbone_type,
+                 backbone_args=self.backbone_args,
+                 spec_min=hparams["spec_min"],
+                 spec_max=hparams["spec_max"],
+             )
+         else:
+             raise NotImplementedError(self.diffusion_type)
+
+     def forward(
+         self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, spk_embed_id=None, gt_mel=None, infer=True, **kwargs
+     ) -> ShallowDiffusionOutput:
+         condition = self.fs2(
+             txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs
+         )
+         if infer:
+             if self.use_shallow_diffusion:
+                 aux_mel_pred = self.aux_decoder(condition, infer=True)
+                 aux_mel_pred *= (mel2ph > 0).astype(dtype="float32")[:, :, None]
+                 if gt_mel is not None and self.shallow_args["val_gt_start"]:
+                     src_mel = gt_mel
+                 else:
+                     src_mel = aux_mel_pred
+             else:
+                 aux_mel_pred = src_mel = None
+             mel_pred = self.diffusion(condition, src_spec=src_mel, infer=True)
+             mel_pred *= (mel2ph > 0).astype(dtype="float32")[:, :, None]
+             return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred)
+         elif self.use_shallow_diffusion:
+             if self.train_aux_decoder:
+                 aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
+                 aux_out = self.aux_decoder(aux_cond, infer=False)
+             else:
+                 aux_out = None
+             if self.train_diffusion:
+                 diff_out = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+             else:
+                 diff_out = None
+             return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
+         else:
+             aux_out = None
+             diff_out = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+             return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
+
+
+ class DiffSingerVariance(CategorizedModule, ParameterAdaptorModule):
+     @property
+     def category(self):
+         return "variance"
+
+     def __init__(self, vocab_size):
+         CategorizedModule.__init__(self)
+         ParameterAdaptorModule.__init__(self)
+         self.predict_dur = hparams["predict_dur"]
+         self.predict_pitch = hparams["predict_pitch"]
+         self.use_spk_id = hparams["use_spk_id"]
+         if self.use_spk_id:
+             self.spk_embed = Embedding(hparams["num_spk"], hparams["hidden_size"])
+         self.fs2 = FastSpeech2Variance(vocab_size=vocab_size)
+         self.rr = RhythmRegulator()
+         self.lr = LengthRegulator()
+         self.diffusion_type = hparams.get("diffusion_type", "ddpm")
+         if self.predict_pitch:
+             self.use_melody_encoder = hparams.get("use_melody_encoder", False)
+             if self.use_melody_encoder:
+                 self.melody_encoder = MelodyEncoder(enc_hparams=hparams["melody_encoder_args"])
+                 self.delta_pitch_embed = Linear(1, hparams["hidden_size"])
+             else:
+                 self.base_pitch_embed = Linear(1, hparams["hidden_size"])
+             self.pitch_retake_embed = Embedding(2, hparams["hidden_size"])
+             pitch_hparams = hparams["pitch_prediction_args"]
+             self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams)
+             self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type)
+             if self.diffusion_type == "ddpm":
+                 self.pitch_predictor = PitchDiffusion(
+                     vmin=pitch_hparams["pitd_norm_min"],
+                     vmax=pitch_hparams["pitd_norm_max"],
+                     cmin=pitch_hparams["pitd_clip_min"],
+                     cmax=pitch_hparams["pitd_clip_max"],
+                     repeat_bins=pitch_hparams["repeat_bins"],
+                     timesteps=hparams["timesteps"],
+                     k_step=hparams["K_step"],
+                     backbone_type=self.pitch_backbone_type,
+                     backbone_args=self.pitch_backbone_args,
+                 )
+             elif self.diffusion_type == "reflow":
+                 self.pitch_predictor = PitchRectifiedFlow(
+                     vmin=pitch_hparams["pitd_norm_min"],
+                     vmax=pitch_hparams["pitd_norm_max"],
+                     cmin=pitch_hparams["pitd_clip_min"],
+                     cmax=pitch_hparams["pitd_clip_max"],
+                     repeat_bins=pitch_hparams["repeat_bins"],
+                     time_scale_factor=hparams["time_scale_factor"],
+                     backbone_type=self.pitch_backbone_type,
+                     backbone_args=self.pitch_backbone_args,
+                 )
+             else:
+                 raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
+         if self.predict_variances:
+             self.pitch_embed = Linear(1, hparams["hidden_size"])
+             self.variance_embeds = paddle.nn.LayerDict(
+                 sublayers={v_name: Linear(1, hparams["hidden_size"]) for v_name in self.variance_prediction_list}
+             )
+             if self.diffusion_type == "ddpm":
+                 self.variance_predictor = self.build_adaptor(cls=MultiVarianceDiffusion)
+             elif self.diffusion_type == "reflow":
+                 self.variance_predictor = self.build_adaptor(cls=MultiVarianceRectifiedFlow)
+             else:
+                 raise NotImplementedError(self.diffusion_type)
+
+     def forward(
+         self,
+         txt_tokens,
+         midi,
+         ph2word,
+         ph_dur=None,
+         word_dur=None,
+         mel2ph=None,
+         note_midi=None,
+         note_rest=None,
+         note_dur=None,
+         note_glide=None,
+         mel2note=None,
+         base_pitch=None,
+         pitch=None,
+         pitch_expr=None,
+         pitch_retake=None,
+         variance_retake: Dict[str, paddle.Tensor] = None,
+         spk_id=None,
+         infer=True,
+         **kwargs
+     ):
+         if self.use_spk_id:
+             ph_spk_mix_embed = kwargs.get("ph_spk_mix_embed")
+             spk_mix_embed = kwargs.get("spk_mix_embed")
+             if ph_spk_mix_embed is not None and spk_mix_embed is not None:
+                 ph_spk_embed = ph_spk_mix_embed
+                 spk_embed = spk_mix_embed
+             else:
+                 ph_spk_embed = spk_embed = self.spk_embed(spk_id)[:, None, :]
+         else:
+             ph_spk_embed = spk_embed = None
+         encoder_out, dur_pred_out = self.fs2(
+             txt_tokens,
+             midi=midi,
+             ph2word=ph2word,
+             ph_dur=ph_dur,
+             word_dur=word_dur,
+             spk_embed=ph_spk_embed,
+             infer=infer,
+         )
+         if not self.predict_pitch and not self.predict_variances:
+             return dur_pred_out, None, ({} if infer else None)
+         if mel2ph is None and word_dur is not None:
+             dur_pred_align = self.rr(dur_pred_out, ph2word, word_dur)
+             mel2ph = self.lr(dur_pred_align)
+             mel2ph = paddle.nn.functional.pad(
+                 x=mel2ph, pad=[0, tuple(base_pitch.shape)[1] - tuple(mel2ph.shape)[1]], pad_from_left_axis=False
+             )
+         encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
+         mel2ph_ = mel2ph[..., None].tile(repeat_times=[1, 1, hparams["hidden_size"]])
+         condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=mel2ph_, broadcast=False)
+         if self.use_spk_id:
+             condition += spk_embed
+         if self.predict_pitch:
+             if self.use_melody_encoder:
+                 melody_encoder_out = self.melody_encoder(note_midi, note_rest, note_dur, glide=note_glide)
+                 melody_encoder_out = paddle.nn.functional.pad(
+                     x=melody_encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False
+                 )
+                 mel2note_ = mel2note[..., None].tile(repeat_times=[1, 1, hparams["hidden_size"]])
+                 melody_condition = paddle.take_along_axis(
+                     arr=melody_encoder_out, axis=1, indices=mel2note_, broadcast=False
+                 )
+                 pitch_cond = condition + melody_condition
+             else:
+                 pitch_cond = condition.clone()
+             retake_unset = pitch_retake is None
+             if retake_unset:
+                 pitch_retake = paddle.ones_like(x=mel2ph, dtype="bool")
+             if pitch_expr is None:
+                 pitch_retake_embed = self.pitch_retake_embed(pitch_retake.astype(dtype="int64"))
+             else:
+                 retake_true_embed = self.pitch_retake_embed(paddle.ones(shape=[1, 1], dtype="int64"))
+                 retake_false_embed = self.pitch_retake_embed(paddle.zeros(shape=[1, 1], dtype="int64"))
+                 pitch_expr = (pitch_expr * pitch_retake)[:, :, None]
+                 pitch_retake_embed = pitch_expr * retake_true_embed + (1.0 - pitch_expr) * retake_false_embed
+             pitch_cond += pitch_retake_embed
+             if self.use_melody_encoder:
+                 if retake_unset:
+                     delta_pitch_in = paddle.zeros_like(x=base_pitch)
+                 else:
+                     delta_pitch_in = (pitch - base_pitch) * ~pitch_retake
+                 pitch_cond += self.delta_pitch_embed(delta_pitch_in[:, :, None])
+             else:
+                 if not retake_unset:
+                     base_pitch = base_pitch * pitch_retake + pitch * ~pitch_retake
+                 pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
+             if infer:
+                 pitch_pred_out = self.pitch_predictor(pitch_cond, infer=True)
+             else:
+                 pitch_pred_out = self.pitch_predictor(pitch_cond, pitch - base_pitch, infer=False)
+         else:
+             pitch_pred_out = None
+         if not self.predict_variances:
+             return dur_pred_out, pitch_pred_out, ({} if infer else None)
+         if pitch is None:
+             pitch = base_pitch + pitch_pred_out
+         var_cond = condition + self.pitch_embed(pitch[:, :, None])
+         variance_inputs = self.collect_variance_inputs(**kwargs)
+         if variance_retake is not None:
+             variance_embeds = [
+                 (self.variance_embeds[v_name](v_input[:, :, None]) * ~variance_retake[v_name][:, :, None])
+                 for v_name, v_input in zip(self.variance_prediction_list, variance_inputs)
+             ]
+             var_cond += paddle.stack(x=variance_embeds, axis=-1).sum(axis=-1)
+         variance_outputs = self.variance_predictor(var_cond, variance_inputs, infer=infer)
+         if infer:
+             variances_pred_out = self.collect_variance_outputs(variance_outputs)
+         else:
+             variances_pred_out = variance_outputs
+         return dur_pred_out, pitch_pred_out, variances_pred_out
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/utils/__init__.py ADDED
@@ -0,0 +1,342 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import pathlib
+ import re
+ import time
+ import types
+ from collections import OrderedDict
+
+ import numpy as np
+ import paddle
+
+ from paddlemix.models.diffsinger.basics.base_module import CategorizedModule
+ from paddlemix.models.diffsinger.utils import paddle_aux
+ from paddlemix.models.diffsinger.utils.hparams import hparams
+
+
+ def tensors_to_scalars(metrics):
+     new_metrics = {}
+     for k, v in metrics.items():
+         if isinstance(v, paddle.Tensor):
+             v = v.item()
+         if type(v) is dict:
+             v = tensors_to_scalars(v)
+         new_metrics[k] = v
+     return new_metrics
+
+
+ def collate_nd(values, pad_value=0, max_len=None):
+     """
+     Pad a list of Nd tensors on their first dimension and stack them into a (N+1)d tensor.
+     """
+     size = (max(v.shape[0] for v in values) if max_len is None else max_len), *tuple(values[0].shape)[1:]
+     res = paddle.full(shape=(len(values), *size), fill_value=pad_value, dtype=values[0].dtype)
+     for i, v in enumerate(values):
+         res[i, : len(v), ...] = v
+     return res
+
+
+ def random_continuous_masks(*shape: int, dim: int, device: str = "cpu"):
+     start, end = paddle.sort(
+         x=paddle.randint(
+             low=0, high=shape[dim] + 1, shape=(*shape[:dim], 2, *((1,) * (len(shape) - dim - 1)))
+         ).expand(shape=[*((-1,) * (dim + 1)), *shape[dim + 1 :]]),
+         axis=dim,
+     ).split(1, dim=dim)
+     idx = paddle.arange(start=0, end=shape[dim], dtype="int64").reshape(
+         *((1,) * dim), shape[dim], *((1,) * (len(shape) - dim - 1))
+     )
+     masks = (idx >= start) & (idx < end)
+     return masks
+
+
+ def _is_batch_full(batch, num_frames, max_batch_frames, max_batch_size):
+     if len(batch) == 0:
+         return 0
+     if len(batch) == max_batch_size:
+         return 1
+     if num_frames > max_batch_frames:
+         return 1
+     return 0
+
+
+ def batch_by_size(indices, num_frames_fn, max_batch_frames=80000, max_batch_size=48, required_batch_size_multiple=1):
+     """
+     Yield mini-batches of indices bucketed by size. Batches may contain
+     sequences of different lengths.
+
+     Args:
+         indices (List[int]): ordered list of dataset indices
+         num_frames_fn (callable): function that returns the number of frames at
+             a given index
+         max_batch_frames (int, optional): max number of frames in each batch
+             (default: 80000).
+         max_batch_size (int, optional): max number of sentences in each
+             batch (default: 48).
+         required_batch_size_multiple: require the batch size to be a multiple
+             of a given number
+     """
+     bsz_mult = required_batch_size_multiple
+     if isinstance(indices, types.GeneratorType):
+         indices = np.fromiter(indices, dtype=np.int64, count=-1)
+     sample_len = 0
+     sample_lens = []
+     batch = []
+     batches = []
+     for i in range(len(indices)):
+         idx = indices[i]
+         num_frames = num_frames_fn(idx)
+         sample_lens.append(num_frames)
+         sample_len = max(sample_len, num_frames)
+         assert (
+             sample_len <= max_batch_frames
+         ), "sentence at index {} of size {} exceeds max_batch_samples limit of {}!".format(
+             idx, sample_len, max_batch_frames
+         )
+         num_frames = (len(batch) + 1) * sample_len
+         if _is_batch_full(batch, num_frames, max_batch_frames, max_batch_size):
+             mod_len = max(bsz_mult * (len(batch) // bsz_mult), len(batch) % bsz_mult)
+             batches.append(batch[:mod_len])
+             batch = batch[mod_len:]
+             sample_lens = sample_lens[mod_len:]
+             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+         batch.append(idx)
+     if len(batch) > 0:
+         batches.append(batch)
+     return batches
+
+
+ def make_positions(tensor, padding_idx):
+     """Replace non-padding symbols with their position numbers.
+
+     Position numbers begin at padding_idx+1. Padding symbols are ignored.
+     """
+     mask = tensor.not_equal(y=paddle.to_tensor(padding_idx)).astype(dtype="int32")
+     return (paddle.cumsum(x=mask, axis=1).astype(dtype=mask.dtype) * mask).astype(dtype="int64") + padding_idx
+
+
+ def softmax(x, dim):
+     return paddle.nn.functional.softmax(x=x, axis=dim, dtype="float32")
+
+
+ def unpack_dict_to_list(samples):
+     samples_ = []
+     bsz = samples.get("outputs").shape[0]
+     for i in range(bsz):
+         res = {}
+         for k, v in samples.items():
+             try:
+                 res[k] = v[i]
+             except:
+                 pass
+         samples_.append(res)
+     return samples_
+
+
+ def filter_kwargs(dict_to_filter, kwarg_obj):
+     import inspect
+
+     sig = inspect.signature(kwarg_obj)
+     if any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values()):
+         return dict_to_filter.copy()
+     filter_keys = [
+         param.name
+         for param in sig.parameters.values()
+         if param.kind == param.POSITIONAL_OR_KEYWORD or param.kind == param.KEYWORD_ONLY
+     ]
+     filtered_dict = {
+         filter_key: dict_to_filter[filter_key] for filter_key in filter_keys if filter_key in dict_to_filter
+     }
+     return filtered_dict
+
+
+ def load_ckpt(
+     cur_model,
+     ckpt_base_dir,
+     ckpt_steps=None,
+     prefix_in_ckpt="model",
+     ignored_prefixes=None,
+     key_in_ckpt="state_dict",
+     strict=True,
+     device="cpu",
+ ):
+     if ignored_prefixes is None:
+         ignored_prefixes = ["model.fs2.encoder.embed_tokens"]
+     if not isinstance(ckpt_base_dir, pathlib.Path):
+         ckpt_base_dir = pathlib.Path(ckpt_base_dir)
+     if ckpt_base_dir.is_file():
+         checkpoint_path = [ckpt_base_dir]
+     elif ckpt_steps is not None:
+         checkpoint_path = [ckpt_base_dir / f"model_ckpt_steps_{int(ckpt_steps)}.ckpt"]
+     else:
+         base_dir = ckpt_base_dir
+         checkpoint_path = sorted(
+             [
+                 ckpt_file
+                 for ckpt_file in base_dir.iterdir()
+                 if ckpt_file.is_file() and re.fullmatch(r"model_ckpt_steps_\d+\.ckpt", ckpt_file.name)
+             ],
+             key=lambda x: int(re.search(r"\d+", x.name).group(0)),
+         )
+     assert len(checkpoint_path) > 0, f"| ckpt not found in {ckpt_base_dir}."
+     checkpoint_path = checkpoint_path[-1]
+     ckpt_loaded = paddle.load(path=str(checkpoint_path))
+     if isinstance(cur_model, CategorizedModule):
+         cur_model.check_category(ckpt_loaded.get("category"))
+     if key_in_ckpt is None:
+         state_dict = ckpt_loaded
+     else:
+         state_dict = ckpt_loaded[key_in_ckpt]
+     if prefix_in_ckpt is not None:
+         state_dict = OrderedDict(
+             {
+                 k[len(prefix_in_ckpt) + 1 :]: v
+                 for k, v in state_dict.items()
+                 if k.startswith(f"{prefix_in_ckpt}.")
+                 if all(not k.startswith(p) for p in ignored_prefixes)
+             }
+         )
+     if not strict:
+         cur_model_state_dict = cur_model.state_dict()
+         unmatched_keys = []
+         for key, param in state_dict.items():
+             if key in cur_model_state_dict:
+                 new_param = cur_model_state_dict[key]
+                 if tuple(new_param.shape) != tuple(param.shape):
+                     unmatched_keys.append(key)
+                     print("| Unmatched keys: ", key, tuple(new_param.shape), tuple(param.shape))
+         for key in unmatched_keys:
+             del state_dict[key]
+     cur_model.set_state_dict(state_dict=state_dict)
+     shown_model_name = "state dict"
+     if prefix_in_ckpt is not None:
+         shown_model_name = f"'{prefix_in_ckpt}'"
+     elif key_in_ckpt is not None:
+         shown_model_name = f"'{key_in_ckpt}'"
+     print(f"| load {shown_model_name} from '{checkpoint_path}'.")
+
+
+ def remove_padding(x, padding_idx=0):
+     if x is None:
+         return None
+     assert len(tuple(x.shape)) in [1, 2]
+     if len(tuple(x.shape)) == 2:
+         return x[np.abs(x).sum(-1) != padding_idx]
+     elif len(tuple(x.shape)) == 1:
+         return x[x != padding_idx]
+
+
+ class Timer:
+     timer_map = {}
+
+     def __init__(self, name, print_time=False):
+         if name not in Timer.timer_map:
+             Timer.timer_map[name] = 0
+         self.name = name
+         self.print_time = print_time
+
+     def __enter__(self):
+         self.t = time.time()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         Timer.timer_map[self.name] += time.time() - self.t
+         if self.print_time:
+             print(self.name, Timer.timer_map[self.name])
+
+
+ def print_arch(model, model_name="model"):
+     print(f"| {model_name} Arch: ", model)
+
+
+ def num_params(model, print_out=True, model_name="model"):
+     parameters = filter(lambda p: not p.stop_gradient, model.parameters())
+     parameters = sum([np.prod(tuple(p.shape)) for p in parameters]) / 1000000
+     if print_out:
+         print(f"| {model_name} Trainable Parameters: %.3fM" % parameters)
+     return parameters
+
+
+ def build_object_from_class_name(cls_str, parent_cls, *args, **kwargs):
+     import importlib
+
+     pkg = ".".join(cls_str.split(".")[:-1])
+     cls_name = cls_str.split(".")[-1]
+     cls_type = getattr(importlib.import_module(pkg), cls_name)
+     if parent_cls is not None:
+         assert issubclass(cls_type, parent_cls), f"| {cls_type} is not subclass of {parent_cls}."
+     return cls_type(*args, **filter_kwargs(kwargs, cls_type))
+
+
+ def build_lr_scheduler_from_config(optimizer, scheduler_args):
+     from paddle.optimizer.lr import LRScheduler
+
+     def helper(params):
+         if isinstance(params, list):
+             return [helper(s) for s in params]
+         elif isinstance(params, dict):
+             resolved = {k: helper(v) for k, v in params.items()}
+             if "cls" in resolved:
+                 if (
+                     resolved["cls"] == "torch.optim.lr_scheduler.ChainedScheduler"
+                     and scheduler_args["scheduler_cls"] == "torch.optim.lr_scheduler.SequentialLR"
+                 ):
+                     raise ValueError("ChainedScheduler cannot be part of a SequentialLR.")
+                 resolved["optimizer"] = optimizer
+                 obj = build_object_from_class_name(resolved["cls"], LRScheduler, **resolved)
+                 return obj
+             return resolved
+         else:
+             return params
+
+     resolved = helper(scheduler_args)
+     resolved["optimizer"] = optimizer
+     return build_object_from_class_name(scheduler_args["scheduler_cls"], LRScheduler, **resolved)
+
+
+ def simulate_lr_scheduler(optimizer_args, scheduler_args, step_count, num_param_groups=1):
+     optimizer = build_object_from_class_name(
+         optimizer_args["optimizer_cls"],
+         paddle.optimizer.Optimizer,
+         [
+             {
+                 "params": paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.to_tensor([])),
+                 "initial_lr": optimizer_args["lr"],
+             }
+             for _ in range(num_param_groups)
+         ],
+         **optimizer_args,
+     )
+     scheduler = build_lr_scheduler_from_config(optimizer, scheduler_args)
+     scheduler.optimizer._step_count = 1
+     for _ in range(step_count):
+         scheduler.step()
+     return scheduler.state_dict()
+
+
+ def remove_suffix(string: str, suffix: str):
+     if string.endswith(suffix):
+         string = string[: -len(suffix)]
+     return string
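
A small usage sketch for collate_nd above (shapes assumed for illustration): tensors are padded along their first dimension to the longest length, then stacked.

import paddle
from paddlemix.models.diffsinger.utils import collate_nd

a = paddle.ones([3, 2])
b = paddle.ones([5, 2])
batch = collate_nd([a, b], pad_value=0)  # shape [2, 5, 2]; a is zero-padded to length 5
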