Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- DeepSeek-VL2/vg.jpg +0 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/README.md +110 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/requirement.txt +3 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/run_train.sh +78 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/train_GOT.py +243 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/examples/ppdocbee/app.py +350 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/GOT/utils/conversation.py +400 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py +13 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py +103 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py +28 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py +62 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/unet/attention.py +199 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_augmentation.py +46 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_binarizer.py +330 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_exporter.py +72 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_svs_infer.py +149 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_vocoder.py +37 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/aux_decoder/convnext.py +103 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/__init__.py +26 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/lynxnet.py +188 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/wavenet.py +120 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/common_layers.py +187 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/espnet_positional_embedding.py +129 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/compat.py +35 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/__init__.py +16 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/ddpm.py +521 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/reflow.py +311 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/acoustic_encoder.py +110 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/param_adaptor.py +88 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/tts_modules.py +473 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/variance_encoder.py +151 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/__init__.py +42 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/layers.py +140 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/nets.py +185 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/env.py +46 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/models.py +380 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/nvSTFT.py +104 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/utils.py +27 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/pm.py +30 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/__init__.py +19 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/constants.py +21 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/deepunet.py +194 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/inference.py +80 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/model.py +54 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/seq.py +30 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/spec.py +65 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/utils.py +54 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/toplevel.py +323 -0
- VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/utils/__init__.py +342 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
DeepSeek-VL2/vg.jpg filter=lfs diff=lfs merge=lfs -text
|
DeepSeek-VL2/vg.jpg
ADDED
![]() |
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/README.md
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GOT-OCR2.0
|
2 |
+
|
3 |
+
## 1. 模型介绍
|
4 |
+
|
5 |
+
[GOT-OCR2.0](https://arxiv.org/abs/2409.01704)是由 StepFun 和中国科学院大学推出的专用于通用 OCR 任务的多模态大模型,参数量 0.6B,是一款极具突破性的通用OCR多模态模型,旨在解决传统OCR系统(OCR-1.0)和当前大规模视觉语言模型(LVLMs)在OCR任务中的局限性。
|
6 |
+
|
7 |
+
**本仓库支持的模型权重:**
|
8 |
+
|
9 |
+
| Model |
|
10 |
+
|--------------------|
|
11 |
+
| stepfun-ai/GOT-OCR2_0 |
|
12 |
+
|
13 |
+
注意:与huggingface权重同名,但权重为paddle框架的Tensor,使用`xxx.from_pretrained("stepfun-ai/GOT-OCR2_0")`即可自动下载该权重文件夹到缓存目录。
|
14 |
+
|
15 |
+
|
16 |
+
## 2. 环境要求
|
17 |
+
- **python >= 3.10**
|
18 |
+
- **paddlepaddle-gpu 要求3.0.0b2版本或develop版本**
|
19 |
+
```
|
20 |
+
# 安装示例
|
21 |
+
python -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
|
22 |
+
```
|
23 |
+
|
24 |
+
- **paddlenlp == 3.0.0b3**
|
25 |
+
- **paddlenlp要求是3.0.0b3版本**
|
26 |
+
```
|
27 |
+
# 安装示例
|
28 |
+
python -m pip install paddlenlp==3.0.0b3
|
29 |
+
```
|
30 |
+
|
31 |
+
- **其他环境要求**
|
32 |
+
```
|
33 |
+
pip install -r paddlemix/examples/GOT_OCR_2_0/requirement.txt
|
34 |
+
```
|
35 |
+
|
36 |
+
## 3. 推理预测
|
37 |
+
|
38 |
+
注意:GOT-OCR2.0 模型推理约需 4G 显存,暂不支持以 "float16" 数据类型进行推理。
|
39 |
+
|
40 |
+
### 3.1. plain texts OCR:
|
41 |
+
```bash
|
42 |
+
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
|
43 |
+
--model_name_or_path stepfun-ai/GOT-OCR2_0 \
|
44 |
+
--image_file paddlemix/demo_images/hospital.jpeg \
|
45 |
+
--ocr_type ocr \
|
46 |
+
--dtype "bfloat16"
|
47 |
+
```
|
48 |
+
|
49 |
+
### 3.2. format texts OCR:
|
50 |
+
```bash
|
51 |
+
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
|
52 |
+
--model_name_or_path stepfun-ai/GOT-OCR2_0 \
|
53 |
+
--image_file paddlemix/demo_images/hospital.jpeg \
|
54 |
+
--ocr_type format \
|
55 |
+
--dtype "bfloat16"
|
56 |
+
```
|
57 |
+
|
58 |
+
### 3.3. multi_crop plain texts OCR:
|
59 |
+
```bash
|
60 |
+
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
|
61 |
+
--model_name_or_path stepfun-ai/GOT-OCR2_0 \
|
62 |
+
--image_file paddlemix/demo_images/hospital.jpeg \
|
63 |
+
--ocr_type ocr \
|
64 |
+
--multi_crop \
|
65 |
+
--dtype "bfloat16"
|
66 |
+
```
|
67 |
+
|
68 |
+
## 4. 训练
|
69 |
+
|
70 |
+
与[官方github代码库](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/?tab=readme-ov-file#train)一样,目前仅支持基于GOT权重的post-training(stage-2/stage-3),其中stage2是全参数微调,stage3是冻结vision encoder后微调,默认训练方式是stage2全参数微调,训练显存约10GB每卡。
|
71 |
+
|
72 |
+
### 数据集下载
|
73 |
+
PaddleMIX团队提供了一个改版的SynthDoG-EN数据集,统一修改了其原先的question为```<image>\nOCR:```,下载链接为:
|
74 |
+
```
|
75 |
+
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/synthdog_en.tar # 2.4G
|
76 |
+
```
|
77 |
+
synthdog_en.tar包括了图片images文件夹和标注json文件,需下载解压或软链接在PaddleMIX/目录下。
|
78 |
+
|
79 |
+
### 数据集格式
|
80 |
+
|
81 |
+
同[官方例子](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/blob/main/assets/train_sample.jpg),其中question统一为```<image>\nOCR:```,answer是其OCR结果。
|
82 |
+
|
83 |
+
|
84 |
+
### 训练命令
|
85 |
+
|
86 |
+
```bash
|
87 |
+
sh paddlemix/examples/GOT_OCR_2_0/run_train.sh
|
88 |
+
```
|
89 |
+
|
90 |
+
注意:默认训练方式是stage2全参数微调,训练显存约10GB每卡。也可通过设置```--freeze_vision_tower True```冻结vision encoder后微调。
|
91 |
+
|
92 |
+
### 训完后推理
|
93 |
+
|
94 |
+
```bash
|
95 |
+
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
|
96 |
+
--model_name_or_path work_dirs/got_ocr_20/ \
|
97 |
+
--image_file paddlemix/demo_images/hospital.jpeg \
|
98 |
+
--ocr_type ocr \
|
99 |
+
```
|
100 |
+
|
101 |
+
|
102 |
+
## 参考文献
|
103 |
+
```BibTeX
|
104 |
+
@article{wei2024general,
|
105 |
+
title={General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model},
|
106 |
+
author={Wei, Haoran and Liu, Chenglong and Chen, Jinyue and Wang, Jia and Kong, Lingyu and Xu, Yanming and Ge, Zheng and Zhao, Liang and Sun, Jianjian and Peng, Yuang and others},
|
107 |
+
journal={arXiv preprint arXiv:2409.01704},
|
108 |
+
year={2024}
|
109 |
+
}
|
110 |
+
```
|
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/requirement.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
megfile
|
2 |
+
natsort
|
3 |
+
paddlenlp==3.0.0b3
|
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/run_train.sh
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
# Launch GOT-OCR2.0 post-training through paddle.distributed.launch.
# Trace every command so the resolved launch line is visible in the log.
set -x

# Number of GPUs, global batch size and per-GPU micro-batch size.
# All three can be overridden from the environment.
GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-32}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}

# Gradient accumulation steps derived from the global batch size.
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
# Integer division yields 0 when BATCH_SIZE < PER_DEVICE_BATCH_SIZE * GPUS;
# fall back to 1 so the trainer never receives a zero step count.
if [ "${GRADIENT_ACC}" -lt 1 ]; then
  echo "BATCH_SIZE=${BATCH_SIZE} is smaller than PER_DEVICE_BATCH_SIZE*GPUS; using gradient_accumulation_steps=1" >&2
  GRADIENT_ACC=1
fi
tensor_parallel_degree=${tensor_parallel_degree:-1}
sharding_parallel_degree=$((GPUS / tensor_parallel_degree))

export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

# Output directory; defaults to the previous hard-coded location.
OUTPUT_DIR=${OUTPUT_DIR:-work_dirs/got_ocr_20}

# mkdir -p is a no-op when the directory already exists.
mkdir -p "${OUTPUT_DIR}"

# NOTE(review): TRAINING_MODEL_RESUME is not referenced anywhere in this script.
TRAINING_MODEL_RESUME="None"
TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'

# Set --freeze_vision_tower to True below for stage3 (False is stage2).

# TRAINING_PYTHON is expanded unquoted on purpose so it splits into separate arguments.
TRAINING_PYTHON="python -m paddle.distributed.launch --master ${MASTER} --nnodes 1 --nproc_per_node ${GPUS} --rank 0 --ips ${TRAINER_INSTANCES} --run_mode=collective"
${TRAINING_PYTHON} --log_dir "${OUTPUT_DIR}/paddle_distributed_logs" \
  paddlemix/examples/GOT_OCR_2_0/train_GOT.py \
  --do_train \
  --model_name_or_path "stepfun-ai/GOT-OCR2_0" \
  --output_dir "${OUTPUT_DIR}" \
  --logging_dir "${OUTPUT_DIR}/logs" \
  --meta_path paddlemix/examples/GOT_OCR_2_0/configs/demo_dataset.json \
  --overwrite_output_dir True \
  --dataloader_num_workers 8 \
  --bf16 True \
  --fp16 False \
  --fp16_opt_level "O2" \
  --num_train_epochs 1 \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --freeze_vision_tower False \
  --use_im_start_end True \
  --max_seq_length 8192 \
  --recompute False \
  --max_grad_norm 1.0 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 200 \
  --save_total_limit 1 \
  --learning_rate 2e-5 \
  --weight_decay 0. \
  --warmup_ratio 0.001 \
  --optim "adamw" \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --report_to "visualdl" \
  --tensor_parallel_degree="${tensor_parallel_degree}" \
  --sharding_parallel_degree="${sharding_parallel_degree}" \
  --pipeline_parallel_degree=1 \
  --sep_parallel_degree=1 \
  --sharding="stage1" \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
|
VLMEvalKit_old/PaddleMIX/paddlemix/examples/GOT_OCR_2_0/train_GOT.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
|
2 |
+
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
|
3 |
+
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
|
17 |
+
import logging
|
18 |
+
import os
|
19 |
+
import sys
|
20 |
+
from dataclasses import dataclass, field
|
21 |
+
from typing import Optional
|
22 |
+
|
23 |
+
import paddle
|
24 |
+
import paddle.distributed as dist
|
25 |
+
from paddlenlp.trainer import PdArgumentParser, TrainingArguments, set_seed
|
26 |
+
from paddlenlp.trainer.trainer import Trainer
|
27 |
+
from paddlenlp.trainer.trainer_utils import get_last_checkpoint
|
28 |
+
from paddlenlp.transformers import QWenTokenizer
|
29 |
+
|
30 |
+
from paddlemix.datasets.got_dataset import make_supervised_data_module
|
31 |
+
from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM
|
32 |
+
from paddlemix.models.GOT.utils.utils import smart_tokenizer_and_embedding_resize
|
33 |
+
|
34 |
+
logger = logging.getLogger(__name__)
|
35 |
+
|
36 |
+
|
37 |
+
def print_trainable_params(model: paddle.nn.Layer) -> None:
    """Print the trainable and total parameter counts of ``model``.

    A parameter counts as trainable when its ``stop_gradient`` flag is false.
    The percentage is reported as 0 for a model without parameters instead of
    raising ``ZeroDivisionError``.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
    """
    trainable_params, all_param = 0, 0
    for _, param in model.named_parameters():
        num_params = param.size
        # Fall back to ``ds_numel`` when the reported size is 0 and the
        # parameter carries that attribute.
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if not param.stop_gradient:
            trainable_params += num_params

    # Guard the ratio: an empty model has all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
            trainable_params, all_param, trainable_pct
        )
    )
|
52 |
+
|
53 |
+
|
54 |
+
@dataclass
class ModelArguments:
    """Options that select the model and decide which of its parts are frozen."""

    # Model name or local path handed to ``from_pretrained``.
    model_name_or_path: Optional[str] = "stepfun-ai/GOT-OCR2_0"
    use_cache: bool = False
    vision_tower: Optional[str] = "openai/clip-vit-large-patch14"
    freeze_vision_tower: bool = False
    freeze_lm_model: bool = False
    # mlp &/ vision tower
    pretrained_stage1_model: Optional[str] = None
    # default to the last layer
    vision_select_layer: Optional[int] = -1
    use_im_start_end: bool = False
|
64 |
+
|
65 |
+
|
66 |
+
@dataclass
class DataArguments:
    """Options describing the training data and how samples are prepared."""

    # Annotated Optional because the default is None.
    datasets: Optional[str] = field(default=None, metadata={"help": "combinations of the training data."})
    meta_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the meta file of datasets."},
    )
    sep_image_conv_front: bool = False
    image_token_len: int = 256
    image_aspect_ratio: str = "square"
    conversation_version: str = "mpt"
    box_limit: int = 0
    max_seq_length: int = 8192
|
79 |
+
|
80 |
+
|
81 |
+
@dataclass
class GOTTrainingArguments(TrainingArguments):
    """Training options for GOT-OCR2.0, extending PaddleNLP's ``TrainingArguments``."""

    cache_dir: Optional[str] = field(default=None)
    # "adamw" is the value run_train.sh passes via --optim.
    # NOTE(review): the previous default "adamw_torch" is a Hugging Face/PyTorch
    # optimizer name that PaddleNLP does not appear to accept - confirm.
    optim: str = field(default="adamw")
    remove_unused_columns: bool = field(default=False)
    force_fsdp: bool = field(default=False)
    # Forwarded to make_supervised_data_module by train().
    interleave: bool = field(default=False)
    with_box: bool = field(default=False)
    # Passed to the tokenizer as ``model_max_length`` by train().
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    lora_enable: bool = False
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
|
99 |
+
|
100 |
+
|
101 |
+
def train():
    """Fine-tune GOT-OCR2.0 with the PaddleNLP ``Trainer``.

    Arguments come from the command line, or from a single ``.json`` file when
    that is the only argument. The function parses the arguments, looks for a
    resumable checkpoint, loads the tokenizer and model, initializes the vision
    modules, optionally freezes parameter groups, builds the data module, and
    finally trains and saves the model, metrics and trainer state.

    Raises:
        ValueError: If the output directory exists, is non-empty, contains no
            checkpoint and ``--overwrite_output_dir`` is not set; or if
            ``fp16_opt_level`` is "O2" and neither fp16 nor a supported bf16
            is selected.
    """
    parser = PdArgumentParser((ModelArguments, DataArguments, GOTTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script, and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    # Detecting last checkpoint and eventually continue from last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Choose the dtype used to load the model weights.
    if training_args.fp16_opt_level == "O2":
        if training_args.fp16:
            dtype = "float16"
        elif training_args.bf16 and paddle.amp.is_bfloat16_supported():
            dtype = "bfloat16"
        else:
            raise ValueError("Please specific dtype: --fp16 or --bf16")
    else:
        dtype = "float32"

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load pretrained model, tokenizer, and image processor
    tokenizer_path = model_args.model_name_or_path
    print(f"Loading Tokenizer: {tokenizer_path}")

    tokenizer = QWenTokenizer.from_pretrained(
        model_args.model_name_or_path, padding_side="right", model_max_length=training_args.model_max_length
    )
    print("tokenizer", tokenizer)

    model = GOTQwenForCausalLM.from_pretrained(model_args.model_name_or_path, dtype=dtype)

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token="<|endoftext|>"),
        tokenizer=tokenizer,
        model=model,
    )

    vision_tower_dict = model.get_model().initialize_vision_modules(
        vision_tower=model_args.vision_tower,
        pretrained_stage1_model=model_args.pretrained_stage1_model,
        freeze_vision_tower=model_args.freeze_vision_tower,
        use_im_start_end=model_args.use_im_start_end,
        vision_select_layer=model_args.vision_select_layer,
        dtype=dtype,
    )

    model.initialize_vision_tokenizer(
        tokenizer=tokenizer,
        freeze_lm_model=model_args.freeze_lm_model,
        pretrained_stage1_model=model_args.pretrained_stage1_model,
    )

    # Attach the image processors and related settings to data_args.
    data_args.image_token_len = 256
    data_args.image_processor = vision_tower_dict["image_processor"]
    data_args.image_processor_high = vision_tower_dict["image_processor_high"]
    data_args.use_im_start_end = model_args.use_im_start_end

    def _freeze_params(module):
        """Exclude every parameter of ``module`` from gradient computation."""
        for param in module.parameters():
            param.stop_gradient = True

    # mixed relation, to be fixed
    if model_args.freeze_lm_model:
        _freeze_params(model.get_model().mm_projector)
        _freeze_params(model.get_model().mm_projector_vary)
        _freeze_params(model.get_input_embeddings())

    if model_args.freeze_vision_tower:
        _freeze_params(model.qwen2.vision_tower_high)

    print_trainable_params(model)
    # trainable params: 464959488 || all params: 560528640 || trainable%: 82.9502 # stage3
    # trainable params: 560528640 || all params: 560528640 || trainable%: 100 # stage2
    params_grad = [p.numel() for _, p in model.named_parameters() if not p.stop_gradient]
    print(f"Number of Mapping Trainable Parameters: {int(sum(params_grad)) / (1 << 20):.2f} M")

    # Log the names of trainable parameters on rank 0 only.
    if dist.get_rank() == 0:
        for name, param in model.named_parameters():
            if not param.stop_gradient:
                logger.info(name)

    # set seed for paddle dataloaders
    set_seed(training_args.seed)

    data_module = make_supervised_data_module(
        interleave=training_args.interleave, with_box=training_args.with_box, tokenizer=tokenizer, data_args=data_args
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        **data_module,
    )

    # Training
    if training_args.do_train:
        # An explicit --resume_from_checkpoint wins over an auto-detected checkpoint.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        try:
            metrics["train_samples"] = len(data_module["train_dataset"])
        except (KeyError, TypeError):
            # No "train_dataset" entry, or a dataset without __len__: record -1.
            # Anything else (including KeyboardInterrupt) now propagates.
            metrics["train_samples"] = -1

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()


if __name__ == "__main__":
    train()
|
VLMEvalKit_old/PaddleMIX/paddlemix/examples/ppdocbee/app.py
ADDED
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import hashlib
|
16 |
+
import os
|
17 |
+
import os.path
|
18 |
+
import sys
|
19 |
+
import tempfile
|
20 |
+
import time
|
21 |
+
from datetime import datetime
|
22 |
+
|
23 |
+
import gradio as gr
|
24 |
+
import numpy as np
|
25 |
+
import paddle
|
26 |
+
from PIL import Image
|
27 |
+
|
28 |
+
# Select the GPU device to use.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Model configuration.
model_path = "PaddleMIX/PPDocBee-2B-1129"
dtype = "bfloat16"  # change to float16 on V100

# Module-level state, filled in lazily by load_model().
model = processor = None

min_pixels = 256 * 28**2  # minimum number of pixels
max_pixels = (48 * 28) ** 2  # maximum number of pixels

SERVER_NAME = "localhost"
# NOTE(review): "SERVER_PORR" looks like a typo of SERVER_PORT; the name is kept
# because code outside this section may reference it.
SERVER_PORR = 8080
|
44 |
+
|
45 |
+
|
46 |
+
def check_and_install_paddlemix():
    """Check that the Qwen2-VL model code from paddlemix can be imported.

    Despite its name, this function installs nothing: it only attempts the
    import. On failure it prints the original message plus the underlying
    import error, then exits the process with status 1.
    """
    try:
        from paddlemix.models.qwen2_vl.modeling_qwen2_vl import (  # noqa: F401
            Qwen2VLForConditionalGeneration,
        )
    except ImportError as err:
        print("Failed to install required Qwen2VL model even after running the script")
        # Show the real cause (e.g. which module is missing) so the failure is diagnosable.
        print(f"Import error: {err}")
        sys.exit(1)

    print("Required Qwen2VL model successfully installed")
|
56 |
+
|
57 |
+
|
58 |
+
# 在继续之前检查所需模型
|
59 |
+
check_and_install_paddlemix()
|
60 |
+
|
61 |
+
|
62 |
+
from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer
|
63 |
+
from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
|
64 |
+
from paddlemix.processors.qwen2_vl_processing import (
|
65 |
+
Qwen2VLImageProcessor,
|
66 |
+
Qwen2VLProcessor,
|
67 |
+
process_vision_info,
|
68 |
+
)
|
69 |
+
|
70 |
+
# Example [question, image path] pairs; multimodal_understanding compares an
# incoming image against these paths to decide whether it is an example.
EXAMPLES = [
    [question, image]
    for question, image in (
        ("维修保养、其他注意事项的注意点中,电池需为什么型号的?", "paddlemix/demo_images/shuomingshu_20.png"),
        ("产品期限是多久?", "paddlemix/demo_images/shuomingshu_39.png"),
    )
]
|
81 |
+
|
82 |
+
|
83 |
+
class ImageCache:
    """Keep a temporary on-disk copy of the image currently in use.

    Uploaded images are re-saved as RGB into a private temporary directory.
    Example images are used in place and are never copied or deleted.
    """

    def __init__(self):
        """Create the temporary directory and schedule its removal at exit."""
        import atexit

        self.temp_dir = tempfile.mkdtemp()
        self.current_image = None
        self.is_example = False  # whether the current image is an example image
        # mkdtemp() never removes the directory itself, so clean up at exit.
        atexit.register(self.close)
        print(f"Created temporary directory for image cache: {self.temp_dir}")

    def close(self):
        """Remove the temporary directory and its contents; safe to call repeatedly."""
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def cleanup_previous(self):
        """Delete the previously cached image unless it is an example image."""
        if self.is_example or not self.current_image:
            return
        if not os.path.exists(self.current_image):
            return
        try:
            os.unlink(self.current_image)
            print(f"Cleaned up previous image: {self.current_image}")
        except Exception as e:
            # Best effort: a failed delete must not break the request.
            print(f"Error cleaning up previous image: {e}")

    def cache_image(self, image_path, is_example=False):
        """Cache an image and return the path that should be used for it.

        Args:
            image_path: Path of the image file.
            is_example: Whether the image is an example image.

        Returns:
            ``None`` when ``image_path`` is empty; ``image_path`` itself for an
            example image or when caching fails; otherwise the path of the RGB
            copy inside the temporary directory.
        """
        if not image_path:
            return None

        try:
            if is_example:
                # An example image that is already current needs no work.
                if self.is_example and self.current_image == image_path:
                    return self.current_image
                new_path = image_path
            else:
                # Build a unique file name; it is only needed for uploads.
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                file_hash = hashlib.md5(str(time.time()).encode()).hexdigest()[:8]
                ext = os.path.splitext(image_path)[1] or ".jpg"  # default extension
                new_path = os.path.join(self.temp_dir, f"image_{timestamp}_{file_hash}{ext}")

                # Re-save the uploaded image, converting to RGB when needed.
                with Image.open(image_path) as img:
                    if img.mode != "RGB":
                        img = img.convert("RGB")
                    img.save(new_path)

            # Drop the previous cached file before switching to the new one.
            self.cleanup_previous()

            self.current_image = new_path
            self.is_example = is_example
            return new_path

        except Exception as e:
            # Best effort: fall back to the caller's original path.
            print(f"Error caching image: {e}")
            return image_path


# Global image cache manager.
image_cache = ImageCache()
|
153 |
+
|
154 |
+
|
155 |
+
def load_model():
    """Return the shared ``(model, processor)`` pair, loading it on first use.

    The objects are stored in the module-level ``model`` and ``processor``
    globals; later calls return them without loading again.
    """
    global model, processor

    if model is not None:
        # Already loaded by an earlier call.
        return model, processor

    # Load the model and build the processor.
    model = Qwen2VLForConditionalGeneration.from_pretrained(model_path, dtype=dtype)
    vision_processor = Qwen2VLImageProcessor()
    tokenizer = MIXQwen2Tokenizer.from_pretrained(model_path)
    processor = Qwen2VLProcessor(
        vision_processor,
        tokenizer,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    # Switch to evaluation mode.
    model.eval()
    del tokenizer
    return model, processor
|
173 |
+
|
174 |
+
|
175 |
+
def clear_cache():
    """Release cached GPU memory and run a Python garbage-collection pass.

    The GPU allocator cache is emptied only when Paddle reports a non-zero
    amount of allocated CUDA memory. The Python garbage collector is run on
    every call so that unreachable host-side objects are reclaimed even when
    nothing is currently allocated on the GPU.
    """
    # Local import: the file-level import block is not guaranteed to provide gc.
    import gc

    if paddle.device.cuda.memory_allocated() > 0:
        paddle.device.cuda.empty_cache()
    gc.collect()
|
182 |
+
|
183 |
+
|
184 |
+
def multimodal_understanding(image, question, seed=42, top_p=0.95, temperature=0.1):
    """Stream status messages and the final answer for a question about an image.

    This function is a generator. Every message intended for the caller --
    validation warnings, progress updates, the decoded answer and error
    messages -- is delivered with ``yield``. (A ``return <value>`` inside a
    generator only ends iteration; the value is not seen by a consumer that
    iterates over the results.)

    Args:
        image: Input image; compared against the example entries and handed
            to ``image_cache.cache_image``.
        question: Question text.
        seed: Random seed applied to Paddle and NumPy before generation.
        top_p: Sampling parameter forwarded to ``model.generate``.
        temperature: Temperature parameter forwarded to ``model.generate``.

    Yields:
        str: Processing status messages followed by the result or an error
        message.
    """
    # Input validation: both an image and a non-blank question are required.
    if not image:
        yield "⚠️ 请上传图片后再开始对话。"
        return
    if not question or question.strip() == "":
        yield "⚠️ 请输入您的问题后再开始对话。"
        return

    try:
        start_time = time.time()
        yield "🔄 正在处理您的请求,请稍候..."

        # Timeout check: the elapsed time is the delay between the status
        # message above being produced and the generator being resumed.
        if time.time() - start_time > 200:
            yield "⏳ 系统当前用户繁多,请等待10分钟后再次尝试。感谢您的理解!"
            return

        clear_cache()

        # Seed both random number generators.
        paddle.seed(seed)
        np.random.seed(seed)

        # Route the image through the cache; example images are flagged so the
        # cache can treat them differently from uploads.
        is_example = any(image == example[1] for example in EXAMPLES)
        cached_image = image_cache.cache_image(image, is_example=is_example)
        if not cached_image:
            # Yield (not return) so the failure message reaches the caller.
            yield "图片处理失败,请检查图片格式是否正确。"
            return

        # Build the prompt text.
        prompts = question + "\n请用图片中完整出现的内容回答,可以是单词、短语或句子,针对问题回答尽可能详细和完整,并保持格式、单位、符号和标点都与图片中的文字内容完全一致。"

        # Build the chat message carrying the image and the prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": cached_image,
                    },
                    {"type": "text", "text": prompts},
                ],
            }
        ]

        yield "模型正在分析图片内容..."

        # Extract the vision inputs and assemble the full chat-formatted text.
        image_inputs, video_inputs = process_vision_info(messages)
        image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>"
        text = f"<|im_start|>system\n你是一个非常棒的多模态理解的AI助手。<|im_end|>\n<|im_start|>user\n{image_pad_token}{prompts}<|im_end|>\n<|im_start|>assistant\n"

        # Preprocess without gradient tracking. The no-grad scope is closed
        # before yielding so it is never held open while the generator is
        # suspended.
        with paddle.no_grad():
            inputs = processor(
                text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pd"
            )

        yield "正在生成回答..."

        # Generate and decode the answer.
        with paddle.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=1024,
                top_p=top_p,
                temperature=temperature,
                num_beams=1,
                do_sample=True,
                use_cache=True,
            )

            output_text = processor.batch_decode(
                generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

        # Free the intermediate tensors before handing back the answer.
        del inputs, generated_ids
        clear_cache()

        yield output_text

    except Exception as e:
        # Yield (not return) so the error message reaches the caller.
        yield f"处理过程中出现错误: {str(e)}\n请重试或在评论区留下你的问题。"
|
280 |
+
|
281 |
+
|
282 |
+
def process_example(question, image):
    """Run the understanding pipeline for an example image, streaming results.

    The image is registered with the cache as an example, then every message
    produced by ``multimodal_understanding`` is re-yielded. Using
    ``yield from`` makes this wrapper a generator function itself, so a caller
    that inspects the handler to decide whether it streams treats it the same
    way as ``multimodal_understanding``.

    Args:
        question: Question text.
        image: Example image.

    Yields:
        str: The messages yielded by ``multimodal_understanding``.
    """
    cached_path = image_cache.cache_image(image, is_example=True)
    yield from multimodal_understanding(cached_path, question)
|
286 |
+
|
287 |
+
|
288 |
+
def handle_image_upload(image):
    """Cache an uploaded image and return the path reported by the cache.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        The value returned by ``image_cache.cache_image``; ``None`` when
        ``image`` is ``None`` or when caching raised an exception (the error
        is printed).
    """
    if image is not None:
        try:
            return image_cache.cache_image(image, is_example=False)
        except Exception as e:
            print(f"Error handling image upload: {e}")
    return None
|
298 |
+
|
299 |
+
|
300 |
+
# model, processor = load_model()
|
301 |
+
# # image = "/home/aistudio/work/doc-lark/PaddleMIX/paddlemix/demo_images/examples_image1.jpg"
|
302 |
+
# print(multimodal_understanding(EXAMPLES[1][1],EXAMPLES[1][0]))
|
303 |
+
|
304 |
+
# Gradio界面配置
|
305 |
+
with gr.Blocks() as demo:
|
306 |
+
gr.Markdown(
|
307 |
+
value="""
|
308 |
+
# 🤖 PP-DocBee(2B): Multimodal Document Understanding Demo
|
309 |
+
|
310 |
+
📚 原始模型来自 [PaddleMIX](https://github.com/PaddlePaddle/PaddleMIX) (🌟 一个基于飞桨PaddlePaddle框架构建的多模态大模型套件)
|
311 |
+
"""
|
312 |
+
)
|
313 |
+
with gr.Row():
|
314 |
+
image_input = gr.Image(type="filepath", label="📷 Upload Image or Input URL")
|
315 |
+
with gr.Column():
|
316 |
+
question_input = gr.Textbox(label="💭 Question", placeholder="Enter your question here...")
|
317 |
+
und_seed_input = gr.Number(label="🎲 Seed", precision=0, value=42)
|
318 |
+
top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="📊 Top P")
|
319 |
+
temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="🌡️ Temperature")
|
320 |
+
|
321 |
+
image_input.upload(fn=handle_image_upload, inputs=[image_input], outputs=[image_input])
|
322 |
+
|
323 |
+
understanding_button = gr.Button("💬 Chat", variant="primary")
|
324 |
+
understanding_output = gr.Textbox(label="🤖 Response", interactive=False)
|
325 |
+
|
326 |
+
gr.Examples(
|
327 |
+
examples=EXAMPLES,
|
328 |
+
inputs=[question_input, image_input],
|
329 |
+
outputs=understanding_output,
|
330 |
+
fn=process_example,
|
331 |
+
cache_examples=True,
|
332 |
+
run_on_click=True,
|
333 |
+
)
|
334 |
+
|
335 |
+
# 加载模型
|
336 |
+
clear_cache()
|
337 |
+
model, processor = load_model()
|
338 |
+
clear_cache()
|
339 |
+
|
340 |
+
understanding_button.click(
|
341 |
+
fn=multimodal_understanding,
|
342 |
+
inputs=[image_input, question_input, und_seed_input, top_p, temperature],
|
343 |
+
outputs=understanding_output,
|
344 |
+
api_name="chat",
|
345 |
+
)
|
346 |
+
|
347 |
+
if __name__ == "__main__":
|
348 |
+
# 创建队列
|
349 |
+
demo.queue()
|
350 |
+
demo.launch(server_name=SERVER_NAME, server_port=SERVER_PORR, share=True, ssr_mode=False, max_threads=1) # 限制并发请求数
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/GOT/utils/conversation.py
ADDED
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import dataclasses
|
16 |
+
from enum import Enum, auto
|
17 |
+
from typing import List
|
18 |
+
|
19 |
+
|
20 |
+
class SeparatorStyle(Enum):
    """Separator layouts understood by ``Conversation.get_prompt``."""

    # Explicit values; identical to what ``auto()`` assigns in this order.
    SINGLE = 1
    TWO = 2
    MPT = 3
|
26 |
+
|
27 |
+
|
28 |
+
@dataclasses.dataclass
class Conversation:
    """Keep the history of one conversation and render it in several forms.

    A message is a ``[role, content]`` pair. ``content`` is either a string or
    a ``(text, image, image_process_mode)`` tuple, where ``image`` is used as a
    PIL image (``size``, ``resize``, ``convert`` and ``save`` are called on it).

    Attributes:
        system: Text placed at the start of the rendered prompt.
        roles: Role labels; stored and copied, not interpreted by this class.
        messages: The ``[role, content]`` pairs, oldest first.
        offset: Index of the first message considered by ``get_images``,
            ``to_gradio_chatbot`` and the image check in ``dict``.
        sep_style: Layout used by ``get_prompt``.
        sep: Separator appended after messages (and after the system text).
        sep2: Second separator, alternated with ``sep`` for ``SeparatorStyle.TWO``.
        version: Free-form label; not read by the methods of this class.
        skip_next: Flag stored on the instance; not read by the methods of this class.
    """

    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "<|im_end|>"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    @staticmethod
    def _message_text(message):
        """Return the text of a message that may be a (text, image, mode) tuple."""
        if isinstance(message, tuple):
            return message[0]
        return message

    @staticmethod
    def _resize_for_display(image, max_len=800, min_len=400):
        """Resize ``image`` keeping its aspect ratio, short edge capped at ``min_len``."""
        max_hw, min_hw = max(image.size), min(image.size)
        aspect_ratio = max_hw / min_hw
        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
        longest_edge = int(shortest_edge * aspect_ratio)
        width, height = image.size
        if height > width:
            height, width = longest_edge, shortest_edge
        else:
            height, width = shortest_edge, longest_edge
        return image.resize((width, height))

    @staticmethod
    def _expand2square(pil_img, background_color=(122, 116, 104)):
        """Pad ``pil_img`` to a square canvas filled with ``background_color``."""
        from PIL import Image

        width, height = pil_img.size
        if width == height:
            return pil_img
        side = max(width, height)
        result = Image.new(pil_img.mode, (side, side), background_color)
        # The image is pasted at the top-left corner, not centred.
        result.paste(pil_img)
        return result

    def _has_images(self):
        """Return True when a message that ``get_images`` would process exists."""
        return any(
            i % 2 == 0 and isinstance(msg, tuple)
            for i, (_role, msg) in enumerate(self.messages[self.offset :])
        )

    def get_prompt(self):
        """Render the system text and all messages as a single prompt string.

        An empty (falsy) message contributes only its role prefix, which leaves
        the prompt open for that role to continue.

        Returns:
            str: The rendered prompt.

        Raises:
            ValueError: If ``sep_style`` is not a supported ``SeparatorStyle``.
        """
        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep + "\n"
            for role, message in self.messages:
                if message:
                    ret += role + ": " + self._message_text(message) + self.sep
                else:
                    ret += role + ":"
            return ret

        if self.sep_style == SeparatorStyle.TWO:
            # ``sep`` follows even-indexed messages, ``sep2`` odd-indexed ones.
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ": " + self._message_text(message) + seps[i % 2]
                else:
                    ret += role + ":"
            return ret

        if self.sep_style == SeparatorStyle.MPT:
            # Roles already carry their own delimiters, so no ": " is inserted.
            ret = self.system + self.sep if self.system else ""
            for role, message in self.messages:
                if message:
                    ret += role + self._message_text(message) + self.sep
                else:
                    ret += role
            return ret

        raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role, message):
        """Append a ``[role, message]`` pair to the history."""
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        """Collect the images attached to even-indexed messages after ``offset``.

        Each image is transformed according to its ``image_process_mode``:
        ``"Pad"`` pads to a square, ``"Crop"`` resizes keeping the aspect
        ratio, ``"Resize"`` resizes to 224x224.

        Args:
            return_pil: When true, return the image objects; otherwise return
                base64-encoded JPEG strings.

        Returns:
            list: One entry per image, in message order.

        Raises:
            ValueError: If an image has an unknown ``image_process_mode``.
        """
        import base64
        from io import BytesIO

        images = []
        for i, (_role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 != 0 or not isinstance(msg, tuple):
                continue

            _text, image, image_process_mode = msg
            if image_process_mode == "Pad":
                image = self._expand2square(image)
            elif image_process_mode == "Crop":
                image = self._resize_for_display(image)
            elif image_process_mode == "Resize":
                image = image.resize((224, 224))
            else:
                raise ValueError(f"Invalid image_process_mode: {image_process_mode}")

            if return_pil:
                images.append(image)
            else:
                buffered = BytesIO()
                image.convert("RGB").save(buffered, format="JPEG")
                images.append(base64.b64encode(buffered.getvalue()).decode())
        return images

    def to_gradio_chatbot(self):
        """Convert the history after ``offset`` into ``[user, reply]`` pairs.

        For a message carrying an image, the ``<image>`` placeholder in its
        text is replaced by an inline ``<img>`` tag holding the resized image
        as a base64 JPEG data URI.

        Returns:
            list: ``[user_message, reply]`` lists; ``reply`` is ``None`` until
            the following message fills it in.
        """
        import base64
        from io import BytesIO

        ret = []
        for i, (_role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    msg, image, _image_process_mode = msg
                    image = self._resize_for_display(image)
                    buffered = BytesIO()
                    # JPEG cannot store alpha/palette modes, so convert first.
                    image.convert("RGB").save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    # The payload is JPEG, so the data URI is labelled as such.
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = msg.replace("<image>", img_str)
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a new ``Conversation`` with an independent message list.

        The message pairs are rebuilt as fresh lists, so appending to the copy
        does not affect this instance. ``version`` is carried over.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
        )

    def dict(self):
        """Return the conversation as a plain dictionary.

        When the history contains image messages, each tuple content is
        reduced to its text; otherwise ``messages`` is returned as stored.
        """
        if self._has_images():
            messages = [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages]
        else:
            messages = self.messages
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
|
199 |
+
|
200 |
+
|
201 |
+
conv_v1 = Conversation(
|
202 |
+
system="A chat between a curious human and an artificial intelligence assistant. "
|
203 |
+
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
|
204 |
+
roles=("Human", "Assistant"),
|
205 |
+
messages=(
|
206 |
+
("Human", "Give three tips for staying healthy."),
|
207 |
+
(
|
208 |
+
"Assistant",
|
209 |
+
"Sure, here are three tips for staying healthy:\n"
|
210 |
+
"1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
|
211 |
+
"It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
|
212 |
+
"and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
|
213 |
+
"75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
|
214 |
+
"activities at least two days per week.\n"
|
215 |
+
"2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
|
216 |
+
"vegetables, whole grains, lean proteins, and healthy fats can help support "
|
217 |
+
"your overall health. Try to limit your intake of processed and high-sugar foods, "
|
218 |
+
"and aim to drink plenty of water throughout the day.\n"
|
219 |
+
"3. Get enough sleep: Getting enough quality sleep is essential for your physical "
|
220 |
+
"and mental health. Adults should aim for seven to nine hours of sleep per night. "
|
221 |
+
"Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
|
222 |
+
"help improve the quality of your sleep.",
|
223 |
+
),
|
224 |
+
),
|
225 |
+
offset=2,
|
226 |
+
sep_style=SeparatorStyle.SINGLE,
|
227 |
+
sep="###",
|
228 |
+
)
|
229 |
+
|
230 |
+
conv_v1_2 = Conversation(
|
231 |
+
system="A chat between a curious human and an artificial intelligence assistant. "
|
232 |
+
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
|
233 |
+
roles=("Human", "Assistant"),
|
234 |
+
messages=(
|
235 |
+
("Human", "What are the key differences between renewable and non-renewable energy sources?"),
|
236 |
+
(
|
237 |
+
"Assistant",
|
238 |
+
"Renewable energy sources are those that can be replenished naturally in a relatively "
|
239 |
+
"short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
|
240 |
+
"Non-renewable energy sources, on the other hand, are finite and will eventually be "
|
241 |
+
"depleted, such as coal, oil, and natural gas. Here are some key differences between "
|
242 |
+
"renewable and non-renewable energy sources:\n"
|
243 |
+
"1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
|
244 |
+
"energy sources are finite and will eventually run out.\n"
|
245 |
+
"2. Environmental impact: Renewable energy sources have a much lower environmental impact "
|
246 |
+
"than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
|
247 |
+
"and other negative effects.\n"
|
248 |
+
"3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
|
249 |
+
"have lower operational costs than non-renewable sources.\n"
|
250 |
+
"4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
|
251 |
+
"locations than non-renewable sources.\n"
|
252 |
+
"5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
|
253 |
+
"situations and needs, while non-renewable sources are more rigid and inflexible.\n"
|
254 |
+
"6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
|
255 |
+
"non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
|
256 |
+
),
|
257 |
+
),
|
258 |
+
offset=2,
|
259 |
+
sep_style=SeparatorStyle.SINGLE,
|
260 |
+
sep="###",
|
261 |
+
)
|
262 |
+
|
263 |
+
conv_vicuna_v1_1 = Conversation(
|
264 |
+
system="A chat between a curious user and an artificial intelligence assistant. "
|
265 |
+
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
|
266 |
+
roles=("USER", "ASSISTANT"),
|
267 |
+
version="v1",
|
268 |
+
messages=(),
|
269 |
+
offset=0,
|
270 |
+
sep_style=SeparatorStyle.TWO,
|
271 |
+
sep=" ",
|
272 |
+
sep2="</s>",
|
273 |
+
)
|
274 |
+
|
275 |
+
|
276 |
+
conv_mpt = Conversation(
|
277 |
+
system="""<|im_start|>system
|
278 |
+
You should follow the instructions carefully and explain your answers in detail.""",
|
279 |
+
# system = None,
|
280 |
+
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
|
281 |
+
version="mpt",
|
282 |
+
messages=(),
|
283 |
+
offset=0,
|
284 |
+
sep_style=SeparatorStyle.MPT,
|
285 |
+
sep="<|im_end|>",
|
286 |
+
)
|
287 |
+
|
288 |
+
conv_mpt_eval = Conversation(
|
289 |
+
system="",
|
290 |
+
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
|
291 |
+
version="mpt",
|
292 |
+
messages=(),
|
293 |
+
offset=0,
|
294 |
+
sep_style=SeparatorStyle.MPT,
|
295 |
+
sep="<|im_end|>",
|
296 |
+
)
|
297 |
+
|
298 |
+
conv_mpt_text = Conversation(
|
299 |
+
system="""<|im_start|>system
|
300 |
+
- You are a helpful assistant chatbot trained by MosaicML.
|
301 |
+
- You answer questions.
|
302 |
+
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
|
303 |
+
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
|
304 |
+
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
|
305 |
+
version="mpt",
|
306 |
+
messages=(),
|
307 |
+
offset=0,
|
308 |
+
sep_style=SeparatorStyle.MPT,
|
309 |
+
sep="<|im_end|>",
|
310 |
+
)
|
311 |
+
|
312 |
+
conv_bair_v1 = Conversation(
|
313 |
+
system="BEGINNING OF CONVERSATION:",
|
314 |
+
roles=("USER", "GPT"),
|
315 |
+
messages=(),
|
316 |
+
offset=0,
|
317 |
+
sep_style=SeparatorStyle.TWO,
|
318 |
+
sep=" ",
|
319 |
+
sep2="</s>",
|
320 |
+
)
|
321 |
+
|
322 |
+
|
323 |
+
simple_conv = Conversation(
|
324 |
+
system="",
|
325 |
+
roles=("Human", "Assistant"),
|
326 |
+
messages=(),
|
327 |
+
offset=0,
|
328 |
+
sep_style=SeparatorStyle.SINGLE,
|
329 |
+
sep="###",
|
330 |
+
)
|
331 |
+
|
332 |
+
|
333 |
+
simple_conv_multimodal = Conversation(
|
334 |
+
system="You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
|
335 |
+
"You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
|
336 |
+
"Follow the instructions carefully and explain your answers in detail.",
|
337 |
+
# system="",
|
338 |
+
roles=("Human", "Assistant"),
|
339 |
+
messages=(("Human", "Hi!"), ("Assistant", "Hi there! How can I help you today?\n")),
|
340 |
+
offset=2,
|
341 |
+
sep_style=SeparatorStyle.SINGLE,
|
342 |
+
sep="###",
|
343 |
+
)
|
344 |
+
|
345 |
+
|
346 |
+
simple_conv_mpt_multimodal = Conversation(
|
347 |
+
system="""<|im_start|>system
|
348 |
+
- You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology.
|
349 |
+
- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
|
350 |
+
- You should follow the instructions carefully and explain your answers in detail.""",
|
351 |
+
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
|
352 |
+
version="mpt",
|
353 |
+
messages=(),
|
354 |
+
offset=0,
|
355 |
+
sep_style=SeparatorStyle.MPT,
|
356 |
+
sep="<|im_end|>",
|
357 |
+
)
|
358 |
+
|
359 |
+
|
360 |
+
simple_conv_legacy = Conversation(
|
361 |
+
system="You are GOT, a large language model trained by Foundation Model Group, Megvii Technology."
|
362 |
+
"You are designed to assist human with a variety of tasks using natural language."
|
363 |
+
"Follow the instructions carefully.",
|
364 |
+
roles=("Human", "Assistant"),
|
365 |
+
messages=(("Human", "Hi!\n\n### Response:"), ("Assistant", "Hi there! How can I help you today?\n")),
|
366 |
+
offset=2,
|
367 |
+
sep_style=SeparatorStyle.SINGLE,
|
368 |
+
sep="###",
|
369 |
+
)
|
370 |
+
|
371 |
+
|
372 |
+
conv_llava_v1 = Conversation(
|
373 |
+
system="You are GOT, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
|
374 |
+
"You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
|
375 |
+
"Follow the instructions carefully and explain your answers in detail.",
|
376 |
+
roles=("USER", "ASSISTANT"),
|
377 |
+
version="v1",
|
378 |
+
messages=(),
|
379 |
+
offset=0,
|
380 |
+
sep_style=SeparatorStyle.TWO,
|
381 |
+
sep=" ",
|
382 |
+
sep2="</s>",
|
383 |
+
)
|
384 |
+
|
385 |
+
default_conversation = conv_mpt
|
386 |
+
conv_templates = {
|
387 |
+
"default": simple_conv_multimodal,
|
388 |
+
"simple": simple_conv,
|
389 |
+
"simple_legacy": simple_conv_legacy,
|
390 |
+
"multimodal": simple_conv,
|
391 |
+
"mpt_multimodal": simple_conv_mpt_multimodal,
|
392 |
+
"llava_v1": conv_llava_v1,
|
393 |
+
"mpt_eval": conv_mpt_eval,
|
394 |
+
# fastchat
|
395 |
+
"v1": conv_vicuna_v1_1,
|
396 |
+
"bair_v1": conv_bair_v1,
|
397 |
+
"vicuna_v1_1": conv_vicuna_v1_1,
|
398 |
+
"mpt": conv_mpt,
|
399 |
+
"mpt_text": conv_mpt_text,
|
400 |
+
}
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
""" from https://github.com/keithito/tacotron """
|
16 |
+
|
17 |
+
import re
|
18 |
+
from unidecode import unidecode
|
19 |
+
from phonemizer import phonemize
|
20 |
+
|
21 |
+
__all__ = [
|
22 |
+
"basic_cleaners",
|
23 |
+
"transliteration_cleaners",
|
24 |
+
"english_cleaners",
|
25 |
+
"english_cleaners2"
|
26 |
+
]
|
27 |
+
|
28 |
+
# Regular expression matching whitespace:
|
29 |
+
_whitespace_re = re.compile(r'\s+')
|
30 |
+
|
31 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
32 |
+
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
|
33 |
+
('mrs', 'misess'),
|
34 |
+
('mr', 'mister'),
|
35 |
+
('dr', 'doctor'),
|
36 |
+
('st', 'saint'),
|
37 |
+
('co', 'company'),
|
38 |
+
('jr', 'junior'),
|
39 |
+
('maj', 'major'),
|
40 |
+
('gen', 'general'),
|
41 |
+
('drs', 'doctors'),
|
42 |
+
('rev', 'reverend'),
|
43 |
+
('lt', 'lieutenant'),
|
44 |
+
('hon', 'honorable'),
|
45 |
+
('sgt', 'sergeant'),
|
46 |
+
('capt', 'captain'),
|
47 |
+
('esq', 'esquire'),
|
48 |
+
('ltd', 'limited'),
|
49 |
+
('col', 'colonel'),
|
50 |
+
('ft', 'fort'),
|
51 |
+
]]
|
52 |
+
|
53 |
+
|
54 |
+
def expand_abbreviations(text):
    '''Replace every abbreviation matched by ``_abbreviations`` with its expansion.

    The (pattern, replacement) pairs are applied one after another in list order.
    '''
    expanded = text
    for pattern, full_form in _abbreviations:
        expanded = pattern.sub(full_form, expanded)
    return expanded
|
58 |
+
|
59 |
+
def lowercase(text):
    '''Return ``text`` converted to lower case.'''
    lowered = text.lower()
    return lowered
|
61 |
+
|
62 |
+
|
63 |
+
def collapse_whitespace(text):
    '''Replace every match of ``_whitespace_re`` in ``text`` with a single space.'''
    collapsed = _whitespace_re.sub(' ', text)
    return collapsed
|
65 |
+
|
66 |
+
|
67 |
+
def convert_to_ascii(text):
    '''Return the result of passing ``text`` through ``unidecode``.'''
    ascii_text = unidecode(text)
    return ascii_text
|
69 |
+
|
70 |
+
|
71 |
+
def basic_cleaners(text):
    '''Lowercase ``text`` and collapse its whitespace; no transliteration is applied.'''
    return collapse_whitespace(lowercase(text))
|
76 |
+
|
77 |
+
|
78 |
+
def transliteration_cleaners(text):
    '''Transliterate ``text`` to ASCII, then lowercase it and collapse whitespace.'''
    return collapse_whitespace(lowercase(convert_to_ascii(text)))
|
84 |
+
|
85 |
+
|
86 |
+
def english_cleaners(text):
    '''Clean English text and return its phonemes.

    Steps, in order: ASCII transliteration, lowercasing, abbreviation
    expansion, phonemization (``en-us``, espeak backend, stripped), and
    whitespace collapsing.
    '''
    normalized = expand_abbreviations(lowercase(convert_to_ascii(text)))
    espeak_output = phonemize(normalized, language='en-us', backend='espeak', strip=True)
    return collapse_whitespace(espeak_output)
|
94 |
+
|
95 |
+
|
96 |
+
def english_cleaners2(text):
    '''Clean English text and return its phonemes with punctuation and stress.

    Same steps as ASCII transliteration, lowercasing and abbreviation
    expansion, followed by phonemization (``en-us``, espeak backend, stripped)
    with ``preserve_punctuation`` and ``with_stress`` enabled, then whitespace
    collapsing.
    '''
    normalized = expand_abbreviations(lowercase(convert_to_ascii(text)))
    espeak_output = phonemize(
        normalized,
        language='en-us',
        backend='espeak',
        strip=True,
        preserve_punctuation=True,
        with_stress=True,
    )
    return collapse_whitespace(espeak_output)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
'''
|
16 |
+
Defines the set of symbols used in text input to the model.
|
17 |
+
'''
|
18 |
+
_pad = '_'
|
19 |
+
_punctuation = ';:,.!?¡¿—…"«»“” '
|
20 |
+
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
21 |
+
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
22 |
+
|
23 |
+
|
24 |
+
# Export all symbols:
|
25 |
+
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
|
26 |
+
|
27 |
+
# Special symbol ids
|
28 |
+
SPACE_ID = symbols.index(" ")
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
""" from https://github.com/keithito/tacotron """
|
16 |
+
|
17 |
+
from .cleaners import *
|
18 |
+
from .symbols import symbols
|
19 |
+
|
20 |
+
# Mappings from symbol to numeric ID and vice versa:
|
21 |
+
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
22 |
+
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
23 |
+
|
24 |
+
cleaner = english_cleaners2
|
25 |
+
|
26 |
+
def text_to_sequence(text, cleaner_names):
    '''Cleans a string and converts it to the sequence of IDs of its symbols.

    Args:
      text: string to convert to a sequence
      cleaner_names: forwarded unchanged to `_clean_text`. NOTE: the
        `_clean_text` defined in this module ignores it and always applies
        the module-level `cleaner`, so this argument does not select cleaners.
    Returns:
      List of integers, one per character of the cleaned text
    Raises:
      KeyError: if the cleaned text contains a character that has no entry
        in `_symbol_to_id`
    '''
    normalized = _clean_text(text, cleaner_names)
    return [_symbol_to_id[char] for char in normalized]
|
41 |
+
|
42 |
+
def cleaned_text_to_sequence(cleaned_text):
    '''Converts already-cleaned text to the sequence of IDs of its symbols.

    No cleaner is applied here; each character is looked up directly.

    Args:
      cleaned_text: string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text
    Raises:
      KeyError: if a character has no entry in `_symbol_to_id`
    '''
    return [_symbol_to_id[char] for char in cleaned_text]
|
51 |
+
|
52 |
+
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string.

    Args:
      sequence: iterable of symbol IDs
    Returns:
      The concatenation of the symbols the IDs map to ('' for an empty sequence)
    Raises:
      KeyError: if an ID has no entry in `_id_to_symbol`
    '''
    # join() builds the result in one pass instead of re-copying the string
    # on every `+=`.
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
|
59 |
+
|
60 |
+
def _clean_text(text, cleaner_names):
    '''Runs `text` through the module-level `cleaner` and returns the result.

    `cleaner_names` is accepted for signature compatibility but is not used.
    '''
    return cleaner(text)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/audioldm2/unet/attention.py
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import paddle
|
16 |
+
from paddle import nn
|
17 |
+
from ppdiffusers.models.attention import GEGLU
|
18 |
+
from einops import rearrange, repeat
|
19 |
+
from ..diffusionwrapper import default
|
20 |
+
|
21 |
+
def Normalize(in_channels):
    """Return a GroupNorm layer over `in_channels` channels (32 groups, epsilon 1e-6)."""
    group_norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, epsilon=1e-6)
    return group_norm
|
25 |
+
|
26 |
+
class FeedForward(nn.Layer):
    """Position-wise feed-forward network: input projection, dropout, output projection.

    Args:
        dim: size of the input features.
        dim_out: size of the output features; resolved through `default(dim_out, dim)`.
        mult: the hidden width is `int(dim * mult)`.
        glu: when True the input projection is a `GEGLU` layer, otherwise
            a `Linear` followed by `GELU`.
        dropout: dropout probability applied after the input projection.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
        super().__init__()
        hidden_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            project_in = GEGLU(dim, hidden_dim)
        else:
            project_in = nn.Sequential(nn.Linear(dim, hidden_dim), nn.GELU())

        self.net = nn.Sequential(
            project_in,
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim_out),
        )

    def forward(self, x):
        """Apply the feed-forward stack to `x`."""
        return self.net(x)
|
43 |
+
|
44 |
+
|
45 |
+
class CrossAttention(nn.Layer):
    """Multi-head softmax attention of `x` over `context`.

    Args:
        query_dim: feature size of the query input (and of the output).
        context_dim: feature size of the context; resolved through
            `default(context_dim, query_dim)`.
        heads: number of attention heads.
        dim_head: feature size per head; scores are scaled by `dim_head ** -0.5`.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        context_dim = default(context_dim, query_dim)
        inner_dim = heads * dim_head

        self.heads = heads
        self.scale = dim_head**-0.5

        # Bias-free projections for queries, keys and values.
        self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias_attr=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias_attr=False)

        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))

    def forward(self, x, context=None, mask=None):
        """Attend from `x` to `context` (resolved through `default(context, x)`).

        Positions where `mask` is not equal to 1 have their score replaced by
        the most negative finite value of the score dtype before the softmax.
        """
        num_heads = self.heads

        def split_heads(t):
            # Fold the head axis into the batch axis.
            return rearrange(t, "b n (h d) -> (b h) n d", h=num_heads)

        queries = self.to_q(x)
        context = default(context, x)
        keys = self.to_k(context)
        values = self.to_v(context)
        queries, keys, values = split_heads(queries), split_heads(keys), split_heads(values)

        scores = paddle.einsum("b i d, b j d -> b i j", queries, keys) * self.scale

        if mask is not None:
            flat_mask = rearrange(mask, "b ... -> b (...)")
            fill_value = -paddle.finfo(scores.dtype).max
            head_mask = repeat(flat_mask, "b j -> (b h) () j", h=num_heads)
            filler = paddle.full(scores.shape, fill_value, scores.dtype)
            scores = paddle.where(~(head_mask == 1), filler, scores)

        # attention, what we cannot get enough of
        weights = nn.functional.softmax(scores, axis=-1)
        attended = paddle.einsum("b i j, b j d -> b i d", weights, values)
        merged = rearrange(attended, "(b h) n d -> b n (h d)", h=num_heads)
        return self.to_out(merged)
|
87 |
+
|
88 |
+
|
89 |
+
class LinearAttention(nn.Layer):
    """Linear attention over a 4-D feature map, using 1x1 convolutions for projection.

    Args:
        dim: number of input (and output) channels.
        heads: number of attention heads.
        dim_head: channels per head.
    """

    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.heads = heads
        hidden_dim = heads * dim_head
        # One convolution produces queries, keys and values stacked on the channel axis.
        self.to_qkv = nn.Conv2D(dim, hidden_dim * 3, 1, bias_attr=False)
        self.to_out = nn.Conv2D(hidden_dim, dim, 1)

    def forward(self, x):
        """Apply linear attention to `x`, which must have four dimensions."""
        _, _, height, width = x.shape
        projected = self.to_qkv(x)
        q, k, v = rearrange(
            projected,
            "b (qkv heads c) h w -> qkv b heads c (h w)",
            heads=self.heads,
            qkv=3,
        )
        # Normalise the keys over the flattened spatial axis.
        k = nn.functional.softmax(k, axis=-1)
        context = paddle.einsum("bhdn,bhen->bhde", k, v)
        attended = paddle.einsum("bhde,bhdn->bhen", context, q)
        restored = rearrange(
            attended,
            "b heads c (h w) -> b (heads c) h w",
            heads=self.heads,
            h=height,
            w=width,
        )
        return self.to_out(restored)
|
110 |
+
|
111 |
+
class BasicTransformerBlock(nn.Layer):
    """Pre-norm transformer block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is applied to a LayerNorm of its input and
    added back to that input.

    Args:
        dim: feature size of the block.
        n_heads: number of attention heads.
        d_head: feature size per head.
        dropout: dropout probability passed to the attention and feed-forward layers.
        context_dim: feature size of the cross-attention context.
        gated_ff: passed to `FeedForward` as `glu`.
        checkpoint: stored on the instance; not read anywhere in this class.
    """

    def __init__(
        self,
        dim,
        n_heads,
        d_head,
        dropout=0.0,
        context_dim=None,
        gated_ff=True,
        checkpoint=True,
    ):
        super().__init__()
        attention_kwargs = dict(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)
        # attn1 gets no context_dim: it is a self-attention.
        self.attn1 = CrossAttention(**attention_kwargs)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        # attn2 is self-attention as well if context is none.
        self.attn2 = CrossAttention(context_dim=context_dim, **attention_kwargs)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None, mask=None):
        """Run the three residual sub-layers; `context` and `mask` only reach `attn2`."""
        self_attended = self.attn1(self.norm1(x)) + x
        cross_attended = self.attn2(self.norm2(self_attended), context=context, mask=mask) + self_attended
        return self.ff(self.norm3(cross_attended)) + cross_attended
|
144 |
+
|
145 |
+
class SpatialTransformer(nn.Layer):
    """
    Transformer block for image-like data.

    The input is normalised and projected with a 1x1 convolution, flattened to
    (batch, height * width, channels), passed through `depth` transformer
    blocks, reshaped back to an image, projected with a second 1x1 convolution
    whose weight is initialised to zero, and added to the original input.
    """

    def __init__(
        self,
        in_channels,
        n_heads,
        d_head,
        depth=1,
        dropout=0.0,
        context_dim=None,
    ):
        super().__init__()

        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

        self.proj_in = nn.Conv2D(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        blocks = []
        for _ in range(depth):
            blocks.append(
                BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
            )
        self.transformer_blocks = nn.LayerList(blocks)

        # Constant-zero initial weight for the output projection.
        zero_weight = paddle.ParamAttr(initializer=nn.initializer.Constant(value=0.0))
        self.proj_out = nn.Conv2D(
            inner_dim,
            in_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=zero_weight,
        )

    def forward(self, x, context=None, mask=None):
        """Apply the transformer stack to the 4-D input `x` and add the residual."""
        # note: if no context is given, cross-attention defaults to self-attention
        _, _, height, width = x.shape
        residual = x
        hidden = self.proj_in(self.norm(x))
        hidden = rearrange(hidden, "b c h w -> b (h w) c")
        for block in self.transformer_blocks:
            hidden = block(hidden, context=context, mask=mask)
        hidden = rearrange(hidden, "b (h w) c -> b c h w", h=height, w=width)
        return self.proj_out(hidden) + residual
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_augmentation.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from utils.hparams import hparams
|
16 |
+
|
17 |
+
|
18 |
+
class BaseAugmentation:
    """
    Base class for data augmentation.
    All methods of this class should be thread-safe.

    Subclasses implement *process_item*, which applies augmentation to one
    piece of data.
    """

    def __init__(self, data_dirs: list, augmentation_args: dict):
        self.raw_data_dirs = data_dirs
        self.augmentation_args = augmentation_args
        # Ratio of the configured hop size to the configured audio sample rate.
        hop_size = hparams["hop_size"]
        sample_rate = hparams["audio_sample_rate"]
        self.timestep = hop_size / sample_rate

    def process_item(self, item: dict, **kwargs) -> dict:
        """Augment one item; must be overridden by subclasses."""
        raise NotImplementedError()
|
33 |
+
|
34 |
+
|
35 |
+
def require_same_keys(func):
    """Decorator checking that an augmentation method preserves the item's keys.

    The wrapped callable is expected to be a method whose second positional
    argument (after ``self``) is the item dict; the item may also be passed
    as the keyword argument ``item``.

    Raises:
        AssertionError: if the returned dict's key set differs from the
            input item's key set.
    """
    # Imported locally so this block does not depend on the file's import section.
    import functools

    @functools.wraps(func)  # keep the wrapped method's __name__ / __doc__
    def run(*args, **kwargs):
        item: dict = args[1] if len(args) > 1 else kwargs["item"]
        res: dict = func(*args, **kwargs)
        # Raised explicitly rather than via `assert` so the check is not
        # stripped when Python runs with -O.
        if set(item.keys()) != set(res.keys()):
            raise AssertionError(
                "Item keys mismatch after augmentation.\n"
                f"Before: {sorted(item.keys())}\n"
                f"After: {sorted(res.keys())}"
            )
        return res

    return run
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_binarizer.py
ADDED
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import json
|
16 |
+
import pathlib
|
17 |
+
import pickle
|
18 |
+
import random
|
19 |
+
import shutil
|
20 |
+
import warnings
|
21 |
+
from copy import deepcopy
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
import paddle
|
25 |
+
from tqdm import tqdm
|
26 |
+
from utils.hparams import hparams
|
27 |
+
from utils.indexed_datasets import IndexedDatasetBuilder
|
28 |
+
from utils.multiprocess_utils import chunked_multiprocess_run
|
29 |
+
from utils.phoneme_utils import build_phoneme_list, locate_dictionary
|
30 |
+
from utils.plot import distribution_to_figure
|
31 |
+
from utils.text_encoder import TokenTextEncoder
|
32 |
+
|
33 |
+
|
34 |
+
class BinarizationError(Exception):
    """Error raised when the data being binarized is invalid."""
|
36 |
+
|
37 |
+
|
38 |
+
class BaseBinarizer:
    """
    Base class for data processing.

    Provided by this class:
        1. *process* and *process_dataset*:
            process the entire data and write the train/valid splits
            (the train split supports parallel processing);
        2. *split_train_valid_set*:
            split item names according to ``hparams["test_prefixes"]``;
        3. *check_coverage*:
            compare the phonemes found in the data with the phoneme list;
        4. *build_spk_map*:
            map speaker names to speaker ids.

    Subclasses should define:
        1. *load_meta_data*:
            how to read one raw dataset into ``self.items``;
        2. *process_item*:
            how to process a single piece of data;
        3. *arrange_data_augmentation*:
            which augmentation tasks to run for each item.
    """

    def __init__(self, data_dir=None, data_attrs=None):
        if data_dir is None:
            data_dir = hparams["raw_data_dir"]
        if not isinstance(data_dir, list):
            data_dir = [data_dir]
        self.raw_data_dirs = [pathlib.Path(d) for d in data_dir]
        self.binary_data_dir = pathlib.Path(hparams["binary_data_dir"])
        self.data_attrs = [] if data_attrs is None else data_attrs
        self.binarization_args = hparams["binarization_args"]
        self.augmentation_args = hparams.get("augmentation_args", {})
        self.device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
        self.spk_map = None
        self.spk_ids = hparams["spk_ids"]
        self.speakers = hparams["speakers"]
        self.build_spk_map()
        self.items = {}
        self.item_names: list = None
        self._train_item_names: list = None
        self._valid_item_names: list = None
        self.phone_encoder = TokenTextEncoder(vocab_list=build_phoneme_list())
        self.timestep = hparams["hop_size"] / hparams["audio_sample_rate"]

    def build_spk_map(self):
        """Validate the speaker configuration and fill ``self.spk_map`` (name -> id).

        When ``self.spk_ids`` is empty, ids 0..N-1 are assigned in dataset order.

        Raises:
            AssertionError: if the speaker list, dataset list and id list are
                inconsistent, or an id is not smaller than ``hparams["num_spk"]``.
            ValueError: if one speaker name is assigned two different ids.
        """
        assert isinstance(self.speakers, list), "Speakers must be a list"
        assert len(self.speakers) == len(
            self.raw_data_dirs
        ), "Number of raw data dirs must equal number of speaker names!"
        if len(self.spk_ids) == 0:
            self.spk_ids = list(range(len(self.raw_data_dirs)))
        else:
            assert len(self.spk_ids) == len(
                self.raw_data_dirs
            ), "Length of explicitly given spk_ids must equal the number of raw datasets."
        assert (
            max(self.spk_ids) < hparams["num_spk"]
        ), f"Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk."
        self.spk_map = {}
        for spk_name, spk_id in zip(self.speakers, self.spk_ids):
            if spk_name in self.spk_map and self.spk_map[spk_name] != spk_id:
                raise ValueError(
                    f"Invalid speaker ID assignment. Name '{spk_name}' is assigned with different speaker IDs: {self.spk_map[spk_name]} and {spk_id}."
                )
            self.spk_map[spk_name] = spk_id
        print("| spk_map: ", self.spk_map)

    def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id):
        """Load one raw dataset; must be overridden by subclasses."""
        raise NotImplementedError()

    def split_train_valid_set(self, item_names):
        """
        Split the dataset into training set and validation set.

        Every entry of ``hparams["test_prefixes"]`` is matched by the first
        of these rules that selects at least one item name:
            1. it equals a full item name;
            2. it equals the part of an item name after the last ``:``;
            3. an item name starts with it;
            4. the part of an item name after the last ``:`` starts with it.
        Entries that match nothing are reported with a ``UserWarning``.

        :return: train_item_names, valid_item_names
        """
        remaining = {str(pr): 1 for pr in hparams["test_prefixes"]}
        # A dict is used as an insertion-ordered set of validation names.
        valid_item_names = {}

        # Rule 1: the prefix is itself a full item name.
        for prefix in list(remaining):
            if prefix in item_names:
                valid_item_names[prefix] = 1
                remaining.pop(prefix)

        # Rules 2-4, tried in order; a prefix consumed by one rule is not
        # considered by the later ones.
        rules = (
            lambda name, prefix: name.split(":")[-1] == prefix,
            lambda name, prefix: name.startswith(prefix),
            lambda name, prefix: name.split(":")[-1].startswith(prefix),
        )
        for rule in rules:
            for prefix in list(remaining):
                matched_names = [name for name in item_names if rule(name, prefix)]
                if matched_names:
                    for name in matched_names:
                        valid_item_names[name] = 1
                    remaining.pop(prefix)

        if len(remaining) != 0:
            warnings.warn(
                f"The following rules in test_prefixes have no matching names in the dataset: {', '.join(remaining.keys())}",
                category=UserWarning,
            )
            warnings.filterwarnings("default")
        valid_item_names = list(valid_item_names.keys())
        assert len(valid_item_names) > 0, "Validation set is empty!"
        # Build the lookup set once instead of once per item name.
        valid_lookup = set(valid_item_names)
        train_item_names = [x for x in item_names if x not in valid_lookup]
        assert len(train_item_names) > 0, "Training set is empty!"
        return train_item_names, valid_item_names

    @property
    def train_item_names(self):
        return self._train_item_names

    @property
    def valid_item_names(self):
        return self._valid_item_names

    def meta_data_iterator(self, prefix):
        """Yield ``(item_name, meta_data)`` for the train split if ``prefix == "train"``, else for the valid split."""
        if prefix == "train":
            item_names = self.train_item_names
        else:
            item_names = self.valid_item_names
        for item_name in item_names:
            yield item_name, self.items[item_name]

    def process(self):
        """Load all datasets, split them, write the side files and binarize both splits."""
        for ds_id, (spk_id, data_dir) in enumerate(zip(self.spk_ids, self.raw_data_dirs)):
            self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk_id=spk_id)
        self.item_names = sorted(self.items.keys())
        self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names)
        # NOTE(review): the split above is computed before this shuffle, so the
        # shuffle only reorders self.item_names - confirm this is intended.
        if self.binarization_args["shuffle"]:
            random.shuffle(self.item_names)
        self.binary_data_dir.mkdir(parents=True, exist_ok=True)

        # Store the speaker map and the dictionary next to the binarized data.
        spk_map_fn = self.binary_data_dir / "spk_map.json"
        with open(spk_map_fn, "w", encoding="utf-8") as f:
            json.dump(self.spk_map, f)
        shutil.copy(locate_dictionary(), self.binary_data_dir / "dictionary.txt")
        self.check_coverage()

        try:
            self.process_dataset("valid")
            self.process_dataset(
                "train",
                num_workers=int(self.binarization_args["num_workers"]),
                apply_augmentation=any(args["enabled"] for args in self.augmentation_args.values()),
            )
        except KeyboardInterrupt:
            # Same effect as exit(-1), without relying on the `exit` helper
            # that is only injected by the `site` module.
            raise SystemExit(-1)

    def check_coverage(self):
        """Print and plot the phoneme distribution and verify it against the phoneme list.

        Raises:
            BinarizationError: if an item has an empty ``ph_seq``, or if the
                set of phonemes in the data differs from the phoneme list.
        """
        ph_required = set(build_phoneme_list())
        phoneme_map = dict.fromkeys(ph_required, 0)
        ph_occurred = []
        for item_name, meta_data in self.items.items():
            ph_seq = meta_data["ph_seq"]
            # Check this item's own sequence; testing the accumulated list
            # would miss an empty item that follows a non-empty one.
            if len(ph_seq) == 0:
                raise BinarizationError(f"Empty tokens in {item_name}.")
            ph_occurred += ph_seq
        for ph in ph_occurred:
            if ph not in ph_required:
                continue
            phoneme_map[ph] += 1
        ph_occurred = set(ph_occurred)

        print("===== Phoneme Distribution Summary =====")
        sorted_phonemes = sorted(phoneme_map.keys())
        for i, key in enumerate(sorted_phonemes):
            if i == len(ph_required) - 1:
                end = "\n"
            elif i % 10 == 9:
                end = ",\n"
            else:
                end = ", "
            print(f"'{key}': {phoneme_map[key]}", end=end)

        values = [phoneme_map[k] for k in sorted_phonemes]
        plt = distribution_to_figure(
            title="Phoneme Distribution Summary",
            x_label="Phoneme",
            y_label="Number of occurrences",
            items=sorted_phonemes,
            values=values,
        )
        filename = self.binary_data_dir / "phoneme_distribution.jpg"
        plt.savefig(fname=filename, bbox_inches="tight", pad_inches=0.25)
        print(f"| save summary to '{filename}'")

        if ph_occurred != ph_required:
            unrecognizable_phones = ph_occurred.difference(ph_required)
            missing_phones = ph_required.difference(ph_occurred)
            raise BinarizationError(
                "transcriptions and dictionary mismatch.\n"
                f" (+) {sorted(unrecognizable_phones)}\n"
                f" (-) {sorted(missing_phones)}"
            )

    def process_dataset(self, prefix, num_workers=0, apply_augmentation=False):
        """Binarize one split and write ``<prefix>.meta`` next to the indexed data.

        :param prefix: ``"train"`` selects the train split, anything else the valid split
        :param num_workers: number of worker processes; 0 processes items in this process
        :param apply_augmentation: whether to run the tasks from *arrange_data_augmentation*
        """
        builder = IndexedDatasetBuilder(self.binary_data_dir, prefix=prefix, allowed_attr=self.data_attrs)
        total_sec = {k: 0.0 for k in self.spk_map}
        total_raw_sec = {k: 0.0 for k in self.spk_map}
        extra_info = {"names": {}, "spk_ids": {}, "spk_names": {}, "lengths": {}}
        max_no = -1
        args = [
            [item_name, meta_data, self.binarization_args]
            for item_name, meta_data in self.meta_data_iterator(prefix)
        ]
        aug_map = self.arrange_data_augmentation(self.meta_data_iterator(prefix)) if apply_augmentation else {}

        def record(entry, entry_no):
            # Remember the first-axis length of every array attribute plus
            # the per-item metadata, keyed by the item number.
            for key, value in entry.items():
                if isinstance(value, np.ndarray):
                    extra_info.setdefault(key, {})[entry_no] = tuple(value.shape)[0]
            extra_info["names"][entry_no] = entry["name"].split(":", 1)[-1]
            extra_info["spk_ids"][entry_no] = entry["spk_id"]
            extra_info["spk_names"][entry_no] = entry["spk_name"]
            extra_info["lengths"][entry_no] = entry["length"]

        def postprocess(_item):
            nonlocal max_no
            if _item is None:
                return
            item_no = builder.add_item(_item)
            max_no = max(max_no, item_no)
            record(_item, item_no)
            total_raw_sec[_item["spk_name"]] += _item["seconds"]
            total_sec[_item["spk_name"]] += _item["seconds"]
            # Augmented copies only count towards the post-augmentation total.
            for task in aug_map.get(_item["name"], []):
                aug_item = task["func"](_item, **task["kwargs"])
                aug_item_no = builder.add_item(aug_item)
                max_no = max(max_no, aug_item_no)
                record(aug_item, aug_item_no)
                total_sec[aug_item["spk_name"]] += aug_item["seconds"]

        try:
            if num_workers > 0:
                for item in tqdm(
                    chunked_multiprocess_run(self.process_item, args, num_workers=num_workers),
                    total=len(args),
                ):
                    postprocess(item)
            else:
                for a in tqdm(args):
                    postprocess(self.process_item(*a))
            for k in extra_info:
                assert set(extra_info[k]) == set(range(max_no + 1)), "Item numbering is not consecutive."
                # Turn {item_no: value} into a list ordered by item number.
                extra_info[k] = [extra_info[k][no] for no in sorted(extra_info[k])]
        except KeyboardInterrupt:
            builder.finalize()
            raise

        builder.finalize()
        if prefix == "train":
            extra_info.pop("names")
            extra_info.pop("spk_names")
        with open(self.binary_data_dir / f"{prefix}.meta", "wb") as f:
            pickle.dump(extra_info, f)

        if apply_augmentation:
            print(f"| {prefix} total duration (before augmentation): {sum(total_raw_sec.values()):.2f}s")
            print(
                f"| {prefix} respective duration (before augmentation): "
                + ", ".join(f"{k}={v:.2f}s" for k, v in total_raw_sec.items())
            )
            print(
                f"| {prefix} total duration (after augmentation): {sum(total_sec.values()):.2f}s ({sum(total_sec.values()) / sum(total_raw_sec.values()):.2f}x)"
            )
            print(
                f"| {prefix} respective duration (after augmentation): "
                + ", ".join(f"{k}={v:.2f}s" for k, v in total_sec.items())
            )
        else:
            print(f"| {prefix} total duration: {sum(total_raw_sec.values()):.2f}s")
            print(f"| {prefix} respective duration: " + ", ".join(f"{k}={v:.2f}s" for k, v in total_raw_sec.items()))

    def arrange_data_augmentation(self, data_iterator):
        """
        Code for all types of data augmentation should be added here.

        *process_dataset* expects a mapping from item name to a list of tasks,
        each a dict with a ``"func"`` callable and a ``"kwargs"`` dict.
        """
        raise NotImplementedError()

    def process_item(self, item_name, meta_data, binarization_args):
        """Process a single item; must be overridden by subclasses."""
        raise NotImplementedError()
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_exporter.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import json
|
16 |
+
from pathlib import Path
|
17 |
+
from typing import Union
|
18 |
+
|
19 |
+
import paddle
|
20 |
+
from utils.hparams import hparams
|
21 |
+
|
22 |
+
|
23 |
+
class BaseExporter:
    """Base class for model exporters.

    Subclasses implement *build_model*, *export_model*, *export_attachments*
    and *export*.
    """

    def __init__(
        # A tuple is not a valid member of typing.Union and raises TypeError
        # when the signature is evaluated, so the place types are listed directly.
        self, device: Union[str, paddle.CPUPlace, paddle.CUDAPlace] = None, cache_dir: Path = None, **kwargs
    ):
        """
        :param device: target device; defaults to "gpu" when at least one CUDA
            device is visible, otherwise "cpu"
        :param cache_dir: cache directory; defaults to ``deployment/cache``
            two levels above this file. The directory is created if missing.
        """
        if device is None:
            device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
        self.device = device
        if cache_dir is not None:
            resolved_cache_dir = cache_dir.resolve()
        else:
            resolved_cache_dir = Path(__file__).parent.parent / "deployment" / "cache"
        self.cache_dir: Path = resolved_cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def build_spk_map(self) -> dict:
        """Load ``spk_map.json`` from ``hparams["work_dir"]`` when ``hparams["use_spk_id"]`` is set.

        :return: the speaker map, or an empty dict when speaker ids are not used
        :raises AssertionError: if the map is not a non-empty dict or contains duplicate ids
        """
        if not hparams["use_spk_id"]:
            return {}
        with open(Path(hparams["work_dir"]) / "spk_map.json", "r", encoding="utf8") as f:
            spk_map = json.load(f)
        assert isinstance(spk_map, dict) and len(spk_map) > 0, "Invalid or empty speaker map!"
        assert len(spk_map) == len(set(spk_map.values())), "Duplicate speaker id in speaker map!"
        return spk_map

    def build_model(self) -> paddle.nn.Layer:
        """
        Creates an instance of paddle.nn.Layer and load its state dict on the target device.
        """
        raise NotImplementedError()

    def export_model(self, path: Path):
        """
        Exports the model to ONNX format.
        :param path: the target model path
        """
        raise NotImplementedError()

    def export_attachments(self, path: Path):
        """
        Exports related files and configs (e.g. the dictionary) to the target directory.
        :param path: the target directory
        """
        raise NotImplementedError()

    def export(self, path: Path):
        """
        Exports all the artifacts to the target directory.
        :param path: the target directory
        """
        raise NotImplementedError()
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_svs_infer.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from typing import Dict, Tuple
|
16 |
+
|
17 |
+
import numpy as np
|
18 |
+
import paddle
|
19 |
+
|
20 |
+
from paddlemix.models.diffsinger.utils import hparams
|
21 |
+
from paddlemix.models.diffsinger.utils.infer_utils import resample_align_curve
|
22 |
+
|
23 |
+
|
24 |
+
class BaseSVSInfer:
    """
    Base class for SVS inference models.
    Subclasses should define:
    1. *build_model*:
        how to build the model;
    2. *preprocess_input*:
        how to preprocess user input;
    3. *forward_model*:
        how to run the model on one preprocessed sample;
    4. *run_inference*:
        how to infer from raw inputs to the final outputs.
    """

    def __init__(self, device=None):
        """
        :param device: device string; when None, "gpu" is chosen if at least
            one CUDA device is reported by Paddle, otherwise "cpu"
        """
        if device is None:
            device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
        self.device = device
        # Duration of one frame in seconds (hop size over sample rate).
        # NOTE(review): `hparams` is imported from the `utils` package in this file,
        # while sibling modules import it from `utils.hparams` - confirm it resolves
        # to the hyper-parameter dict and not to the submodule.
        self.timestep = hparams["hop_size"] / hparams["audio_sample_rate"]
        # Speaker name -> speaker id; expected to be filled by subclasses.
        self.spk_map = {}
        self.model: paddle.nn.Layer = None

    def build_model(self, ckpt_steps=None) -> paddle.nn.Layer:
        """Build and return the model. Abstract hook: always raises here."""
        raise NotImplementedError()

    def load_speaker_mix(
        self, param_src: dict, summary_dst: dict, mix_mode: str = "frame", mix_length: int = None
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """
        Parse the speaker mix of one input segment and normalize the proportions.

        The mix is read from ``param_src["spk_mix"]`` (frame mode) or
        ``param_src["ph_spk_mix"]`` (token mode). When it is absent, the first
        speaker of ``self.spk_map`` is used with proportion 1.0. A mix is
        "dynamic" when it has several speakers and at least one proportion is
        a string (a whitespace-separated curve); otherwise it is "static".
        A human-readable description is written into ``summary_dst``.

        :param param_src: param dict
        :param summary_dst: summary dict (mutated in place)
        :param mix_mode: 'token' or 'frame'
        :param mix_length: total tokens or frames to mix
        :return: spk_mix_id [B=1, 1, N], spk_mix_value [B=1, T, N]
            (T is 1 for a static mix)
        :raises AssertionError: on an unknown mode or speaker, negative
            proportions, proportions summing to zero, or no available speaker
        """
        assert mix_mode == "token" or mix_mode == "frame"
        param_key = "spk_mix" if mix_mode == "frame" else "ph_spk_mix"
        summary_solo_key = "spk" if mix_mode == "frame" else "ph_spk"
        spk_mix_map = param_src.get(param_key)
        dynamic = False
        if spk_mix_map is None:
            # Without this guard an empty speaker map would surface later as an
            # unrelated TypeError on len(None).
            assert self.spk_map, "Speaker mix checks failed.\nNo speaker is available to fall back to."
            spk_mix_map = {next(iter(self.spk_map)): 1.0}
        else:
            for name in spk_mix_map:
                assert name in self.spk_map, f"Speaker '{name}' not found."
        if len(spk_mix_map) == 1:
            summary_dst[summary_solo_key] = list(spk_mix_map.keys())[0]
        elif any(isinstance(val, str) for val in spk_mix_map.values()):
            print_mix = "|".join(spk_mix_map.keys())
            summary_dst[param_key] = f"dynamic({print_mix})"
            dynamic = True
        else:
            print_mix = "|".join([f"{n}:{'%.3f' % spk_mix_map[n]}" for n in spk_mix_map])
            summary_dst[param_key] = f"static({print_mix})"
        spk_mix_id_list = []
        spk_mix_value_list = []
        if dynamic:
            for name, values in spk_mix_map.items():
                spk_mix_id_list.append(self.spk_map[name])
                if isinstance(values, str):
                    if mix_mode == "token":
                        # One proportion per token, taken verbatim.
                        cur_spk_mix_value = values.split()
                        assert (
                            len(cur_spk_mix_value) == mix_length
                        ), "Speaker mix checks failed. In dynamic token-level mix, number of proportion values must equal number of tokens."
                        cur_spk_mix_value = paddle.to_tensor(data=np.array(cur_spk_mix_value, "float32")).to(
                            self.device
                        )[None]
                    else:
                        # Frame mode: resample the curve from its own timestep
                        # to the model's frame timestep and align its length.
                        cur_spk_mix_value = paddle.to_tensor(
                            data=resample_align_curve(
                                np.array(values.split(), "float32"),
                                original_timestep=float(param_src["spk_mix_timestep"]),
                                target_timestep=self.timestep,
                                align_length=mix_length,
                            )
                        ).to(self.device)[None]
                    assert paddle.all(x=cur_spk_mix_value >= 0.0), (
                        "Speaker mix checks failed.\n"
                        f"Proportions of speaker '{name}' on some {mix_mode}s are negative."
                    )
                else:
                    assert values >= 0.0, (
                        "Speaker mix checks failed.\n" f"Proportion of speaker '{name}' is negative."
                    )
                    # Constant proportion broadcast over the whole length; placed on
                    # self.device like the curve tensors so that stacking never mixes devices.
                    cur_spk_mix_value = paddle.full(shape=(1, mix_length), fill_value=values, dtype="float32").to(
                        self.device
                    )
                spk_mix_value_list.append(cur_spk_mix_value)
            spk_mix_id = paddle.to_tensor(data=spk_mix_id_list, dtype="int64").to(self.device)[None, None]
            spk_mix_value = paddle.stack(x=spk_mix_value_list, axis=2)
            spk_mix_value_sum = paddle.sum(x=spk_mix_value, axis=2, keepdim=True)
            assert paddle.all(x=spk_mix_value_sum > 0.0), (
                "Speaker mix checks failed.\n" "Proportions of speaker mix on some frames sum to zero."
            )
            # Normalize per position so the proportions of all speakers sum to 1.
            spk_mix_value /= spk_mix_value_sum
        else:
            for name, value in spk_mix_map.items():
                spk_mix_id_list.append(self.spk_map[name])
                assert value >= 0.0, (
                    "Speaker mix checks failed.\n" f"Proportion of speaker '{name}' is negative."
                )
                spk_mix_value_list.append(value)
            spk_mix_id = paddle.to_tensor(data=spk_mix_id_list, dtype="int64").to(self.device)[None, None]
            spk_mix_value = paddle.to_tensor(data=spk_mix_value_list, dtype="float32").to(self.device)[None, None]
            spk_mix_value_sum = spk_mix_value.sum()
            assert spk_mix_value_sum > 0.0, "Speaker mix checks failed.\n" "Proportions of speaker mix sum to zero."
            spk_mix_value /= spk_mix_value_sum
        return spk_mix_id, spk_mix_value

    def preprocess_input(self, param: dict, idx=0) -> Dict[str, paddle.Tensor]:
        """Turn one raw input dict into model inputs. Abstract hook: always raises here."""
        raise NotImplementedError()

    def forward_model(self, sample: Dict[str, paddle.Tensor]):
        """Run the model on one preprocessed sample. Abstract hook: always raises here."""
        raise NotImplementedError()

    def run_inference(self, params, **kwargs):
        """Run inference from raw params to final outputs. Abstract hook: always raises here."""
        raise NotImplementedError()
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/basics/base_vocoder.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
|
16 |
+
class BaseVocoder:
    """Interface of a vocoder; every method is an abstract hook that raises."""

    def to_device(self, device):
        """
        Move the vocoder to the given device.

        :param device: target device (the original note says "torch.device or str";
            in this Paddle port presumably a Paddle place or device string - TODO confirm)
        :raises NotImplementedError: always
        """
        raise NotImplementedError

    def get_device(self):
        """
        Report the device the vocoder lives on.

        :return: device (same representation as accepted by to_device - TODO confirm)
        :raises NotImplementedError: always
        """
        raise NotImplementedError

    def spec2wav(self, mel, **kwargs):
        """
        Convert a mel-spectrogram into a waveform.

        :param mel: [T, 80]
        :return: wav: [T']
        :raises NotImplementedError: always
        """
        raise NotImplementedError
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/aux_decoder/convnext.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
from typing import Optional
|
17 |
+
|
18 |
+
import paddle
|
19 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
20 |
+
|
21 |
+
|
22 |
+
class ConvNeXtBlock(paddle.nn.Layer):
    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.

    Args:
        dim (int): Number of input channels.
        intermediate_dim (int): Dimensionality of the intermediate layer.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None (or a
            non-positive value) means no scaling. Defaults to None.
        drop_out (float): Dropout probability applied to the block output before the residual
            sum; values <= 0 disable dropout. Defaults to 0.0.
    """

    def __init__(
        self, dim: int, intermediate_dim: int, layer_scale_init_value: Optional[float] = None, drop_out: float = 0.0
    ):
        super().__init__()
        # Depthwise convolution (groups == channels); padding keeps the length unchanged.
        self.dwconv = paddle.nn.Conv1D(in_channels=dim, out_channels=dim, kernel_size=7, padding=3, groups=dim)
        self.norm = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-06)
        # Pointwise "convolutions" implemented as Linear layers on the channel-last layout.
        self.pwconv1 = paddle.nn.Linear(in_features=dim, out_features=intermediate_dim)
        self.act = paddle.nn.GELU()
        self.pwconv2 = paddle.nn.Linear(in_features=intermediate_dim, out_features=dim)
        # The documented default is None, so it must be checked before the numeric
        # comparison (None > 0 raises TypeError).
        if layer_scale_init_value is not None and layer_scale_init_value > 0:
            self.gamma = paddle.base.framework.EagerParamBase.from_tensor(
                tensor=layer_scale_init_value * paddle.ones(shape=[dim]), trainable=True
            )
        else:
            self.gamma = None
        self.drop_path = paddle.nn.Identity()
        self.dropout = paddle.nn.Dropout(p=drop_out) if drop_out > 0.0 else paddle.nn.Identity()

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        """Apply the block to ``x`` and add the residual.

        Axes 1 and 2 are swapped around the norm / pointwise layers so that they
        act on the channel axis (assumes x is [B, C, T] - TODO confirm).
        """
        residual = x
        x = self.dwconv(x)
        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
        x = self.dropout(x)
        x = residual + self.drop_path(x)
        return x
|
65 |
+
|
66 |
+
|
67 |
+
class ConvNeXtDecoder(paddle.nn.Layer):
    """Decoder built from an input conv, a stack of ConvNeXtBlock layers and an output conv."""

    def __init__(self, in_dims, out_dims, /, *, num_channels=512, num_layers=6, kernel_size=7, dropout_rate=0.1):
        """
        :param in_dims: number of input channels of the first convolution
        :param out_dims: number of output channels of the last convolution
        :param num_channels: hidden channel count shared by all blocks
        :param num_layers: number of ConvNeXtBlock layers
        :param kernel_size: kernel size of the input and output convolutions
        :param dropout_rate: dropout probability passed to every block
        """
        super().__init__()
        # Padding used by both outer convolutions.
        edge_padding = (kernel_size - 1) // 2
        self.inconv = paddle.nn.Conv1D(
            in_channels=in_dims,
            out_channels=num_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=edge_padding,
        )
        blocks = [
            ConvNeXtBlock(
                dim=num_channels,
                intermediate_dim=num_channels * 4,
                layer_scale_init_value=1e-06,
                drop_out=dropout_rate,
            )
            for _ in range(num_layers)
        ]
        self.conv = paddle.nn.LayerList(sublayers=blocks)
        self.outconv = paddle.nn.Conv1D(
            in_channels=num_channels,
            out_channels=out_dims,
            kernel_size=kernel_size,
            stride=1,
            padding=edge_padding,
        )

    def forward(self, x, infer=False):
        """
        Run the decoder. Axes 1 and 2 of ``x`` are swapped on entry and swapped
        back on exit; ``infer`` is accepted but not used here.
        """

        def swap_1_2(t):
            return t.transpose(perm=paddle_aux.transpose_aux_func(t.ndim, 1, 2))

        hidden = self.inconv(swap_1_2(x))
        for block in self.conv:
            hidden = block(hidden)
        return swap_1_2(self.outconv(hidden))
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/__init__.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import paddle
|
16 |
+
from paddlemix.models.diffsinger.modules.backbones.lynxnet import LYNXNet
|
17 |
+
from paddlemix.models.diffsinger.modules.backbones.wavenet import WaveNet
|
18 |
+
from paddlemix.models.diffsinger.utils import filter_kwargs
|
19 |
+
|
20 |
+
# Registry mapping a backbone type name to the class that implements it.
BACKBONES = {"wavenet": WaveNet, "lynxnet": LYNXNet}
|
21 |
+
|
22 |
+
|
23 |
+
def build_backbone(out_dims: int, num_feats: int, backbone_type: str, backbone_args: dict) -> paddle.nn.Layer:
    """
    Instantiate the backbone registered under ``backbone_type``.

    ``backbone_args`` is passed through ``filter_kwargs`` together with the
    backbone class before being forwarded as keyword arguments.

    :param out_dims: first positional argument of the backbone constructor
    :param num_feats: second positional argument of the backbone constructor
    :param backbone_type: key into ``BACKBONES``
    :param backbone_args: candidate keyword arguments for the backbone
    :return: the constructed backbone layer
    :raises KeyError: if ``backbone_type`` is not registered
    """
    backbone_cls = BACKBONES[backbone_type]
    accepted_args = filter_kwargs(backbone_args, backbone_cls)
    return backbone_cls(out_dims, num_feats, **accepted_args)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/lynxnet.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
19 |
+
|
20 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPosEmb
|
21 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
22 |
+
|
23 |
+
|
24 |
+
class SwiGLU(paddle.nn.Layer):
    """Gated activation: splits the input in two along ``dim`` and gates one part with SiLU of the other."""

    def __init__(self, dim=-1):
        """:param dim: axis along which the input is split"""
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """Return ``first_part * silu(second_part)`` of ``x`` split along ``self.dim``."""
        axis = self.dim
        # Half of the axis size is handed to paddle_aux.split
        # (presumably as the size of each chunk - verify against paddle_aux).
        half = x.shape[axis] // 2
        value, gate = paddle_aux.split(x=x, num_or_sections=half, axis=axis)
        return value * paddle.nn.functional.silu(x=gate)
|
32 |
+
|
33 |
+
|
34 |
+
class Transpose(paddle.nn.Layer):
    """Layer wrapper that swaps two axes of its input, for use inside Sequential."""

    def __init__(self, dims):
        """
        :param dims: the pair of axes to swap
        :raises AssertionError: if ``dims`` does not contain exactly two entries
        """
        super().__init__()
        assert len(dims) == 2, "dims must be a tuple of two dimensions"
        self.dims = dims

    def forward(self, x):
        """Return ``x`` with the two configured axes exchanged."""
        # The full permutation is built by the paddle_aux helper from the
        # tensor rank and the two axes.
        axis_a, axis_b = self.dims
        permutation = paddle_aux.transpose_aux_func(x.ndim, axis_a, axis_b)
        return x.transpose(perm=permutation)
|
44 |
+
|
45 |
+
|
46 |
+
class LYNXConvModule(paddle.nn.Layer):
    """Gated depthwise-separable convolution module used by the LYNXNet residual layers."""

    @staticmethod
    def calc_same_padding(kernel_size):
        """Return the (left, right) padding pair computed from ``kernel_size``."""
        left = kernel_size // 2
        right = left - (kernel_size + 1) % 2
        return left, right

    def __init__(self, dim, expansion_factor, kernel_size=31, activation="PReLU", dropout=0.0):
        """
        :param dim: channel count of the module input and output
        :param expansion_factor: multiplier giving the inner channel count
        :param kernel_size: kernel size of the depthwise convolution
        :param activation: one of "SiLU", "ReLU", "PReLU"; None selects "PReLU"
        :param dropout: dropout probability; values <= 0 disable dropout
        :raises ValueError: if ``activation`` is not a supported name
        """
        super().__init__()
        inner_dim = dim * expansion_factor
        if activation is None:
            activation = "PReLU"
        # Factories rather than instances, so only the selected activation is built.
        activation_factories = {
            "SiLU": paddle.nn.Silu,
            "ReLU": paddle.nn.ReLU,
            "PReLU": lambda: paddle.nn.PReLU(num_parameters=inner_dim),
        }
        if activation not in activation_factories:
            raise ValueError(f"{activation} is not a valid activation")
        act_layer = activation_factories[activation]()
        pad_left, _ = self.calc_same_padding(kernel_size)
        drop_layer = paddle.nn.Dropout(p=dropout) if float(dropout) > 0.0 else paddle.nn.Identity()
        self.net = paddle.nn.Sequential(
            paddle.nn.LayerNorm(normalized_shape=dim),
            Transpose((1, 2)),
            # Pointwise conv doubles the inner width; SwiGLU halves it again.
            paddle.nn.Conv1D(in_channels=dim, out_channels=inner_dim * 2, kernel_size=1),
            SwiGLU(dim=1),
            # Depthwise conv; only the first element of the padding pair is used.
            paddle.nn.Conv1D(
                in_channels=inner_dim,
                out_channels=inner_dim,
                kernel_size=kernel_size,
                padding=pad_left,
                groups=inner_dim,
            ),
            act_layer,
            paddle.nn.Conv1D(in_channels=inner_dim, out_channels=dim, kernel_size=1),
            Transpose((1, 2)),
            drop_layer,
        )

    def forward(self, x):
        """Apply the sequential module to ``x``."""
        return self.net(x)
|
89 |
+
|
90 |
+
|
91 |
+
class LYNXNetResidualLayer(paddle.nn.Layer):
    """One LYNXNet layer: inject step and condition, run the conv module, add the residual."""

    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation="PReLU", dropout=0.0):
        """
        :param dim_cond: channel count of the conditioner input
        :param dim: channel count of the layer input and output
        :param expansion_factor: forwarded to LYNXConvModule
        :param kernel_size: forwarded to LYNXConvModule
        :param activation: forwarded to LYNXConvModule
        :param dropout: forwarded to LYNXConvModule
        """
        super().__init__()
        self.diffusion_projection = paddle.nn.Conv1D(in_channels=dim, out_channels=dim, kernel_size=1)
        self.conditioner_projection = paddle.nn.Conv1D(in_channels=dim_cond, out_channels=dim, kernel_size=1)
        self.convmodule = LYNXConvModule(
            dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout
        )

    def forward(self, x, conditioner, diffusion_step):
        """
        Add the projected diffusion step and conditioner to ``x``, pass the sum
        through the conv module and add the original ``x`` back. The conv module
        and the residual sum operate with axes 1 and 2 swapped; the result is
        swapped back before returning.
        """

        def swap_1_2(t):
            return t.transpose(perm=paddle_aux.transpose_aux_func(t.ndim, 1, 2))

        shortcut = swap_1_2(x)
        mixed = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
        out = self.convmodule(swap_1_2(mixed))
        out = out + shortcut
        return swap_1_2(out)
|
108 |
+
|
109 |
+
|
110 |
+
class LYNXNet(paddle.nn.Layer):
    """Denoiser backbone built from a stack of LYNXNetResidualLayer blocks."""

    def __init__(
        self,
        in_dims,
        n_feats,
        *,
        num_layers=6,
        num_channels=512,
        expansion_factor=2,
        kernel_size=31,
        activation="PReLU",
        dropout=0.0
    ):
        """
        LYNXNet(Linear Gated Depthwise Separable Convolution Network)
        TIPS:You can control the style of the generated results by modifying the 'activation',
            - 'PReLU'(default) : Similar to WaveNet
            - 'SiLU' : Voice will be more pronounced, not recommended for use under DDPM
            - 'ReLU' : Contrary to 'SiLU', Voice will be weakened

        :param in_dims: size M of each feature (see forward)
        :param n_feats: number F of features stacked in the input
        :param num_layers: number of residual layers
        :param num_channels: hidden channel count
        :param expansion_factor: forwarded to every residual layer
        :param kernel_size: forwarded to every residual layer
        :param activation: forwarded to every residual layer
        :param dropout: forwarded to every residual layer
        """
        super().__init__()
        self.in_dims = in_dims
        self.n_feats = n_feats
        self.input_projection = paddle.nn.Conv1D(
            in_channels=in_dims * n_feats, out_channels=num_channels, kernel_size=1
        )
        self.diffusion_embedding = paddle.nn.Sequential(
            SinusoidalPosEmb(num_channels),
            paddle.nn.Linear(in_features=num_channels, out_features=num_channels * 4),
            paddle.nn.GELU(),
            paddle.nn.Linear(in_features=num_channels * 4, out_features=num_channels),
        )
        self.residual_layers = paddle.nn.LayerList(
            sublayers=[
                LYNXNetResidualLayer(
                    dim_cond=hparams["hidden_size"],
                    dim=num_channels,
                    expansion_factor=expansion_factor,
                    kernel_size=kernel_size,
                    activation=activation,
                    dropout=dropout,
                )
                for _ in range(num_layers)
            ]
        )
        self.norm = paddle.nn.LayerNorm(normalized_shape=num_channels)
        self.output_projection = paddle.nn.Conv1D(
            in_channels=num_channels, out_channels=in_dims * n_feats, kernel_size=1
        )
        # The output projection weight starts at zero.
        init_Constant = paddle.nn.initializer.Constant(value=0.0)
        init_Constant(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, F, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, H, T]
        :return: tensor laid out like ``spec`` ([B, F, M, T])
        """
        # Merge the feature axis into the channel axis.
        if self.n_feats == 1:
            x = spec[:, 0]
        else:
            x = spec.flatten(start_axis=1, stop_axis=2)
        x = self.input_projection(x)
        x = paddle.nn.functional.gelu(x=x)
        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(axis=-1)
        for layer in self.residual_layers:
            x = layer(x, cond, diffusion_step)
        # LayerNorm acts on the last axis, so swap channels last, normalize once,
        # then swap back (the norm keeps the rank, so the same permutation applies).
        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
        x = self.norm(x)
        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, 2))
        x = self.output_projection(x)
        # Split the channel axis back into (F, M).
        if self.n_feats == 1:
            x = x[:, None, :, :]
        else:
            x = x.reshape([-1, self.n_feats, self.in_dims, tuple(x.shape)[2]])
        return x
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/backbones/wavenet.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import math
|
16 |
+
import sys
|
17 |
+
from math import sqrt
|
18 |
+
|
19 |
+
import paddle
|
20 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
21 |
+
|
22 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import SinusoidalPosEmb
|
23 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
24 |
+
|
25 |
+
|
26 |
+
class Conv1d(paddle.nn.Conv1D):
    """Conv1D whose weight is re-initialized with Kaiming-normal (leaky_relu nonlinearity)."""

    def __init__(self, *args, **kwargs):
        """Accepts exactly the arguments of ``paddle.nn.Conv1D``."""
        super().__init__(*args, **kwargs)
        # Overwrite the default weight initialization right after construction.
        kaiming_init = paddle.nn.initializer.KaimingNormal(nonlinearity="leaky_relu")
        kaiming_init(self.weight)
|
31 |
+
|
32 |
+
|
33 |
+
class ResidualBlock(paddle.nn.Layer):
    """Gated dilated-convolution block producing a residual output and a skip output."""

    def __init__(self, encoder_hidden, residual_channels, dilation):
        """
        :param encoder_hidden: channel count of the conditioner input
        :param residual_channels: channel count of the block input and of both outputs
        :param dilation: dilation (and padding) of the 3-tap convolution
        """
        super().__init__()
        self.residual_channels = residual_channels
        doubled = 2 * residual_channels
        self.dilated_conv = paddle.nn.Conv1D(
            in_channels=residual_channels,
            out_channels=doubled,
            kernel_size=3,
            padding=dilation,
            dilation=dilation,
        )
        self.diffusion_projection = paddle.nn.Linear(in_features=residual_channels, out_features=residual_channels)
        self.conditioner_projection = paddle.nn.Conv1D(
            in_channels=encoder_hidden, out_channels=doubled, kernel_size=1
        )
        self.output_projection = paddle.nn.Conv1D(
            in_channels=residual_channels, out_channels=doubled, kernel_size=1
        )

    def forward(self, x, conditioner, diffusion_step):
        """
        :return: ``((x + residual) / sqrt(2), skip)`` where residual and skip are
            the two halves (along axis 1) of the projected gated activation
        """
        halves = [self.residual_channels, self.residual_channels]
        step_bias = self.diffusion_projection(diffusion_step).unsqueeze(axis=-1)
        cond_bias = self.conditioner_projection(conditioner)
        hidden = self.dilated_conv(x + step_bias) + cond_bias
        # Gated activation unit: sigmoid(gate) * tanh(filter).
        gate_part, filter_part = paddle_aux.split(x=hidden, num_or_sections=halves, axis=1)
        gated = paddle.nn.functional.sigmoid(x=gate_part) * paddle.nn.functional.tanh(x=filter_part)
        projected = self.output_projection(gated)
        residual, skip = paddle_aux.split(x=projected, num_or_sections=halves, axis=1)
        return (x + residual) / math.sqrt(2.0), skip
|
64 |
+
|
65 |
+
|
66 |
+
class WaveNet(paddle.nn.Layer):
    """Denoiser backbone built from a stack of dilated ResidualBlock layers with summed skips."""

    def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
        """
        :param in_dims: size M of each feature (see forward)
        :param n_feats: number F of features stacked in the input
        :param num_layers: number of residual blocks
        :param num_channels: hidden channel count
        :param dilation_cycle_length: block ``i`` uses dilation ``2 ** (i % dilation_cycle_length)``
        """
        super().__init__()
        self.in_dims = in_dims
        self.n_feats = n_feats
        io_channels = in_dims * n_feats
        self.input_projection = Conv1d(io_channels, num_channels, 1)
        self.diffusion_embedding = SinusoidalPosEmb(num_channels)
        self.mlp = paddle.nn.Sequential(
            paddle.nn.Linear(in_features=num_channels, out_features=num_channels * 4),
            paddle.nn.Mish(),
            paddle.nn.Linear(in_features=num_channels * 4, out_features=num_channels),
        )
        blocks = []
        for layer_idx in range(num_layers):
            blocks.append(
                ResidualBlock(
                    encoder_hidden=hparams["hidden_size"],
                    residual_channels=num_channels,
                    dilation=2 ** (layer_idx % dilation_cycle_length),
                )
            )
        self.residual_layers = paddle.nn.LayerList(sublayers=blocks)
        self.skip_projection = Conv1d(num_channels, num_channels, 1)
        self.output_projection = Conv1d(num_channels, io_channels, 1)
        # The output projection weight is reset to zero after construction.
        zero_init = paddle.nn.initializer.Constant(value=0.0)
        zero_init(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, F, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, H, T]
        :return: tensor laid out like ``spec`` ([B, F, M, T])
        """
        single_feat = self.n_feats == 1
        # Merge the feature axis into the channel axis.
        x = spec.squeeze(axis=1) if single_feat else spec.flatten(start_axis=1, stop_axis=2)
        x = paddle.nn.functional.relu(x=self.input_projection(x))
        diffusion_step = self.mlp(self.diffusion_embedding(diffusion_step))
        skips = []
        for block in self.residual_layers:
            x, block_skip = block(x, cond, diffusion_step)
            skips.append(block_skip)
        # Only the skip outputs feed the head; scaled by sqrt(number of blocks).
        x = paddle.sum(x=paddle.stack(x=skips), axis=0) / sqrt(len(self.residual_layers))
        x = paddle.nn.functional.relu(x=self.skip_projection(x))
        x = self.output_projection(x)
        # Split the channel axis back into (F, M).
        if single_feat:
            return x[:, None, :, :]
        return x.reshape(-1, self.n_feats, self.in_dims, tuple(x.shape)[2])
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/common_layers.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import math
|
18 |
+
import sys
|
19 |
+
|
20 |
+
import paddle
|
21 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
22 |
+
from paddle.nn import GELU, LayerNorm
|
23 |
+
from paddle.nn import MultiHeadAttention as MultiheadAttention
|
24 |
+
from paddle.nn import ReLU
|
25 |
+
from paddle.nn import Silu as SiLU
|
26 |
+
|
27 |
+
import paddlemix.models.diffsinger.utils as utils
|
28 |
+
|
29 |
+
|
30 |
+
class NormalInitEmbedding(paddle.nn.Embedding):
    """Embedding whose weight is drawn from N(0, embedding_dim ** -0.5) with a zeroed padding row."""

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: (int | None) = None, *args, **kwargs):
        """
        :param num_embeddings: number of rows of the embedding table
        :param embedding_dim: size of each embedding vector
        :param padding_idx: row to zero after initialization; None skips the zeroing
        :param args: extra positional arguments forwarded to ``paddle.nn.Embedding``
        :param kwargs: extra keyword arguments forwarded to ``paddle.nn.Embedding``
        """
        super().__init__(num_embeddings, embedding_dim, *args, padding_idx=padding_idx, **kwargs)
        init_Normal = paddle.nn.initializer.Normal(mean=0, std=self._embedding_dim**-0.5)
        init_Normal(self.weight)
        if padding_idx is not None:
            # Write the zeros into the parameter itself. Running an initializer on
            # `self.weight[padding_idx]` only reaches the parameter when indexing
            # returns a view, which is not guaranteed.
            with paddle.no_grad():
                self.weight[padding_idx] = 0.0
|
38 |
+
|
39 |
+
|
40 |
+
class XavierUniformInitLinear(paddle.nn.Linear):
    """Linear layer with Xavier-uniform weight and, when present, zero bias."""

    def __init__(self, in_features: int, out_features: int, *args, bias: bool = True, **kwargs):
        """
        :param in_features: input feature size
        :param out_features: output feature size
        :param args: extra positional arguments forwarded to ``paddle.nn.Linear``
        :param bias: forwarded as ``bias_attr``; when truthy the bias is zero-initialized
        :param kwargs: extra keyword arguments forwarded to ``paddle.nn.Linear``
        """
        super().__init__(in_features, out_features, *args, bias_attr=bias, **kwargs)
        xavier_init = paddle.nn.initializer.XavierUniform()
        xavier_init(self.weight)
        if not bias:
            return
        zero_init = paddle.nn.initializer.Constant(value=0.0)
        zero_init(self.bias)
|
48 |
+
|
49 |
+
|
50 |
+
class SinusoidalPositionalEmbedding(paddle.nn.Layer):
    """Produces sinusoidal positional embeddings of any length.

    Padding symbols are ignored. The lookup table is rebuilt whenever a longer
    sequence than the current table is requested.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024):
        """
        :param embedding_dim: size of each positional embedding
        :param padding_idx: index of the padding symbol; its table row is zeroed
        :param init_size: number of rows of the initial table
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.weights = SinusoidalPositionalEmbedding.get_embedding(init_size, embedding_dim, padding_idx)
        # Tiny buffer used in forward() as the reference passed to `weights.to(...)`.
        self.register_buffer(name="_float_tensor", tensor=paddle.empty(shape=[1], dtype="float32"))

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
        """Build the sinusoidal embedding table.

        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".

        :return: table with ``num_embeddings`` rows: sines, then cosines, plus one
            zero column when ``embedding_dim`` is odd
        """
        half_dim = embedding_dim // 2
        log_step = math.log(10000) / (half_dim - 1)
        inv_freq = paddle.exp(x=paddle.arange(dtype="float32", end=half_dim) * -log_step)
        positions = paddle.arange(dtype="float32", end=num_embeddings).unsqueeze(axis=1)
        angles = positions * inv_freq.unsqueeze(axis=0)
        table = paddle.concat(x=[paddle.sin(x=angles), paddle.cos(x=angles)], axis=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # Odd sizes get one extra zero column.
            table = paddle.concat(x=[table, paddle.zeros(shape=[num_embeddings, 1])], axis=1)
        if padding_idx is not None:
            table[padding_idx, :] = 0
        return table

    def forward(self, x, incremental_state=None, timestep=None, positions=None):
        """Input is expected to be of size [bsz x seqlen].

        With ``incremental_state`` set, a single position per batch entry is
        returned; otherwise one embedding per input position (detached).
        """
        bsz, seq_len = tuple(x.shape)[:2]
        required_rows = self.padding_idx + 1 + seq_len
        if self.weights is None or required_rows > self.weights.shape[0]:
            # Grow the table to cover the requested length.
            self.weights = SinusoidalPositionalEmbedding.get_embedding(
                required_rows, self.embedding_dim, self.padding_idx
            )
        self.weights = self.weights.to(self._float_tensor)
        if incremental_state is not None:
            # Single-step decoding: pick the row of the current step.
            if timestep is not None:
                pos = timestep.view(-1)[0] + 1
            else:
                pos = seq_len
            return self.weights[self.padding_idx + pos, :].expand(shape=[bsz, 1, -1])
        if positions is None:
            positions = utils.make_positions(x, self.padding_idx)
        gathered = self.weights.index_select(axis=0, index=positions.view(-1))
        return gathered.view(bsz, seq_len, -1).detach()

    @staticmethod
    def max_positions():
        """Maximum number of supported positions."""
        return int(100000.0)
|
98 |
+
|
99 |
+
|
100 |
+
class TransformerFFNLayer(paddle.nn.Layer):
    """Feed-forward block: 1-D convolution, activation, dropout, then a linear projection.

    Args:
        hidden_size: Channel count of the input and of the final projection output.
        filter_size: Channel count produced by the convolution.
        kernel_size: Convolution kernel width; padding is ``kernel_size // 2``.
        dropout: Dropout probability applied after the activation.
        act: ``"relu"``, ``"gelu"`` or ``"swish"``. Any other value leaves
            ``act_fn`` unset, so ``forward`` would fail on attribute lookup.
    """

    def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0.0, act="gelu"):
        super().__init__()
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.act = act
        self.ffn_1 = paddle.nn.Conv1D(
            in_channels=hidden_size,
            out_channels=filter_size,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        activation_classes = {
            "relu": paddle.nn.ReLU,
            "gelu": paddle.nn.GELU,
            "swish": paddle.nn.Silu,
        }
        activation_cls = activation_classes.get(self.act)
        if activation_cls is not None:
            self.act_fn = activation_cls()
        self.ffn_2 = XavierUniformInitLinear(filter_size, hidden_size)

    def forward(self, x):
        """Apply the feed-forward block to a 3-D input.

        The input is permuted with ``[1, 2, 0]`` for the convolution and permuted
        back with ``[2, 0, 1]`` afterwards, so the output keeps the input's axis
        order (presumably time-major ``(T, B, C)`` -- confirm against callers).
        """
        conv_input = x.transpose(perm=[1, 2, 0])
        hidden = self.ffn_1(conv_input).transpose(perm=[2, 0, 1])
        # Scale by kernel_size ** -0.5 before the non-linearity.
        hidden = self.act_fn(hidden * self.kernel_size**-0.5)
        hidden = paddle.nn.functional.dropout(x=hidden, p=self.dropout, training=self.training)
        return self.ffn_2(hidden)
|
124 |
+
|
125 |
+
|
126 |
+
class EncSALayer(paddle.nn.Layer):
    """Pre-norm encoder layer: self-attention followed by a convolutional feed-forward block.

    Each sub-block is wrapped as ``LayerNorm -> sub-block -> dropout -> residual add``
    and its output is multiplied by the inverted padding mask.

    Args:
        c: Model width (normalised shape, attention size, FFN hidden size).
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to both sub-block outputs.
        attention_dropout: Dropout passed to the attention module.
        relu_dropout: Dropout used inside the feed-forward block.
        kernel_size: Kernel width of the feed-forward convolution.
        act: Activation name forwarded to ``TransformerFFNLayer``.
    """

    def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act="gelu"):
        super().__init__()
        self.dropout = dropout
        self.layer_norm1 = paddle.nn.LayerNorm(normalized_shape=c)
        self.self_attn = MultiheadAttention(c, num_heads, dropout=attention_dropout, bias_attr=False)
        self.layer_norm2 = paddle.nn.LayerNorm(normalized_shape=c)
        self.ffn = TransformerFFNLayer(c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act)

    def forward(self, x, encoder_padding_mask=None, **kwargs):
        """Run the layer.

        Args:
            x: Input features.
            encoder_padding_mask: Mask that is reduced with ``paddle.any(..., -1)``
                for the attention call and inverted to zero out masked positions.
                NOTE(review): despite the ``None`` default, the body always
                dereferences it -- callers must pass a tensor.
            **kwargs: ``layer_norm_training`` (optional) overrides the ``training``
                flag of both layer norms.

        Returns:
            The transformed, mask-multiplied features.
        """
        layer_norm_training = kwargs.get("layer_norm_training", None)
        if layer_norm_training is not None:
            for norm in (self.layer_norm1, self.layer_norm2):
                norm.training = layer_norm_training

        # --- self-attention sub-block ---
        normed = self.layer_norm1(x)
        attended = self.self_attn(
            query=normed,
            key=normed,
            value=normed,
            attn_mask=paddle.any(encoder_padding_mask, -1),  # key_padding_mask=encoder_padding_mask
        )
        attended = paddle.nn.functional.dropout(x=attended, p=self.dropout, training=self.training)
        x = x + attended

        # 1.0 where a position is kept, 0.0 where it is padding; the first two axes
        # are swapped and a trailing axis is added so it broadcasts over features.
        keep = 1 - encoder_padding_mask.astype(dtype="float32")
        keep = keep.transpose(perm=paddle_aux.transpose_aux_func(keep.ndim, 0, 1))[..., None]
        x = x * keep

        # --- feed-forward sub-block ---
        transformed = self.ffn(self.layer_norm2(x))
        transformed = paddle.nn.functional.dropout(x=transformed, p=self.dropout, training=self.training)
        x = (x + transformed) * keep
        return x
|
173 |
+
|
174 |
+
|
175 |
+
class SinusoidalPosEmb(paddle.nn.Layer):
    """Map a 1-D tensor of scalars to sinusoidal features.

    Args:
        dim: Target feature size; ``dim // 2`` sine and ``dim // 2`` cosine
            channels are produced.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """Embed ``x``.

        Args:
            x: Tensor indexed as ``x[:, None]``, i.e. one scalar per row.

        Returns:
            paddle.Tensor: ``[len(x), 2 * (dim // 2)]`` with the sine half first
            and the cosine half second.
        """
        half_dim = self.dim // 2
        # Clamp the divisor: ``half_dim - 1`` is zero for dim 2 or 3 and previously
        # raised ZeroDivisionError. With one frequency the result is exp(0) == 1
        # whatever the divisor, so other sizes are unaffected.
        scale = math.log(10000) / max(half_dim - 1, 1)
        freqs = paddle.exp(x=paddle.arange(end=half_dim) * -scale)
        emb = x[:, None] * freqs[None, :]
        return paddle.concat(x=(emb.sin(), emb.cos()), axis=-1)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/commons/espnet_positional_embedding.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import math
|
16 |
+
import sys
|
17 |
+
|
18 |
+
import paddle
|
19 |
+
|
20 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
21 |
+
|
22 |
+
|
23 |
+
class PositionalEncoding(paddle.nn.Layer):
    """Absolute sinusoidal positional encoding.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length pre-computed at construction.
        reverse (bool): Whether to generate positions in descending order.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
        """Construct a PositionalEncoding object and pre-compute ``max_len`` positions."""
        super().__init__()
        self.d_model = d_model
        self.reverse = reverse
        self.xscale = math.sqrt(self.d_model)
        self.dropout = paddle.nn.Dropout(p=dropout_rate)
        self.pe = None
        # A dummy [1, max_len] tensor is enough: only its length, dtype and place are read.
        self.extend_pe(paddle.to_tensor(data=0.0).expand(shape=[1, max_len]))

    def extend_pe(self, x):
        """Ensure ``self.pe`` covers ``x.shape[1]`` positions and matches ``x``'s dtype/place."""
        cached = self.pe
        if cached is not None and cached.shape[1] >= x.shape[1]:
            # Table is long enough; at most a dtype/place conversion is needed.
            if cached.dtype != x.dtype or cached.place != x.place:
                self.pe = cached.to(dtype=x.dtype, device=x.place)
            return

        length = x.shape[1]
        if self.reverse:
            position = paddle.arange(start=length - 1, end=-1, step=-1.0, dtype="float32")
        else:
            position = paddle.arange(start=0, end=length, dtype="float32")
        position = position.unsqueeze(axis=1)

        exponent = paddle.arange(start=0, end=self.d_model, step=2, dtype="float32")
        div_term = paddle.exp(x=exponent * -(math.log(10000.0) / self.d_model))

        # Stacking on a new last axis interleaves sine and cosine channels.
        sin_part = paddle.sin(x=position * div_term)
        cos_part = paddle.cos(x=position * div_term)
        table = paddle.stack(x=[sin_part, cos_part], axis=2).view(-1, self.d_model).unsqueeze(axis=0)
        self.pe = table.to(device=x.place, dtype=x.dtype)

    def forward(self, x: paddle.Tensor):
        """Add positional encoding.

        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).

        Returns:
            paddle.Tensor: ``dropout(x * sqrt(d_model) + pe)`` with shape (batch, time, `*`).
        """
        self.extend_pe(x)
        encoded = x * self.xscale + self.pe[:, : x.shape[1]]
        return self.dropout(encoded)
|
73 |
+
|
74 |
+
|
75 |
+
class ScaledPositionalEncoding(PositionalEncoding):
    """Positional encoding scaled by a learnable scalar ``alpha``.

    See Sec. 3.2 https://arxiv.org/abs/1809.08895

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Initialize the base encoding and create ``alpha`` with value 1.0."""
        super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
        initial_alpha = paddle.to_tensor(data=1.0)
        self.alpha = paddle.base.framework.EagerParamBase.from_tensor(tensor=initial_alpha)

    def reset_parameters(self):
        """Reset ``alpha`` to 1.0."""
        self.alpha.data = paddle.to_tensor(data=1.0)

    def forward(self, x):
        """Add the scaled positional encoding.

        Unlike the base class, the input is not multiplied by ``sqrt(d_model)``.

        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).

        Returns:
            paddle.Tensor: ``dropout(x + alpha * pe)`` with shape (batch, time, `*`).
        """
        self.extend_pe(x)
        scaled_pe = self.alpha * self.pe[:, : x.shape[1]]
        return self.dropout(x + scaled_pe)
|
103 |
+
|
104 |
+
|
105 |
+
class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module (positions generated in reverse order).

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Initialize the base class with ``reverse=True``."""
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self, x):
        """Compute positional encoding.

        Dropout is applied separately to the scaled input and to the positional
        table, and the two results are summed into a single tensor.

        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).

        Returns:
            paddle.Tensor: ``dropout(x * sqrt(d_model)) + dropout(pe)``,
            shape (batch, time, `*`).
        """
        self.extend_pe(x)
        scaled = x * self.xscale
        pos_emb = self.pe[:, : scaled.shape[1]]
        encoded = self.dropout(scaled)
        return encoded + self.dropout(pos_emb)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/compat.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
|
16 |
+
def get_backbone_type(root_config: dict, nested_config: dict = None):
    """Resolve the backbone type name from a (possibly nested) configuration.

    Lookup order, first key present wins:
      1. ``nested_config["backbone_type"]``
      2. ``root_config["backbone_type"]``
      3. ``root_config["diff_decoder_type"]``
      4. the literal ``"wavenet"``

    Args:
        root_config: Top-level configuration mapping.
        nested_config: Optional sub-configuration; defaults to ``root_config``.

    Returns:
        The stored value of the first key found, or ``"wavenet"``.
    """
    scope = root_config if nested_config is None else nested_config
    legacy_choice = root_config.get("diff_decoder_type", "wavenet")
    inherited_choice = root_config.get("backbone_type", legacy_choice)
    return scope.get("backbone_type", inherited_choice)
|
22 |
+
|
23 |
+
|
24 |
+
def get_backbone_args(config: dict, backbone_type: str):
    """Return the backbone constructor arguments described by ``config``.

    Args:
        config: Configuration mapping.
        backbone_type: Backbone type name.

    Returns:
        ``config["backbone_args"]`` when that entry is present and not ``None``.
        Otherwise, for ``"wavenet"``, a dict assembled from the flat keys
        ``residual_layers``, ``residual_channels`` and ``dilation_cycle_length``
        (missing keys map to ``None``). For any other type, ``None``.
    """
    explicit_args = config.get("backbone_args")
    if explicit_args is not None:
        return explicit_args
    if backbone_type != "wavenet":
        return None
    # Argument name -> flat configuration key it is read from.
    flat_keys = {
        "num_layers": "residual_layers",
        "num_channels": "residual_channels",
        "dilation_cycle_length": "dilation_cycle_length",
    }
    return {arg_name: config.get(config_key) for arg_name, config_key in flat_keys.items()}
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from .ddpm import GaussianDiffusion, MultiVarianceDiffusion, PitchDiffusion
|
16 |
+
from .reflow import MultiVarianceRectifiedFlow, PitchRectifiedFlow, RectifiedFlow
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/ddpm.py
ADDED
@@ -0,0 +1,521 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import sys, os
|
18 |
+
from collections import deque
|
19 |
+
from functools import partial
|
20 |
+
from typing import List, Tuple
|
21 |
+
|
22 |
+
import numpy as np
|
23 |
+
|
24 |
+
import paddle
|
25 |
+
from tqdm import tqdm
|
26 |
+
|
27 |
+
from paddlemix.models.diffsinger.modules.backbones import build_backbone
|
28 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
29 |
+
|
30 |
+
|
31 |
+
def extract(a, t, x_shape):
    """Gather per-sample coefficients from ``a`` and shape them for broadcasting.

    Args:
        a: Tensor of coefficients, indexed along its last axis.
        t: Index tensor; its first dimension is taken as the batch size.
        x_shape: Shape of the tensor the result will be combined with; only
            its length is used.

    Returns:
        paddle.Tensor: The gathered values reshaped to ``[batch, 1, ..., 1]``
        with ``len(x_shape)`` dimensions.
    """
    batch = tuple(t.shape)[0]
    gathered = a.take_along_axis(axis=-1, indices=t, broadcast=False)
    # Pass the target shape as a single list: ``Tensor.reshape(shape, name=None)``
    # takes one shape argument, so this does not rely on a varargs-patched reshape.
    return gathered.reshape([batch] + [1] * (len(x_shape) - 1))
|
35 |
+
|
36 |
+
|
37 |
+
def noise_like(shape, device, repeat=False):
    """Draw standard-normal noise of the given shape.

    Args:
        shape: Target shape (sequence of ints).
        device: Accepted for interface compatibility; not used by the body.
        repeat: When true, draw a single sample of shape ``(1, *shape[1:])``
            and tile it ``shape[0]`` times along the first axis.

    Returns:
        paddle.Tensor: Noise tensor of shape ``shape``.
    """
    if not repeat:
        return paddle.randn(shape=shape)
    single_sample = paddle.randn(shape=(1, *shape[1:]))
    return single_sample.tile(repeat_times=[shape[0], *((1,) * (len(shape) - 1))])
|
43 |
+
|
44 |
+
|
45 |
+
def linear_beta_schedule(timesteps, max_beta=0.01):
    """Linear beta schedule.

    Returns ``timesteps`` values evenly spaced from 0.0001 to ``max_beta``
    (both endpoints included) as a NumPy array.
    """
    return np.linspace(start=0.0001, stop=max_beta, num=timesteps)
|
51 |
+
|
52 |
+
|
53 |
+
def cosine_beta_schedule(timesteps, s=0.008):
    """Cosine beta schedule.

    As proposed in https://openreview.net/forum?id=-NEXDKk8gZ

    Args:
        timesteps: Number of betas to produce.
        s: Offset added inside the cosine argument.

    Returns:
        NumPy array of ``timesteps`` betas, clipped to ``[0, 0.999]``.
    """
    num_points = timesteps + 1
    grid = np.linspace(0, num_points, num_points)
    phase = (grid / num_points + s) / (1 + s) * np.pi * 0.5
    cumulative = np.cos(phase) ** 2
    # Normalise so the cumulative product starts at exactly 1.
    cumulative = cumulative / cumulative[0]
    step_ratio = cumulative[1:] / cumulative[:-1]
    return np.clip(1 - step_ratio, a_min=0, a_max=0.999)
|
64 |
+
|
65 |
+
|
66 |
+
# Registry of beta-schedule builders, keyed by schedule name. GaussianDiffusion
# selects an entry with hparams["schedule_type"] when no explicit betas are given.
beta_schedule = {"cosine": cosine_beta_schedule, "linear": linear_beta_schedule}
|
67 |
+
|
68 |
+
|
69 |
+
class GaussianDiffusion(paddle.nn.Layer):
|
70 |
+
def __init__(
|
71 |
+
self,
|
72 |
+
out_dims,
|
73 |
+
num_feats=1,
|
74 |
+
timesteps=1000,
|
75 |
+
k_step=1000,
|
76 |
+
backbone_type=None,
|
77 |
+
backbone_args=None,
|
78 |
+
betas=None,
|
79 |
+
spec_min=None,
|
80 |
+
spec_max=None,
|
81 |
+
):
|
82 |
+
super().__init__()
|
83 |
+
self.denoise_fn: paddle.nn.Layer = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
|
84 |
+
self.out_dims = out_dims
|
85 |
+
self.num_feats = num_feats
|
86 |
+
if betas is not None:
|
87 |
+
betas = betas.detach().cpu().numpy() if isinstance(betas, paddle.Tensor) else betas
|
88 |
+
else:
|
89 |
+
betas = beta_schedule[hparams["schedule_type"]](timesteps)
|
90 |
+
alphas = 1.0 - betas
|
91 |
+
alphas_cumprod = np.cumprod(alphas, axis=0)
|
92 |
+
alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
|
93 |
+
self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
|
94 |
+
if self.use_shallow_diffusion:
|
95 |
+
assert k_step <= timesteps, "K_step should not be larger than timesteps."
|
96 |
+
self.timesteps = timesteps
|
97 |
+
self.k_step = k_step if self.use_shallow_diffusion else timesteps
|
98 |
+
self.noise_list = deque(maxlen=4)
|
99 |
+
to_torch = partial(paddle.to_tensor, dtype="float32")
|
100 |
+
self.register_buffer(name="betas", tensor=to_torch(betas))
|
101 |
+
self.register_buffer(name="alphas_cumprod", tensor=to_torch(alphas_cumprod))
|
102 |
+
self.register_buffer(name="alphas_cumprod_prev", tensor=to_torch(alphas_cumprod_prev))
|
103 |
+
self.register_buffer(name="sqrt_alphas_cumprod", tensor=to_torch(np.sqrt(alphas_cumprod)))
|
104 |
+
self.register_buffer(name="sqrt_one_minus_alphas_cumprod", tensor=to_torch(np.sqrt(1.0 - alphas_cumprod)))
|
105 |
+
self.register_buffer(name="log_one_minus_alphas_cumprod", tensor=to_torch(np.log(1.0 - alphas_cumprod)))
|
106 |
+
self.register_buffer(name="sqrt_recip_alphas_cumprod", tensor=to_torch(np.sqrt(1.0 / alphas_cumprod)))
|
107 |
+
self.register_buffer(name="sqrt_recipm1_alphas_cumprod", tensor=to_torch(np.sqrt(1.0 / alphas_cumprod - 1)))
|
108 |
+
posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
|
109 |
+
self.register_buffer(name="posterior_variance", tensor=to_torch(posterior_variance))
|
110 |
+
self.register_buffer(
|
111 |
+
name="posterior_log_variance_clipped", tensor=to_torch(np.log(np.maximum(posterior_variance, 1e-20)))
|
112 |
+
)
|
113 |
+
self.register_buffer(
|
114 |
+
name="posterior_mean_coef1", tensor=to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod))
|
115 |
+
)
|
116 |
+
self.register_buffer(
|
117 |
+
name="posterior_mean_coef2",
|
118 |
+
tensor=to_torch((1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)),
|
119 |
+
)
|
120 |
+
spec_min = paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].transpose(
|
121 |
+
perm=paddle_aux.transpose_aux_func(
|
122 |
+
paddle.to_tensor(data=spec_min, dtype="float32")[None, None, :out_dims].ndim, -3, -2
|
123 |
+
)
|
124 |
+
)
|
125 |
+
spec_max = paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].transpose(
|
126 |
+
perm=paddle_aux.transpose_aux_func(
|
127 |
+
paddle.to_tensor(data=spec_max, dtype="float32")[None, None, :out_dims].ndim, -3, -2
|
128 |
+
)
|
129 |
+
)
|
130 |
+
self.register_buffer(name="spec_min", tensor=spec_min)
|
131 |
+
self.register_buffer(name="spec_max", tensor=spec_max)
|
132 |
+
self.time_scale_factor = self.timesteps
|
133 |
+
self.t_start = 1 - self.k_step / self.timesteps
|
134 |
+
factors = paddle.to_tensor(
|
135 |
+
data=[i for i in range(1, self.timesteps + 1) if self.timesteps % i == 0], dtype="int64"
|
136 |
+
)
|
137 |
+
self.register_buffer(name="timestep_factors", tensor=factors, persistable=False)
|
138 |
+
|
139 |
+
def q_mean_variance(self, x_start, t):
|
140 |
+
mean = extract(self.sqrt_alphas_cumprod, t, tuple(x_start.shape)) * x_start
|
141 |
+
variance = extract(1.0 - self.alphas_cumprod, t, tuple(x_start.shape))
|
142 |
+
log_variance = extract(self.log_one_minus_alphas_cumprod, t, tuple(x_start.shape))
|
143 |
+
return mean, variance, log_variance
|
144 |
+
|
145 |
+
def predict_start_from_noise(self, x_t, t, noise):
|
146 |
+
return (
|
147 |
+
extract(self.sqrt_recip_alphas_cumprod, t, tuple(x_t.shape)) * x_t
|
148 |
+
- extract(self.sqrt_recipm1_alphas_cumprod, t, tuple(x_t.shape)) * noise
|
149 |
+
)
|
150 |
+
|
151 |
+
def q_posterior(self, x_start, x_t, t):
|
152 |
+
posterior_mean = (
|
153 |
+
extract(self.posterior_mean_coef1, t, tuple(x_t.shape)) * x_start
|
154 |
+
+ extract(self.posterior_mean_coef2, t, tuple(x_t.shape)) * x_t
|
155 |
+
)
|
156 |
+
posterior_variance = extract(self.posterior_variance, t, tuple(x_t.shape))
|
157 |
+
posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, tuple(x_t.shape))
|
158 |
+
return (posterior_mean, posterior_variance, posterior_log_variance_clipped)
|
159 |
+
|
160 |
+
def p_mean_variance(self, x, t, cond):
|
161 |
+
noise_pred = self.denoise_fn(x, t, cond=cond)
|
162 |
+
x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
|
163 |
+
model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
|
164 |
+
return model_mean, posterior_variance, posterior_log_variance
|
165 |
+
|
166 |
+
@paddle.no_grad()
|
167 |
+
def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
|
168 |
+
b, *_, device = *tuple(x.shape), x.place
|
169 |
+
model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond)
|
170 |
+
noise = noise_like(tuple(x.shape), device, repeat_noise)
|
171 |
+
nonzero_mask = (1 - (t == 0).astype(dtype="float32")).reshape(b, *((1,) * (len(tuple(x.shape)) - 1)))
|
172 |
+
return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
|
173 |
+
|
174 |
+
@paddle.no_grad()
|
175 |
+
def p_sample_ddim(self, x, t, interval, cond):
|
176 |
+
a_t = extract(self.alphas_cumprod, t, tuple(x.shape))
|
177 |
+
a_prev = extract(self.alphas_cumprod, paddle_aux.max(t - interval, paddle.zeros_like(x=t)), tuple(x.shape))
|
178 |
+
noise_pred = self.denoise_fn(x, t, cond=cond)
|
179 |
+
x_prev = a_prev.sqrt() * (
|
180 |
+
x / a_t.sqrt() + (((1 - a_prev) / a_prev).sqrt() - ((1 - a_t) / a_t).sqrt()) * noise_pred
|
181 |
+
)
|
182 |
+
return x_prev
|
183 |
+
|
184 |
+
@paddle.no_grad()
|
185 |
+
def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
|
186 |
+
"""
|
187 |
+
Use the PLMS method from
|
188 |
+
[Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
|
189 |
+
"""
|
190 |
+
|
191 |
+
def get_x_pred(x, noise_t, t):
|
192 |
+
a_t = extract(self.alphas_cumprod, t, tuple(x.shape))
|
193 |
+
a_prev = extract(self.alphas_cumprod, paddle_aux.max(t - interval, paddle.zeros_like(x=t)), tuple(x.shape))
|
194 |
+
a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
|
195 |
+
x_delta = (a_prev - a_t) * (
|
196 |
+
1 / (a_t_sq * (a_t_sq + a_prev_sq)) * x
|
197 |
+
- 1 / (a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t
|
198 |
+
)
|
199 |
+
x_pred = x + x_delta
|
200 |
+
return x_pred
|
201 |
+
|
202 |
+
noise_list = self.noise_list
|
203 |
+
noise_pred = self.denoise_fn(x, t, cond=cond)
|
204 |
+
if len(noise_list) == 0:
|
205 |
+
x_pred = get_x_pred(x, noise_pred, t)
|
206 |
+
noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
|
207 |
+
noise_pred_prime = (noise_pred + noise_pred_prev) / 2
|
208 |
+
elif len(noise_list) == 1:
|
209 |
+
noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
|
210 |
+
elif len(noise_list) == 2:
|
211 |
+
noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
|
212 |
+
else:
|
213 |
+
noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
|
214 |
+
x_prev = get_x_pred(x, noise_pred_prime, t)
|
215 |
+
noise_list.append(noise_pred)
|
216 |
+
return x_prev
|
217 |
+
|
218 |
+
def q_sample(self, x_start, t, noise):
|
219 |
+
return (
|
220 |
+
extract(self.sqrt_alphas_cumprod, t, tuple(x_start.shape)) * x_start
|
221 |
+
+ extract(self.sqrt_one_minus_alphas_cumprod, t, tuple(x_start.shape)) * noise
|
222 |
+
)
|
223 |
+
|
224 |
+
def p_losses(self, x_start, t, cond, noise=None):
|
225 |
+
if noise is None:
|
226 |
+
noise = paddle.randn(shape=x_start.shape, dtype=x_start.dtype)
|
227 |
+
x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
|
228 |
+
x_recon = self.denoise_fn(x_noisy, t, cond)
|
229 |
+
return x_recon, noise
|
230 |
+
|
231 |
+
def inference(self, cond, b=1, x_start=None, device=None):
|
232 |
+
depth = hparams.get("K_step_infer", self.k_step)
|
233 |
+
speedup = hparams["diff_speedup"]
|
234 |
+
if speedup > 0:
|
235 |
+
assert depth % speedup == 0, f"Acceleration ratio must be a factor of diffusion depth {depth}."
|
236 |
+
noise = paddle.randn(shape=[b, self.num_feats, self.out_dims, tuple(cond.shape)[2]])
|
237 |
+
if self.use_shallow_diffusion:
|
238 |
+
t_max = min(depth, self.k_step)
|
239 |
+
else:
|
240 |
+
t_max = self.k_step
|
241 |
+
if t_max >= self.timesteps:
|
242 |
+
x = noise
|
243 |
+
elif t_max > 0:
|
244 |
+
assert x_start is not None, "Missing shallow diffusion source."
|
245 |
+
x = self.q_sample(x_start, paddle.full(shape=(b,), fill_value=t_max - 1, dtype="int64"), noise)
|
246 |
+
else:
|
247 |
+
assert x_start is not None, "Missing shallow diffusion source."
|
248 |
+
x = x_start
|
249 |
+
if speedup > 1 and t_max > 0:
|
250 |
+
algorithm = hparams["diff_accelerator"]
|
251 |
+
if algorithm == "dpm-solver":
|
252 |
+
from inference.dpm_solver_pytorch import (
|
253 |
+
DPM_Solver,
|
254 |
+
NoiseScheduleVP,
|
255 |
+
model_wrapper,
|
256 |
+
)
|
257 |
+
|
258 |
+
noise_schedule = NoiseScheduleVP(schedule="discrete", betas=self.betas[:t_max])
|
259 |
+
|
260 |
+
def my_wrapper(fn):
|
261 |
+
def wrapped(x, t, **kwargs):
|
262 |
+
ret = fn(x, t, **kwargs)
|
263 |
+
self.bar.update(1)
|
264 |
+
return ret
|
265 |
+
|
266 |
+
return wrapped
|
267 |
+
|
268 |
+
model_fn = model_wrapper(
|
269 |
+
my_wrapper(self.denoise_fn), noise_schedule, model_type="noise", model_kwargs={"cond": cond}
|
270 |
+
)
|
271 |
+
dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
|
272 |
+
steps = t_max // hparams["diff_speedup"]
|
273 |
+
self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams["infer"], leave=False)
|
274 |
+
x = dpm_solver.sample(x, steps=steps, order=2, skip_type="time_uniform", method="multistep")
|
275 |
+
self.bar.close()
|
276 |
+
elif algorithm == "unipc":
|
277 |
+
from inference.uni_pc import NoiseScheduleVP, UniPC, model_wrapper
|
278 |
+
|
279 |
+
noise_schedule = NoiseScheduleVP(schedule="discrete", betas=self.betas[:t_max])
|
280 |
+
|
281 |
+
def my_wrapper(fn):
|
282 |
+
def wrapped(x, t, **kwargs):
|
283 |
+
ret = fn(x, t, **kwargs)
|
284 |
+
self.bar.update(1)
|
285 |
+
return ret
|
286 |
+
|
287 |
+
return wrapped
|
288 |
+
|
289 |
+
model_fn = model_wrapper(
|
290 |
+
my_wrapper(self.denoise_fn), noise_schedule, model_type="noise", model_kwargs={"cond": cond}
|
291 |
+
)
|
292 |
+
uni_pc = UniPC(model_fn, noise_schedule, variant="bh2")
|
293 |
+
steps = t_max // hparams["diff_speedup"]
|
294 |
+
self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams["infer"], leave=False)
|
295 |
+
x = uni_pc.sample(x, steps=steps, order=2, skip_type="time_uniform", method="multistep")
|
296 |
+
self.bar.close()
|
297 |
+
elif algorithm == "pndm":
|
298 |
+
self.noise_list = deque(maxlen=4)
|
299 |
+
iteration_interval = speedup
|
300 |
+
for i in tqdm(
|
301 |
+
reversed(range(0, t_max, iteration_interval)),
|
302 |
+
desc="sample time step",
|
303 |
+
total=t_max // iteration_interval,
|
304 |
+
disable=not hparams["infer"],
|
305 |
+
leave=False,
|
306 |
+
):
|
307 |
+
x = self.p_sample_plms(
|
308 |
+
x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), iteration_interval, cond=cond
|
309 |
+
)
|
310 |
+
elif algorithm == "ddim":
|
311 |
+
iteration_interval = speedup
|
312 |
+
for i in tqdm(
|
313 |
+
reversed(range(0, t_max, iteration_interval)),
|
314 |
+
desc="sample time step",
|
315 |
+
total=t_max // iteration_interval,
|
316 |
+
disable=not hparams["infer"],
|
317 |
+
leave=False,
|
318 |
+
):
|
319 |
+
x = self.p_sample_ddim(
|
320 |
+
x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), iteration_interval, cond=cond
|
321 |
+
)
|
322 |
+
else:
|
323 |
+
raise ValueError(f"Unsupported acceleration algorithm for DDPM: {algorithm}.")
|
324 |
+
else:
|
325 |
+
for i in tqdm(
|
326 |
+
reversed(range(0, t_max)),
|
327 |
+
desc="sample time step",
|
328 |
+
total=t_max,
|
329 |
+
disable=not hparams["infer"],
|
330 |
+
leave=False,
|
331 |
+
):
|
332 |
+
x = self.p_sample(x, paddle.full(shape=(b,), fill_value=i, dtype="int64"), cond)
|
333 |
+
x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 2, 3)).squeeze(axis=1)
|
334 |
+
return x
|
335 |
+
|
336 |
+
def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
    """Run conditional diffusion, using the FastSpeech2 encoder output as the condition.

    :param condition: conditioning tensor. It is re-ordered with the permutation
        built by ``paddle_aux.transpose_aux_func(ndim, 1, 2)`` (presumably a swap
        of axes 1 and 2) before being handed to the denoising code.
    :param gt_spec: ground-truth target; only read when ``infer`` is False.
    :param src_spec: optional source forwarded to ``self.inference`` as
        ``x_start``; only read when ``infer`` is True.
    :param infer: True for sampling, False for a training step.
    :return: ``(x_recon, noise)`` from ``self.p_losses`` when training, otherwise
        ``self.denorm_spec`` applied to the sampled result.
    """
    cond = condition.transpose(perm=paddle_aux.transpose_aux_func(condition.ndim, 1, 2))
    b, device = tuple(condition.shape)[0], condition.place

    def _prepare(raw_spec):
        # Normalize exactly once; the previous code ran norm_spec a second time
        # only to read ``.ndim`` for the permutation.
        normed = self.norm_spec(raw_spec)
        normed = normed.transpose(perm=paddle_aux.transpose_aux_func(normed.ndim, -2, -1))
        if self.num_feats == 1:
            # Insert the feature axis that multi-feature inputs already carry.
            normed = normed[:, None, :, :]
        return normed

    if not infer:
        spec = _prepare(gt_spec)
        # One random integer timestep in [0, k_step) per batch item.
        t = paddle.randint(low=0, high=self.k_step, shape=(b,)).astype(dtype="int64")
        x_recon, noise = self.p_losses(spec, t, cond=cond)
        return x_recon, noise

    spec = _prepare(src_spec) if src_spec is not None else None
    x = self.inference(cond, b=b, x_start=spec, device=device)
    return self.denorm_spec(x)
|
362 |
+
|
363 |
+
def norm_spec(self, x):
    """Linearly rescale ``x`` so that ``spec_min`` maps to -1 and ``spec_max`` to 1."""
    span = self.spec_max - self.spec_min
    shifted = x - self.spec_min
    return shifted / span * 2 - 1
|
365 |
+
|
366 |
+
def denorm_spec(self, x):
    """Undo ``norm_spec``: map -1 back to ``spec_min`` and 1 back to ``spec_max``."""
    span = self.spec_max - self.spec_min
    unit = (x + 1) / 2
    return unit * span + self.spec_min
|
368 |
+
|
369 |
+
|
370 |
+
class RepetitiveDiffusion(GaussianDiffusion):
    """Gaussian diffusion over curves that are tiled ``repeat_bins`` times.

    ``norm_spec`` appends a trailing axis holding ``repeat_bins`` copies of the
    input before normalizing; ``denorm_spec`` de-normalizes and averages that
    axis away again.
    """

    def __init__(
        self,
        vmin: (float | int | list),
        vmax: (float | int | list),
        repeat_bins: int,
        timesteps=1000,
        k_step=1000,
        backbone_type=None,
        backbone_args=None,
        betas=None,
    ):
        """
        :param vmin: lower normalization bound; a scalar for one feature or a
            list with one entry per feature.
        :param vmax: upper normalization bound, same form as ``vmin``.
        :param repeat_bins: number of copies along the trailing axis; also passed
            to the parent as ``out_dims``.
        :param timesteps, k_step, backbone_type, backbone_args, betas: forwarded
            unchanged to ``GaussianDiffusion.__init__``.
        """
        vmin_is_scalar = isinstance(vmin, (float, int))
        vmax_is_scalar = isinstance(vmax, (float, int))
        # The original check tested ``vmin`` twice and never looked at ``vmax``,
        # so a scalar/list mix was accepted. Require both scalars or both
        # sequences of equal length.
        assert (vmin_is_scalar and vmax_is_scalar) or (
            not vmin_is_scalar and not vmax_is_scalar and len(vmin) == len(vmax)
        ), "vmin and vmax must both be scalars or both be sequences of equal length."
        num_feats = 1 if vmin_is_scalar else len(vmin)
        spec_min = [vmin] if num_feats == 1 else [[v] for v in vmin]
        spec_max = [vmax] if num_feats == 1 else [[v] for v in vmax]
        # Kept before super().__init__() to preserve the original attribute order.
        self.repeat_bins = repeat_bins
        super().__init__(
            out_dims=repeat_bins,
            num_feats=num_feats,
            timesteps=timesteps,
            k_step=k_step,
            backbone_type=backbone_type,
            backbone_args=backbone_args,
            betas=betas,
            spec_min=spec_min,
            spec_max=spec_max,
        )

    def norm_spec(self, x):
        """Tile ``x`` along a new last axis, then normalize.

        :param x: [B, T] or [B, F, T]
        :return [B, T, R] or [B, F, T, R]
        """
        if self.num_feats == 1:
            repeats = [1, 1, self.repeat_bins]
        else:
            repeats = [1, 1, 1, self.repeat_bins]
        return super().norm_spec(x.unsqueeze(axis=-1).tile(repeat_times=repeats))

    def denorm_spec(self, x):
        """De-normalize, then average over the repeated last axis.

        :param x: [B, T, R] or [B, F, T, R]
        :return [B, T] or [B, F, T]
        """
        return super().denorm_spec(x).mean(axis=-1)
|
418 |
+
|
419 |
+
|
420 |
+
class PitchDiffusion(RepetitiveDiffusion):
    """Repetitive diffusion that clips values to ``[cmin, cmax]`` on the way in and out."""

    def __init__(
        self,
        vmin: float,
        vmax: float,
        cmin: float,
        cmax: float,
        repeat_bins,
        timesteps=1000,
        k_step=1000,
        backbone_type=None,
        backbone_args=None,
        betas=None,
    ):
        """Store the normalization and clipping bounds, then defer to the parent."""
        self.vmin, self.vmax = vmin, vmax
        self.cmin, self.cmax = cmin, cmax
        parent_kwargs = dict(
            vmin=vmin,
            vmax=vmax,
            repeat_bins=repeat_bins,
            timesteps=timesteps,
            k_step=k_step,
            backbone_type=backbone_type,
            backbone_args=backbone_args,
            betas=betas,
        )
        super().__init__(**parent_kwargs)

    def norm_spec(self, x):
        """Clip ``x`` to the clamp range, then apply the parent normalization."""
        bounded = x.clip(min=self.cmin, max=self.cmax)
        return super().norm_spec(bounded)

    def denorm_spec(self, x):
        """Apply the parent de-normalization, then clip to the clamp range."""
        restored = super().denorm_spec(x)
        return restored.clip(min=self.cmin, max=self.cmax)
|
454 |
+
|
455 |
+
|
456 |
+
class MultiVarianceDiffusion(RepetitiveDiffusion):
    """Repetitive diffusion over several variance curves, each with an optional clamp."""

    def __init__(
        self,
        ranges: List[Tuple[float, float]],
        clamps: List[Tuple[float | None, float | None] | None],
        repeat_bins,
        timesteps=1000,
        k_step=1000,
        backbone_type=None,
        backbone_args=None,
        betas=None,
    ):
        """
        :param ranges: one ``(min, max)`` normalization range per curve.
        :param clamps: one ``(min, max)`` clamp per curve, or ``None`` to leave
            that curve unclamped; must be as long as ``ranges``.
        :param repeat_bins, timesteps, k_step, backbone_type, backbone_args,
            betas: forwarded to ``RepetitiveDiffusion.__init__``.
        """
        assert len(ranges) == len(clamps)
        self.clamps = clamps
        lower_bounds = [bounds[0] for bounds in ranges]
        upper_bounds = [bounds[1] for bounds in ranges]
        if len(ranges) == 1:
            # A single curve is handed to the parent as plain scalars.
            lower_bounds, upper_bounds = lower_bounds[0], upper_bounds[0]
        super().__init__(
            vmin=lower_bounds,
            vmax=upper_bounds,
            repeat_bins=repeat_bins,
            timesteps=timesteps,
            k_step=k_step,
            backbone_type=backbone_type,
            backbone_args=backbone_args,
            betas=betas,
        )

    def clamp_spec(self, xs: (list | tuple)):
        """Clip each curve with its matching entry of ``self.clamps`` (``None`` = untouched)."""
        return [
            curve if bounds is None else curve.clip(min=bounds[0], max=bounds[1])
            for curve, bounds in zip(xs, self.clamps)
        ]

    def norm_spec(self, xs: (list | tuple)):
        """Clamp, stack and normalize a sequence of curves.

        :param xs: sequence of [B, T]
        :return: [B, F, T] => super().norm_spec(xs) => [B, F, T, R]
        """
        assert len(xs) == self.num_feats
        stacked = paddle.stack(x=self.clamp_spec(xs), axis=1)
        if self.num_feats == 1:
            stacked = stacked.squeeze(axis=1)
        return super().norm_spec(stacked)

    def denorm_spec(self, xs):
        """De-normalize, split back into per-curve tensors and clamp each one.

        :param xs: [B, T, R] or [B, F, T, R] => super().denorm_spec(xs) => [B, T] or [B, F, T]
        :return: sequence of [B, T]
        """
        restored = super().denorm_spec(xs)
        curves = [restored] if self.num_feats == 1 else restored.unbind(axis=1)
        assert len(curves) == self.num_feats
        return self.clamp_spec(curves)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/core/reflow.py
ADDED
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import sys
|
18 |
+
from typing import List, Tuple
|
19 |
+
import paddle
|
20 |
+
|
21 |
+
from tqdm import tqdm
|
22 |
+
from paddlemix.models.diffsinger.modules.backbones import build_backbone
|
23 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
24 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
25 |
+
|
26 |
+
class RectifiedFlow(paddle.nn.Layer):
    """Rectified-flow generator built around a velocity-predicting backbone.

    Training interpolates between Gaussian noise and the normalized target and
    asks ``velocity_fn`` for the difference between the two; inference
    integrates the predicted velocity with a fixed-step ODE solver.
    """

    def __init__(
        self,
        out_dims,
        num_feats=1,
        t_start=0.0,
        time_scale_factor=1000,
        backbone_type=None,
        backbone_args=None,
        spec_min=None,
        spec_max=None,
    ):
        """
        :param out_dims: output size given to the backbone; also limits how many
            trailing entries of ``spec_min`` / ``spec_max`` are kept.
        :param num_feats: number of feature channels.
        :param t_start: starting time, honoured only when the
            ``use_shallow_diffusion`` hyper-parameter is set (otherwise forced to 0).
        :param time_scale_factor: multiplier applied to ``t`` before it is passed
            to the backbone.
        :param backbone_type, backbone_args: forwarded to ``build_backbone``.
        :param spec_min, spec_max: normalization bounds, stored as non-persistable
            buffers.
        """
        super().__init__()
        self.velocity_fn: paddle.nn.Layer = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
        self.out_dims = out_dims
        self.num_feats = num_feats
        self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
        if self.use_shallow_diffusion:
            assert 0.0 <= t_start <= 1.0, "T_start should be in [0, 1]."
        else:
            t_start = 0.0
        self.t_start = t_start
        self.time_scale_factor = time_scale_factor
        spec_min = self._stat_buffer(spec_min, out_dims)
        spec_max = self._stat_buffer(spec_max, out_dims)
        self.register_buffer(name="spec_min", tensor=spec_min, persistable=False)
        self.register_buffer(name="spec_max", tensor=spec_max, persistable=False)

    @staticmethod
    def _stat_buffer(values, out_dims):
        """Turn a normalization bound into a float32 tensor with two leading axes."""
        # Build the tensor once; the previous code constructed it a second time
        # only to read ``.ndim`` for the permutation.
        stat = paddle.to_tensor(data=values, dtype="float32")[None, None, :out_dims]
        return stat.transpose(perm=paddle_aux.transpose_aux_func(stat.ndim, -3, -2))

    def _prepare_spec(self, raw_spec):
        """Normalize a spectrogram once and re-order it for the backbone."""
        normed = self.norm_spec(raw_spec)
        normed = normed.transpose(perm=paddle_aux.transpose_aux_func(normed.ndim, -2, -1))
        if self.num_feats == 1:
            # Insert the feature axis that multi-feature inputs already carry.
            normed = normed[:, None, :, :]
        return normed

    def p_losses(self, x_end, t, cond):
        """Return ``(predicted velocity, target velocity)`` for one training step.

        A noise sample ``x_start`` is drawn, ``x_t`` is the linear interpolation
        between ``x_start`` and ``x_end`` at time ``t``, and the target velocity
        is ``x_end - x_start``.
        """
        x_start = paddle.randn(shape=x_end.shape, dtype=x_end.dtype)
        x_t = x_start + t[:, None, None, None] * (x_end - x_start)
        v_pred = self.velocity_fn(x_t, t * self.time_scale_factor, cond)
        return v_pred, x_end - x_start

    def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
        """Run a training step or sampling, depending on ``infer``.

        :param condition: conditioning tensor, re-ordered with
            ``paddle_aux.transpose_aux_func(ndim, 1, 2)`` before use.
        :param gt_spec: ground-truth target; only read when ``infer`` is False.
        :param src_spec: optional source forwarded to ``inference`` as ``x_end``;
            only read when ``infer`` is True.
        :param infer: True for sampling, False for training.
        :return: ``(v_pred, v_gt, t)`` when training, otherwise the de-normalized
            sample.
        """
        cond = condition.transpose(perm=paddle_aux.transpose_aux_func(condition.ndim, 1, 2))
        b, device = tuple(condition.shape)[0], condition.place
        if not infer:
            spec = self._prepare_spec(gt_spec)
            # Uniform random time in [t_start, 1) per batch item.
            t = self.t_start + (1.0 - self.t_start) * paddle.rand(shape=(b,))
            v_pred, v_gt = self.p_losses(spec, t, cond=cond)
            return v_pred, v_gt, t
        spec = self._prepare_spec(src_spec) if src_spec is not None else None
        x = self.inference(cond, b=b, x_end=spec, device=device)
        return self.denorm_spec(x)

    @paddle.no_grad()
    def sample_euler(self, x, t, dt, cond):
        """Advance ``x`` by one explicit Euler step; ``x`` and ``t`` are updated with ``+=``."""
        x += self.velocity_fn(x, self.time_scale_factor * t, cond) * dt
        t += dt
        return x, t

    @paddle.no_grad()
    def sample_rk2(self, x, t, dt, cond):
        """Advance ``x`` by one midpoint (second-order Runge-Kutta) step."""
        k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
        k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
        x += k_2 * dt
        t += dt
        return x, t

    @paddle.no_grad()
    def sample_rk4(self, x, t, dt, cond):
        """Advance ``x`` by one classical fourth-order Runge-Kutta step."""
        k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
        k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
        k_3 = self.velocity_fn(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
        k_4 = self.velocity_fn(x + k_3 * dt, self.time_scale_factor * (t + dt), cond)
        x += (k_1 + 2 * k_2 + 2 * k_3 + k_4) * dt / 6
        t += dt
        return x, t

    @paddle.no_grad()
    def sample_rk5(self, x, t, dt, cond):
        """Advance ``x`` by one six-stage Runge-Kutta step (the ``rk5`` sampler)."""
        k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond)
        k_2 = self.velocity_fn(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond)
        k_3 = self.velocity_fn(x + 0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond)
        k_4 = self.velocity_fn(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond)
        k_5 = self.velocity_fn(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond)
        k_6 = self.velocity_fn(
            x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, self.time_scale_factor * (t + dt), cond
        )
        x += (7 * k_1 + 32 * k_3 + 12 * k_4 + 32 * k_5 + 7 * k_6) * dt / 90
        t += dt
        return x, t

    @paddle.no_grad()
    def inference(self, cond, b=1, x_end=None, device=None):
        """Sample by integrating the predicted velocity from ``t_start`` to 1.

        :param cond: conditioning tensor; its third axis sets the last dimension
            of the initial noise.
        :param b: batch size of the initial noise.
        :param x_end: shallow-diffusion source, required when shallow diffusion is
            enabled and the effective ``t_start`` is positive.
        :param device: accepted for interface compatibility; not used here.
        :return: the sample cast to float32, re-ordered with
            ``paddle_aux.transpose_aux_func(ndim, 2, 3)`` and with axis 1 squeezed.
        :raises ValueError: if ``hparams["sampling_algorithm"]`` is not one of
            ``euler``, ``rk2``, ``rk4``, ``rk5``.
        """
        noise = paddle.randn(shape=[b, self.num_feats, self.out_dims, tuple(cond.shape)[2]])
        # The inference-time hyper-parameter overrides the value fixed at construction.
        t_start = hparams.get("T_start_infer", self.t_start)
        if self.use_shallow_diffusion and t_start > 0:
            assert x_end is not None, "Missing shallow diffusion source."
            if t_start >= 1.0:
                # Nothing left to integrate: the source is the result.
                t_start = 1.0
                x = x_end
            else:
                x = t_start * x_end + (1 - t_start) * noise
        else:
            t_start = 0.0
            x = noise
        algorithm = hparams["sampling_algorithm"]
        infer_step = hparams["sampling_steps"]
        if t_start < 1:
            # max(1, ...) guards the division when sampling_steps is 0.
            dt = (1.0 - t_start) / max(1, infer_step)
            algorithm_fn = {
                "euler": self.sample_euler,
                "rk2": self.sample_rk2,
                "rk4": self.sample_rk4,
                "rk5": self.sample_rk5,
            }.get(algorithm)
            if algorithm_fn is None:
                raise ValueError(f"Unsupported algorithm for Rectified Flow: {algorithm}.")
            dts = paddle.to_tensor(data=[dt]).to(x)
            for i in tqdm(
                range(infer_step), desc="sample time step", total=infer_step, disable=not hparams["infer"], leave=False
            ):
                # A fresh time tensor is built every step, so the sampler's
                # in-place ``t += dt`` does not carry over between iterations.
                x, _ = algorithm_fn(x, t_start + i * dts, dt, cond)
            x = x.astype(dtype="float32")
        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 2, 3)).squeeze(axis=1)
        return x

    def norm_spec(self, x):
        """Linearly rescale ``x`` so that ``spec_min`` maps to -1 and ``spec_max`` to 1."""
        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1

    def denorm_spec(self, x):
        """Inverse of ``norm_spec``."""
        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
|
170 |
+
|
171 |
+
|
172 |
+
class RepetitiveRectifiedFlow(RectifiedFlow):
    """Rectified flow over curves that are tiled ``repeat_bins`` times.

    ``norm_spec`` appends a trailing axis holding ``repeat_bins`` copies of the
    input before normalizing; ``denorm_spec`` de-normalizes and averages that
    axis away again.
    """

    def __init__(
        self,
        vmin: (float | int | list),
        vmax: (float | int | list),
        repeat_bins: int,
        time_scale_factor=1000,
        backbone_type=None,
        backbone_args=None,
    ):
        """
        :param vmin: lower normalization bound; a scalar for one feature or a
            list with one entry per feature.
        :param vmax: upper normalization bound, same form as ``vmin``.
        :param repeat_bins: number of copies along the trailing axis; also passed
            to the parent as ``out_dims``.
        :param time_scale_factor, backbone_type, backbone_args: forwarded
            unchanged to ``RectifiedFlow.__init__``.
        """
        vmin_is_scalar = isinstance(vmin, (float, int))
        vmax_is_scalar = isinstance(vmax, (float, int))
        # The original check tested ``vmin`` twice and never looked at ``vmax``,
        # so a scalar/list mix was accepted. Require both scalars or both
        # sequences of equal length.
        assert (vmin_is_scalar and vmax_is_scalar) or (
            not vmin_is_scalar and not vmax_is_scalar and len(vmin) == len(vmax)
        ), "vmin and vmax must both be scalars or both be sequences of equal length."
        num_feats = 1 if vmin_is_scalar else len(vmin)
        spec_min = [vmin] if num_feats == 1 else [[v] for v in vmin]
        spec_max = [vmax] if num_feats == 1 else [[v] for v in vmax]
        # Kept before super().__init__() to preserve the original attribute order.
        self.repeat_bins = repeat_bins
        super().__init__(
            out_dims=repeat_bins,
            num_feats=num_feats,
            time_scale_factor=time_scale_factor,
            backbone_type=backbone_type,
            backbone_args=backbone_args,
            spec_min=spec_min,
            spec_max=spec_max,
        )

    def norm_spec(self, x):
        """Tile ``x`` along a new last axis, then normalize.

        :param x: [B, T] or [B, F, T]
        :return [B, T, R] or [B, F, T, R]
        """
        if self.num_feats == 1:
            repeats = [1, 1, self.repeat_bins]
        else:
            repeats = [1, 1, 1, self.repeat_bins]
        return super().norm_spec(x.unsqueeze(axis=-1).tile(repeat_times=repeats))

    def denorm_spec(self, x):
        """De-normalize, then average over the repeated last axis.

        :param x: [B, T, R] or [B, F, T, R]
        :return [B, T] or [B, F, T]
        """
        return super().denorm_spec(x).mean(axis=-1)
|
216 |
+
|
217 |
+
|
218 |
+
class PitchRectifiedFlow(RepetitiveRectifiedFlow):
    """Repetitive rectified flow that clips values to ``[cmin, cmax]`` on the way in and out."""

    def __init__(
        self,
        vmin: float,
        vmax: float,
        cmin: float,
        cmax: float,
        repeat_bins,
        time_scale_factor=1000,
        backbone_type=None,
        backbone_args=None,
    ):
        """Store the normalization and clipping bounds, then defer to the parent."""
        self.vmin, self.vmax = vmin, vmax
        self.cmin, self.cmax = cmin, cmax
        parent_kwargs = dict(
            vmin=vmin,
            vmax=vmax,
            repeat_bins=repeat_bins,
            time_scale_factor=time_scale_factor,
            backbone_type=backbone_type,
            backbone_args=backbone_args,
        )
        super().__init__(**parent_kwargs)

    def norm_spec(self, x):
        """Clip ``x`` to the clamp range, then apply the parent normalization."""
        bounded = x.clip(min=self.cmin, max=self.cmax)
        return super().norm_spec(bounded)

    def denorm_spec(self, x):
        """Apply the parent de-normalization, then clip to the clamp range."""
        restored = super().denorm_spec(x)
        return restored.clip(min=self.cmin, max=self.cmax)
|
248 |
+
|
249 |
+
|
250 |
+
class MultiVarianceRectifiedFlow(RepetitiveRectifiedFlow):
    """Repetitive rectified flow over several variance curves, each with an optional clamp."""

    def __init__(
        self,
        ranges: List[Tuple[float, float]],
        clamps: List[Tuple[float | None, float | None] | None],
        repeat_bins,
        time_scale_factor=1000,
        backbone_type=None,
        backbone_args=None,
    ):
        """
        :param ranges: one ``(min, max)`` normalization range per curve.
        :param clamps: one ``(min, max)`` clamp per curve, or ``None`` to leave
            that curve unclamped; must be as long as ``ranges``.
        :param repeat_bins, time_scale_factor, backbone_type, backbone_args:
            forwarded to ``RepetitiveRectifiedFlow.__init__``.
        """
        assert len(ranges) == len(clamps)
        self.clamps = clamps
        lower_bounds = [bounds[0] for bounds in ranges]
        upper_bounds = [bounds[1] for bounds in ranges]
        if len(ranges) == 1:
            # A single curve is handed to the parent as plain scalars.
            lower_bounds, upper_bounds = lower_bounds[0], upper_bounds[0]
        super().__init__(
            vmin=lower_bounds,
            vmax=upper_bounds,
            repeat_bins=repeat_bins,
            time_scale_factor=time_scale_factor,
            backbone_type=backbone_type,
            backbone_args=backbone_args,
        )

    def clamp_spec(self, xs: (list | tuple)):
        """Clip each curve with its matching entry of ``self.clamps`` (``None`` = untouched)."""
        return [
            curve if bounds is None else curve.clip(min=bounds[0], max=bounds[1])
            for curve, bounds in zip(xs, self.clamps)
        ]

    def norm_spec(self, xs: (list | tuple)):
        """Clamp, stack and normalize a sequence of curves.

        :param xs: sequence of [B, T]
        :return: [B, F, T] => super().norm_spec(xs) => [B, F, T, R]
        """
        assert len(xs) == self.num_feats
        stacked = paddle.stack(x=self.clamp_spec(xs), axis=1)
        if self.num_feats == 1:
            stacked = stacked.squeeze(axis=1)
        return super().norm_spec(stacked)

    def denorm_spec(self, xs):
        """De-normalize, split back into per-curve tensors and clamp each one.

        :param xs: [B, T, R] or [B, F, T, R] => super().denorm_spec(xs) => [B, T] or [B, F, T]
        :return: sequence of [B, T]
        """
        restored = super().denorm_spec(xs)
        curves = [restored] if self.num_feats == 1 else restored.unbind(axis=1)
        assert len(curves) == self.num_feats
        return self.clamp_spec(curves)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/acoustic_encoder.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import paddle
|
16 |
+
|
17 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
18 |
+
NormalInitEmbedding as Embedding,
|
19 |
+
)
|
20 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
21 |
+
XavierUniformInitLinear as Linear,
|
22 |
+
)
|
23 |
+
from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
|
24 |
+
FastSpeech2Encoder,
|
25 |
+
mel2ph_to_dur,
|
26 |
+
)
|
27 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
28 |
+
from paddlemix.models.diffsinger.utils.text_encoder import PAD_INDEX
|
29 |
+
|
30 |
+
|
31 |
+
class FastSpeech2Acoustic(paddle.nn.Layer):
    """FastSpeech2-style encoder that produces the frame-level acoustic condition.

    Token and duration embeddings go through ``FastSpeech2Encoder``; the result
    is gathered per frame with ``mel2ph`` and then summed with speaker, pitch,
    variance, key-shift and speed embeddings, each gated by a hyper-parameter.
    """

    def __init__(self, vocab_size):
        """Build every sub-layer from ``hparams``; ``vocab_size`` sizes the token embedding."""
        super().__init__()
        hidden_size = hparams["hidden_size"]
        # Sub-layers are created in the same order as before so that parameter
        # registration order is unchanged.
        self.txt_embed = Embedding(vocab_size, hidden_size, PAD_INDEX)
        self.dur_embed = Linear(1, hidden_size)
        self.encoder = FastSpeech2Encoder(
            hidden_size=hidden_size,
            num_layers=hparams["enc_layers"],
            ffn_kernel_size=hparams["enc_ffn_kernel_size"],
            ffn_act=hparams["ffn_act"],
            dropout=hparams["dropout"],
            num_heads=hparams["num_heads"],
            use_pos_embed=hparams["use_pos_embed"],
            rel_pos=hparams["rel_pos"],
        )
        self.pitch_embed = Linear(1, hidden_size)

        self.use_energy_embed = hparams.get("use_energy_embed", False)
        self.use_breathiness_embed = hparams.get("use_breathiness_embed", False)
        self.use_voicing_embed = hparams.get("use_voicing_embed", False)
        self.use_tension_embed = hparams.get("use_tension_embed", False)
        switches = (
            ("energy", self.use_energy_embed),
            ("breathiness", self.use_breathiness_embed),
            ("voicing", self.use_voicing_embed),
            ("tension", self.use_tension_embed),
        )
        self.variance_embed_list = [v_name for v_name, enabled in switches if enabled]
        self.use_variance_embeds = bool(self.variance_embed_list)
        if self.use_variance_embeds:
            projections = {v_name: Linear(1, hidden_size) for v_name in self.variance_embed_list}
            self.variance_embeds = paddle.nn.LayerDict(sublayers=projections)

        self.use_key_shift_embed = hparams.get("use_key_shift_embed", False)
        if self.use_key_shift_embed:
            self.key_shift_embed = Linear(1, hidden_size)

        self.use_speed_embed = hparams.get("use_speed_embed", False)
        if self.use_speed_embed:
            self.speed_embed = Linear(1, hidden_size)

        self.use_spk_id = hparams["use_spk_id"]
        if self.use_spk_id:
            self.spk_embed = Embedding(hparams["num_spk"], hidden_size)

    def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances):
        """Add the enabled variance, key-shift and speed embeddings to ``condition``.

        ``condition`` is updated with ``+=`` and returned. ``variances`` must
        hold one entry for every name in ``self.variance_embed_list``.
        """
        if self.use_variance_embeds:
            per_param = [
                self.variance_embeds[v_name](variances[v_name][:, :, None]) for v_name in self.variance_embed_list
            ]
            # Stack on a new last axis and sum it away, i.e. add all embeddings.
            condition += paddle.stack(x=per_param, axis=-1).sum(axis=-1)
        if self.use_key_shift_embed:
            condition += self.key_shift_embed(key_shift[:, :, None])
        if self.use_speed_embed:
            condition += self.speed_embed(speed[:, :, None])
        return condition

    def forward(self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, spk_embed_id=None, **kwargs):
        """Encode the inputs into the frame-level condition tensor.

        :param txt_tokens: token ids; positions equal to 0 are passed to the
            encoder as its third (mask) argument.
        :param mel2ph: per-frame indices used to gather encoder outputs along axis 1.
        :param f0: per-frame pitch, embedded through ``log(1 + f0 / 700)``.
        :param key_shift, speed: optional inputs for the matching embeddings.
        :param spk_embed_id: speaker ids, used when ``use_spk_id`` is set and no
            ``spk_mix_embed`` keyword is supplied.
        :param kwargs: may carry ``spk_mix_embed`` and the variance curves; all of
            it is forwarded to ``forward_variance_embedding``.
        """
        token_count = tuple(txt_tokens.shape)[1]
        dur = paddle.cast(mel2ph_to_dur(mel2ph, token_count), dtype="float32")
        encoder_out = self.encoder(self.txt_embed(txt_tokens), self.dur_embed(dur[:, :, None]), txt_tokens == 0)
        # NOTE(review): presumably prepends one zero step along the sequence
        # axis so that index 0 in mel2ph selects padding -- confirm.
        encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
        channels = tuple(encoder_out.shape)[-1]
        gather_index = mel2ph[..., None].tile(repeat_times=[1, 1, channels])
        condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=gather_index, broadcast=False)

        if self.use_spk_id:
            spk_embed = kwargs.get("spk_mix_embed")
            if spk_embed is None:
                spk_embed = self.spk_embed(spk_embed_id)[:, None, :]
            condition += spk_embed

        f0_mel = (1 + f0 / 700).log()
        condition += self.pitch_embed(f0_mel[:, :, None])
        return self.forward_variance_embedding(condition, key_shift=key_shift, speed=speed, **kwargs)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/param_adaptor.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import sys
|
18 |
+
|
19 |
+
import paddle
|
20 |
+
|
21 |
+
import paddlemix.models.diffsinger.modules.compat as compat
|
22 |
+
from paddlemix.models.diffsinger.modules.core.ddpm import MultiVarianceDiffusion
|
23 |
+
from paddlemix.models.diffsinger.utils import filter_kwargs
|
24 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
25 |
+
|
26 |
+
# Variance parameter names, listed in the same order in which
# ParameterAdaptorModule appends them to its variance_prediction_list.
VARIANCE_CHECKLIST = ["energy", "breathiness", "voicing", "tension"]
|
27 |
+
|
28 |
+
|
29 |
+
class ParameterAdaptorModule(paddle.nn.Layer):
    """Mixin-style layer that tracks which variance parameters are predicted.

    The ``predict_*`` hyper-parameters decide which of energy, breathiness,
    voicing and tension appear in ``variance_prediction_list``.
    """

    def __init__(self):
        super().__init__()
        self.predict_energy = hparams.get("predict_energy", False)
        self.predict_breathiness = hparams.get("predict_breathiness", False)
        self.predict_voicing = hparams.get("predict_voicing", False)
        self.predict_tension = hparams.get("predict_tension", False)
        switches = (
            ("energy", self.predict_energy),
            ("breathiness", self.predict_breathiness),
            ("voicing", self.predict_voicing),
            ("tension", self.predict_tension),
        )
        self.variance_prediction_list = [name for name, enabled in switches if enabled]
        self.predict_variances = len(self.variance_prediction_list) > 0

    def build_adaptor(self, cls=MultiVarianceDiffusion):
        """Instantiate ``cls`` for the enabled variance parameters.

        :param cls: class to build; it receives only the keyword arguments that
            ``filter_kwargs`` keeps for it.
        :return: the constructed ``cls`` instance.
        :raises ValueError: if no variance parameter is enabled (previously an
            unexplained ``ZeroDivisionError`` from the divisibility check).
        """
        num_variances = len(self.variance_prediction_list)
        if num_variances == 0:
            raise ValueError("Cannot build a variance adaptor: no variance parameter is enabled for prediction.")

        # (enabled, range-min key, range-max key, clamp upper bound = range max?)
        # When the last field is False the clamp upper bound is 0.0.
        settings = (
            (self.predict_energy, "energy_db_min", "energy_db_max", False),
            (self.predict_breathiness, "breathiness_db_min", "breathiness_db_max", False),
            (self.predict_voicing, "voicing_db_min", "voicing_db_max", False),
            (self.predict_tension, "tension_logit_min", "tension_logit_max", True),
        )
        ranges = []
        clamps = []
        for enabled, min_key, max_key, clamp_to_range_max in settings:
            if not enabled:
                continue
            low, high = hparams[min_key], hparams[max_key]
            ranges.append((low, high))
            clamps.append((low, high if clamp_to_range_max else 0.0))

        variances_hparams = hparams["variances_prediction_args"]
        total_repeat_bins = variances_hparams["total_repeat_bins"]
        assert (
            total_repeat_bins % num_variances == 0
        ), f"Total number of repeat bins must be divisible by number of variance parameters ({num_variances})."
        repeat_bins = total_repeat_bins // num_variances
        backbone_type = compat.get_backbone_type(hparams, nested_config=variances_hparams)
        backbone_args = compat.get_backbone_args(variances_hparams, backbone_type=backbone_type)
        kwargs = filter_kwargs(
            {
                "ranges": ranges,
                "clamps": clamps,
                "repeat_bins": repeat_bins,
                "timesteps": hparams.get("timesteps"),
                "time_scale_factor": hparams.get("time_scale_factor"),
                "backbone_type": backbone_type,
                "backbone_args": backbone_args,
            },
            cls,
        )
        return cls(**kwargs)

    def collect_variance_inputs(self, **kwargs) -> list:
        """Return the keyword values for each enabled variance name (``None`` if absent)."""
        return [kwargs.get(name) for name in self.variance_prediction_list]

    def collect_variance_outputs(self, variances: (list | tuple)) -> dict:
        """Pair each enabled variance name with the prediction at the same position."""
        return dict(zip(self.variance_prediction_list, variances))
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/tts_modules.py
ADDED
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import math
|
16 |
+
import sys
|
17 |
+
|
18 |
+
import paddle
|
19 |
+
|
20 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
21 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
22 |
+
EncSALayer,
|
23 |
+
SinusoidalPositionalEmbedding,
|
24 |
+
)
|
25 |
+
from paddlemix.models.diffsinger.modules.commons.espnet_positional_embedding import (
|
26 |
+
RelPositionalEncoding,
|
27 |
+
)
|
28 |
+
|
29 |
+
DEFAULT_MAX_SOURCE_POSITIONS = 2000
|
30 |
+
DEFAULT_MAX_TARGET_POSITIONS = 2000
|
31 |
+
|
32 |
+
|
33 |
+
class TransformerEncoderLayer(paddle.nn.Layer):
    """Encoder layer that delegates all work to a single ``EncSALayer``.

    ``dropout`` is handed to ``EncSALayer`` as both ``dropout`` and
    ``relu_dropout``; ``attention_dropout`` is fixed at 0.0.
    """

    def __init__(self, hidden_size, dropout, kernel_size=None, act="gelu", num_heads=2):
        super().__init__()
        sa_layer_options = dict(
            dropout=dropout,
            attention_dropout=0.0,
            relu_dropout=dropout,
            kernel_size=kernel_size,
            act=act,
        )
        self.op = EncSALayer(hidden_size, num_heads, **sa_layer_options)

    def forward(self, x, **kwargs):
        """Apply the wrapped layer to ``x``, forwarding ``kwargs`` untouched."""
        wrapped_layer = self.op
        return wrapped_layer(x, **kwargs)
|
48 |
+
|
49 |
+
|
50 |
+
class LayerNorm(paddle.nn.LayerNorm):
    """Layer normalization module.

    :param int nout: output dim size
    :param int dim: dimension to be normalized; ``-1`` normalizes the last
        axis directly, any other value makes ``forward`` swap axis 1 with the
        last axis before and after the normalization
    """

    def __init__(self, nout, dim=-1):
        """Construct a LayerNorm object with an epsilon of 1e-12."""
        # NOTE(review): epsilon is passed positionally because the documented
        # keyword of paddle.nn.LayerNorm is `epsilon`, not `eps` -- confirm
        # against the installed Paddle version.
        super(LayerNorm, self).__init__(nout, 1e-12)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization.

        :param Tensor x: input tensor
        :return: layer normalized tensor
        :rtype: Tensor
        """
        if self.dim == -1:
            return super(LayerNorm, self).forward(x)
        # Move axis 1 to the last position, normalize once, then swap back.
        # The normalization is computed a single time and its result reused,
        # instead of being re-run only to obtain the output rank.
        swapped = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 1, -1))
        normed = super(LayerNorm, self).forward(swapped)
        return normed.transpose(perm=paddle_aux.transpose_aux_func(normed.ndim, 1, -1))
|
82 |
+
|
83 |
+
|
84 |
+
class DurationPredictor(paddle.nn.Layer):
    """Duration predictor module.

    This is a module of duration predictor described in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
    The duration predictor predicts a duration for each frame from the hidden embeddings of the encoder.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

    Note:
        The network output is treated as a log-domain value: ``out2dur`` maps it to the linear
        domain with ``exp(x) - offset``, and ``forward`` always returns that linear-domain value.
    """

    def __init__(
        self, in_dims, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, dur_loss_type="mse"
    ):
        """Initialize duration predictor module.

        Args:
            in_dims (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.
            dur_loss_type (str, optional): Loss type; only "mse" and "huber" are supported.

        Raises:
            NotImplementedError: If ``dur_loss_type`` is not "mse" or "huber".
        """
        super(DurationPredictor, self).__init__()
        self.offset = offset
        self.conv = paddle.nn.LayerList()
        self.kernel_size = kernel_size
        for idx in range(n_layers):
            in_chans = in_dims if idx == 0 else n_chans
            self.conv.append(
                paddle.nn.Sequential(
                    # Parameter-free placeholder at index 0; it shifts the
                    # indices of the following sublayers, so it must stay.
                    paddle.nn.Identity(),
                    paddle.nn.Conv1D(
                        in_channels=in_chans,
                        out_channels=n_chans,
                        kernel_size=kernel_size,
                        stride=1,
                        padding=kernel_size // 2,
                    ),
                    paddle.nn.ReLU(),
                    LayerNorm(n_chans, dim=1),
                    paddle.nn.Dropout(p=dropout_rate),
                )
            )
        self.loss_type = dur_loss_type
        if self.loss_type in ["mse", "huber"]:
            self.out_dims = 1
        else:
            raise NotImplementedError(f"Unsupported dur_loss_type: {dur_loss_type!r}")
        self.linear = paddle.nn.Linear(in_features=n_chans, out_features=self.out_dims)

    def out2dur(self, xs):
        """Convert the raw network output (B, Tmax, 1) to durations (B, Tmax) via ``exp(x) - offset``."""
        if self.loss_type in ["mse", "huber"]:
            dur = xs.squeeze(axis=-1).exp() - self.offset
        else:
            raise NotImplementedError(f"Unsupported dur_loss_type: {self.loss_type!r}")
        return dur

    def forward(self, xs, x_masks=None, infer=True):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (BoolTensor, optional): Batch of masks indicating padded part (B, Tmax).
                When omitted, no masking is applied.
            infer (bool): Whether inference; if true the result is clipped to be non-negative.

        Returns:
            Tensor: Batch of predicted durations in linear domain (B, Tmax).
        """
        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
        # Build the keep-mask (1 = real frame, 0 = padding) only when a padding
        # mask was supplied, so the documented default x_masks=None works.
        masks = None if x_masks is None else 1 - x_masks.astype(dtype="float32")
        masks_ = None if masks is None else masks[:, None, :]
        for f in self.conv:
            xs = f(xs)
            if masks_ is not None:
                xs = xs * masks_
        xs = self.linear(xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1)))
        if masks is not None:
            xs = xs * masks[:, :, None]
        dur_pred = self.out2dur(xs)
        if infer:
            dur_pred = dur_pred.clip(min=0.0)
        return dur_pred
|
164 |
+
|
165 |
+
|
166 |
+
class VariancePredictor(paddle.nn.Layer):
    """Convolutional predictor producing one scalar per frame.

    ``out2value`` linearly maps a network output of -1 to ``vmin`` and of 1 to
    ``vmax``; ``forward`` applies that mapping only when ``infer`` is true.
    """

    def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, dropout_rate=0.1):
        """Initialize variance predictor module.

        Args:
            vmin (float): Value that a network output of -1 is mapped to.
            vmax (float): Value that a network output of 1 is mapped to.
            in_dims (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
        """
        super(VariancePredictor, self).__init__()
        self.vmin = vmin
        self.vmax = vmax
        self.kernel_size = kernel_size
        # Only the first block consumes in_dims channels; the rest take n_chans.
        self.conv = paddle.nn.LayerList(
            sublayers=[
                paddle.nn.Sequential(
                    paddle.nn.Conv1D(
                        in_channels=in_dims if layer_no == 0 else n_chans,
                        out_channels=n_chans,
                        kernel_size=kernel_size,
                        stride=1,
                        padding=kernel_size // 2,
                    ),
                    paddle.nn.ReLU(),
                    LayerNorm(n_chans, dim=1),
                    paddle.nn.Dropout(p=dropout_rate),
                )
                for layer_no in range(n_layers)
            ]
        )
        self.linear = paddle.nn.Linear(in_features=n_chans, out_features=1)
        self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096)
        # Scale applied to the positional embedding, initialised to 1.
        alpha_init = paddle.to_tensor(data=[1], dtype="float32")
        self.pos_embed_alpha = paddle.base.framework.EagerParamBase.from_tensor(tensor=alpha_init)

    def out2value(self, xs):
        """Linearly rescale ``xs`` so that -1 maps to ``vmin`` and 1 maps to ``vmax``."""
        value_range = self.vmax - self.vmin
        return (xs + 1) / 2 * value_range + self.vmin

    def forward(self, xs, infer=True):
        """
        :param xs: [B, T, H]
        :param infer: whether inference
        :return: [B, T]
        """
        # Positional embedding is derived from the first feature channel.
        pos_embed = self.embed_positions(xs[..., 0])
        xs = xs + self.pos_embed_alpha * pos_embed
        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
        for conv_block in self.conv:
            xs = conv_block(xs)
        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
        xs = self.linear(xs).squeeze(axis=-1)
        return self.out2value(xs) if infer else xs
|
221 |
+
|
222 |
+
|
223 |
+
class PitchPredictor(paddle.nn.Layer):
    """Convolutional predictor that outputs per-frame scores over ``num_bins`` bins.

    ``out2pitch`` turns those scores into one value per frame by taking the
    sigmoid-weighted average bin index and mapping it onto ``[vmin, vmax]``.
    """

    def __init__(
        self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, dropout_rate=0.1
    ):
        """Initialize pitch predictor module.

        Args:
            vmin (float): Value represented by bin 0.
            vmax (float): Value represented by the last bin.
            num_bins (int): Number of output bins.
            deviation (float): Divided by the bin interval and stored as ``sigma``.
            in_dims (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
        """
        super(PitchPredictor, self).__init__()
        self.vmin = vmin
        self.vmax = vmax
        self.interval = (vmax - vmin) / (num_bins - 1)
        self.sigma = deviation / self.interval
        # Bin indices 0..num_bins-1, shaped to broadcast against [B, T, N].
        bin_indices = paddle.arange(end=num_bins).astype(dtype="float32").reshape([1, 1, -1])
        self.register_buffer(name="x", tensor=bin_indices)
        self.base_pitch_embed = paddle.nn.Linear(in_features=1, out_features=in_dims)
        self.kernel_size = kernel_size
        # Only the first block consumes in_dims channels; the rest take n_chans.
        self.conv = paddle.nn.LayerList(
            sublayers=[
                paddle.nn.Sequential(
                    paddle.nn.Conv1D(
                        in_channels=in_dims if layer_no == 0 else n_chans,
                        out_channels=n_chans,
                        kernel_size=kernel_size,
                        stride=1,
                        padding=kernel_size // 2,
                    ),
                    paddle.nn.ReLU(),
                    LayerNorm(n_chans, dim=1),
                    paddle.nn.Dropout(p=dropout_rate),
                )
                for layer_no in range(n_layers)
            ]
        )
        self.linear = paddle.nn.Linear(in_features=n_chans, out_features=num_bins)
        self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096)
        # Scale applied to the positional embedding, initialised to 1.
        alpha_init = paddle.to_tensor(data=[1], dtype="float32")
        self.pos_embed_alpha = paddle.base.framework.EagerParamBase.from_tensor(tensor=alpha_init)

    def bins_to_values(self, bins):
        """Map (possibly fractional) bin indices to values: ``bins * interval + vmin``."""
        return self.vmin + bins * self.interval if False else bins * self.interval + self.vmin

    def out2pitch(self, probs):
        """Reduce per-bin scores [B, T, N] to one value per frame [B, T].

        The scores are squashed with a sigmoid and used as weights for an
        average over the bin indices.
        """
        weights = probs.sigmoid()
        weighted_index_sum = paddle.sum(x=self.x * weights, axis=2)
        weight_total = paddle.sum(x=weights, axis=2)
        return self.bins_to_values(weighted_index_sum / weight_total)

    def forward(self, xs, base):
        """
        :param xs: [B, T, H]
        :param base: [B, T]
        :return: tuple of (``out2pitch`` of the bin scores plus ``base``, the bin scores [B, T, N])
        """
        xs = xs + self.base_pitch_embed(base[..., None])
        # Positional embedding is derived from the first feature channel.
        pos_embed = self.embed_positions(xs[..., 0])
        xs = xs + self.pos_embed_alpha * pos_embed
        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
        for conv_block in self.conv:
            xs = conv_block(xs)
        xs = xs.transpose(perm=paddle_aux.transpose_aux_func(xs.ndim, 1, -1))
        xs = self.linear(xs)
        return self.out2pitch(xs) + base, xs
|
289 |
+
|
290 |
+
|
291 |
+
class RhythmRegulator(paddle.nn.Layer):
    """Rescale phoneme durations so that each word sums to a given word duration."""

    def __init__(self, eps=1e-05):
        super().__init__()
        # Lower bound for the summed input word duration used as a divisor.
        self.eps = eps

    def forward(self, ph_dur, ph2word, word_dur):
        """
        Example (no batch dim version):
            1. ph_dur = [4,2,3,2]
            2. word_dur = [3,4,2], ph2word = [1,2,2,3]
            3. word_dur_in = [4,5,2]
            4. alpha_w = [0.75,0.8,1], alpha_ph = [0.75,0.8,0.8,1]
            5. ph_dur_out = [3,1.6,2.4,2]
        :param ph_dur: [B, T_ph]
        :param ph2word: [B, T_ph]
        :param word_dur: [B, T_w]
        :return: rounded int64 phoneme durations
        """
        # Phonemes mapped to word index 0 contribute no duration.
        ph_dur = ph_dur.astype(dtype="float32") * (ph2word > 0)
        word_dur = word_dur.astype(dtype="float32")

        # Sum phoneme durations per word index, then drop the slot for index 0.
        batch_size = tuple(ph_dur.shape)[0]
        word_slots = paddle.zeros(shape=[batch_size, ph2word.max() + 1], dtype=ph_dur.dtype)
        word_dur_in = word_slots.put_along_axis(axis=1, indices=ph2word, values=ph_dur, reduce="add")[:, 1:]

        # Per-word scale factor, gathered back to phoneme resolution.
        alpha_w = word_dur / word_dur_in.clip(min=self.eps)
        padded_alpha_w = paddle.nn.functional.pad(x=alpha_w, pad=[1, 0], pad_from_left_axis=False)
        alpha_ph = paddle.take_along_axis(arr=padded_alpha_w, axis=1, indices=ph2word, broadcast=False)

        return (ph_dur * alpha_ph).round().astype(dtype="int64")
|
322 |
+
|
323 |
+
|
324 |
+
class LengthRegulator(paddle.nn.Layer):
    """Expand per-token durations into a frame-to-token index map."""

    def forward(self, dur, dur_padding=None, alpha=None):
        """
        Example (no batch dim version):
            1. dur = [2,2,3]
            2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4]
            3. token_mask = [[1,1,0,0,0,0,0],
                             [0,0,1,1,0,0,0],
                             [0,0,0,0,1,1,1]]
            4. token_idx * token_mask = [[1,1,0,0,0,0,0],
                                         [0,0,2,2,0,0,0],
                                         [0,0,0,0,3,3,3]]
            5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3]

        :param dur: Batch of durations of each frame (B, T_txt)
        :param dur_padding: Batch of padding of each frame (B, T_txt)
        :param alpha: duration rescale coefficient
        :return:
            mel2ph (B, T_speech)
        """
        assert alpha is None or alpha > 0
        if alpha is not None:
            scaled = dur.astype(dtype="float32") * alpha
            dur = paddle.round(scaled).astype(dtype="int64")
        if dur_padding is not None:
            # Zero the duration of padded tokens.
            dur = dur * (1 - dur_padding.astype(dtype="int64"))

        num_tokens = tuple(dur.shape)[1]
        token_idx = paddle.arange(start=1, end=num_tokens + 1)[None, :, None].to(dur.place)

        # Each token covers frames in [range_start, range_end).
        range_end = paddle.cumsum(x=dur, axis=1)
        range_start = paddle.concat([paddle.zeros_like(range_end[:, :1]), range_end[:, :-1]], axis=1)

        total_frames = dur.sum(axis=-1).max()
        pos_idx = paddle.arange(end=total_frames)[None, None].to(dur.place)
        after_start = pos_idx >= range_start[:, :, None]
        before_end = pos_idx < range_end[:, :, None]
        token_mask = after_start & before_end
        return (token_idx * token_mask.astype(dtype="int64")).sum(axis=1)
|
359 |
+
|
360 |
+
|
361 |
+
class StretchRegulator(paddle.nn.Layer):
    """Compute, for every frame, its relative position inside its own token."""

    def forward(self, mel2ph, dur=None):
        """
        Example (no batch dim version):
            1. dur = [2,4,3]
            2. mel2ph = [1,1,2,2,2,2,3,3,3]
            3. mel2dur = [2,2,4,4,4,4,3,3,3]
            4. bound_mask = [0,1,0,0,0,1,0,0,1]
            5. 1 - bound_mask * mel2dur = [1,-1,1,1,1,-3,1,1,-2] => shift right => [0,1,-1,1,1,1,-3,1,1]
            6. stretch_denorm = [0,1,0,1,2,3,0,1,2]

        :param dur: Batch of durations of each frame (B, T_txt); derived from
            ``mel2ph`` with ``mel2ph_to_dur`` when omitted
        :param mel2ph: Batch of mel2ph (B, T_speech)
        :return:
            stretch (B, T_speech)
        """
        if dur is None:
            dur = mel2ph_to_dur(mel2ph, mel2ph.max())
        # Prepend a duration of 1 for index 0 so frames with mel2ph == 0 do not
        # divide by zero below; those frames are zeroed in the return value.
        dur = paddle.nn.functional.pad(x=dur, pad=[1, 0], value=1, pad_from_left_axis=False)
        mel2dur = paddle.take_along_axis(arr=dur, axis=1, indices=mel2ph, broadcast=False)
        # True on the last frame of each token (the next frame has a larger index).
        # The slices are already tensors, so they are compared directly without
        # an extra paddle.to_tensor copy.
        bound_mask = paddle.greater_than(x=mel2ph[:, 1:], y=mel2ph[:, :-1])
        bound_mask = paddle.nn.functional.pad(
            x=bound_mask, pad=[0, 1], mode="constant", value=True, pad_from_left_axis=False
        )
        stretch_delta = 1 - bound_mask * mel2dur
        # Shift right by one frame (insert a leading 0, drop the last element).
        # Written as a concat, the same way LengthRegulator builds
        # dur_cumsum_prev, instead of relying on a negative pad width.
        stretch_delta = paddle.concat(
            [paddle.zeros_like(stretch_delta[:, :1]), stretch_delta[:, :-1]], axis=1
        )
        stretch_denorm = paddle.cumsum(x=stretch_delta, axis=1)
        stretch = stretch_denorm / mel2dur
        return stretch * (mel2ph > 0)
|
392 |
+
|
393 |
+
|
394 |
+
def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
    """Count how many frames point at each token index.

    :param mel2ph: 2-D index tensor (B, T_speech); index 0 is counted but dropped
    :param T_txt: number of token slots to produce (indices 1..T_txt)
    :param max_dur: optional upper bound applied to every count
    :return: counts for indices 1..T_txt, shape (B, T_txt)
    """
    batch_size, _ = tuple(mel2ph.shape)
    counts = paddle.zeros(shape=[batch_size, T_txt + 1], dtype=mel2ph.dtype)
    counts = counts.put_along_axis(axis=1, indices=mel2ph, values=paddle.ones_like(x=mel2ph), reduce="add")
    # Column 0 collects the frames with index 0 and is discarded.
    dur = counts[:, 1:]
    if max_dur is None:
        return dur
    return dur.clip(max=max_dur)
|
403 |
+
|
404 |
+
|
405 |
+
class FastSpeech2Encoder(paddle.nn.Layer):
    """Stack of ``TransformerEncoderLayer`` blocks with optional positional embedding."""

    def __init__(
        self,
        hidden_size,
        num_layers,
        ffn_kernel_size=9,
        ffn_act="gelu",
        dropout=None,
        num_heads=2,
        use_pos_embed=True,
        rel_pos=True,
    ):
        """Build the encoder.

        ``rel_pos`` selects ``RelPositionalEncoding`` over
        ``SinusoidalPositionalEmbedding``; either one is only applied when
        ``use_pos_embed`` is true.
        """
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.use_pos_embed = use_pos_embed

        self.layers = paddle.nn.LayerList()
        for _ in range(self.num_layers):
            self.layers.append(
                TransformerEncoderLayer(
                    self.hidden_size, self.dropout, kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads
                )
            )
        self.layer_norm = paddle.nn.LayerNorm(normalized_shape=hidden_size)
        self.embed_scale = math.sqrt(hidden_size)
        self.padding_idx = 0
        self.rel_pos = rel_pos
        if not self.rel_pos:
            self.embed_positions = SinusoidalPositionalEmbedding(
                hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS
            )
        else:
            self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0)

    def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None):
        """Scale ``main_embed``, add the optional extras and positions, then apply dropout."""
        x = self.embed_scale * main_embed
        if extra_embed is not None:
            x = x + extra_embed
        if self.use_pos_embed:
            if self.rel_pos:
                x = self.embed_positions(x)
            else:
                x = x + self.embed_positions(~padding_mask)
        return paddle.nn.functional.dropout(x=x, p=self.dropout, training=self.training)

    def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_hiddens=False):
        """Encode the embedded sequence.

        The padding mask is inverted into a float keep-mask and multiplied in
        after the embedding, after every layer and after the final layer norm.
        With ``return_hiddens`` the stacked per-layer outputs are returned
        (axes 1 and 2 swapped) instead of the final normalized output (axes 0
        and 1 swapped back).
        """
        x = self.forward_embedding(main_embed, extra_embed, padding_mask=padding_mask)

        # Keep-mask in the transposed (axes 0 and 1 swapped) layout with a trailing unit axis.
        padding_TB = padding_mask.transpose(perm=paddle_aux.transpose_aux_func(padding_mask.ndim, 0, 1))
        nonpadding_mask_TB = 1 - padding_TB.astype(dtype="float32")[:, :, None]

        x = x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 0, 1)) * nonpadding_mask_TB
        hiddens = []
        for encoder_layer in self.layers:
            x = encoder_layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
            hiddens.append(x)
        x = self.layer_norm(x) * nonpadding_mask_TB

        if not return_hiddens:
            return x.transpose(perm=paddle_aux.transpose_aux_func(x.ndim, 0, 1))
        stacked = paddle.stack(x=hiddens, axis=0)
        return stacked.transpose(perm=paddle_aux.transpose_aux_func(stacked.ndim, 1, 2))
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/fastspeech/variance_encoder.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
|
19 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
20 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
21 |
+
NormalInitEmbedding as Embedding,
|
22 |
+
)
|
23 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
24 |
+
XavierUniformInitLinear as Linear,
|
25 |
+
)
|
26 |
+
from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
|
27 |
+
DurationPredictor,
|
28 |
+
FastSpeech2Encoder,
|
29 |
+
)
|
30 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
31 |
+
from paddlemix.models.diffsinger.utils.text_encoder import PAD_INDEX
|
32 |
+
|
33 |
+
|
34 |
+
class FastSpeech2Variance(paddle.nn.Layer):
    """Text encoder for the variance model, with an optional duration predictor.

    All sizes and switches are read from the global ``hparams``. When
    ``hparams["predict_dur"]` is true the encoder works in "word" mode and a
    ``DurationPredictor`` is attached; otherwise it works in "phoneme" mode.
    """

    def __init__(self, vocab_size):
        super().__init__()
        hidden_size = hparams["hidden_size"]
        self.predict_dur = hparams["predict_dur"]
        self.linguistic_mode = "word" if hparams["predict_dur"] else "phoneme"
        self.txt_embed = Embedding(vocab_size, hidden_size, PAD_INDEX)
        if not self.predict_dur:
            self.ph_dur_embed = Linear(1, hidden_size)
        else:
            self.onset_embed = Embedding(2, hidden_size)
            self.word_dur_embed = Linear(1, hidden_size)
        self.encoder = FastSpeech2Encoder(
            hidden_size=hidden_size,
            num_layers=hparams["enc_layers"],
            ffn_kernel_size=hparams["enc_ffn_kernel_size"],
            ffn_act=hparams["ffn_act"],
            dropout=hparams["dropout"],
            num_heads=hparams["num_heads"],
            use_pos_embed=hparams["use_pos_embed"],
            rel_pos=hparams["rel_pos"],
        )
        # Read unconditionally, exactly as before, even when no predictor is built.
        dur_hparams = hparams["dur_prediction_args"]
        if self.predict_dur:
            self.midi_embed = Embedding(128, hidden_size)
            self.dur_predictor = DurationPredictor(
                in_dims=hidden_size,
                n_chans=dur_hparams["hidden_size"],
                n_layers=dur_hparams["num_layers"],
                dropout_rate=dur_hparams["dropout"],
                kernel_size=dur_hparams["kernel_size"],
                offset=dur_hparams["log_offset"],
                dur_loss_type=dur_hparams["loss_type"],
            )

    def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_embed=None, infer=True):
        """
        :param txt_tokens: (train, infer) [B, T_ph]
        :param midi: (train, infer) [B, T_ph]
        :param ph2word: (train, infer) [B, T_ph]
        :param ph_dur: (train, [infer]) [B, T_ph]
        :param word_dur: (infer) [B, T_w]
        :param spk_embed: (train) [B, T_ph, H]
        :param infer: whether inference
        :return: encoder_out, ph_dur_pred (``None`` when durations are not predicted)
        """
        txt_embed = self.txt_embed(txt_tokens)
        if self.linguistic_mode == "word":
            batch_size = tuple(txt_tokens.shape)[0]
            # A phoneme is an onset when its word index is larger than the previous one.
            leading_zeros = paddle.zeros(shape=[batch_size, 1], dtype=ph2word.dtype)
            onset = paddle.diff(x=ph2word, axis=1, prepend=leading_zeros) > 0
            onset_embed = self.onset_embed(onset.astype(dtype="int64"))
            if word_dur is None or not infer:
                # Sum phoneme durations per word index, dropping the slot for index 0.
                word_slots = paddle.zeros(shape=[batch_size, ph2word.max() + 1], dtype=ph_dur.dtype)
                word_dur = word_slots.put_along_axis(axis=1, indices=ph2word, values=ph_dur, reduce="add")[:, 1:]
            # Gather each phoneme's word duration (index 0 reads the padded 0).
            padded_word_dur = paddle.nn.functional.pad(x=word_dur, pad=[1, 0], value=0, pad_from_left_axis=False)
            word_dur = paddle.take_along_axis(arr=padded_word_dur, axis=1, indices=ph2word, broadcast=False)
            word_dur_embed = self.word_dur_embed(word_dur.astype(dtype="float32")[:, :, None])
            extra_embed = onset_embed + word_dur_embed
        else:
            extra_embed = self.ph_dur_embed(ph_dur.astype(dtype="float32")[:, :, None])
        encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0)

        if not self.predict_dur:
            return encoder_out, None
        dur_cond = encoder_out + self.midi_embed(midi)
        if spk_embed is not None:
            dur_cond += spk_embed
        ph_dur_pred = self.dur_predictor(dur_cond, x_masks=txt_tokens == PAD_INDEX, infer=infer)
        return encoder_out, ph_dur_pred
|
108 |
+
|
109 |
+
|
110 |
+
class MelodyEncoder(paddle.nn.Layer):
    """Encode note-level MIDI, duration and optional glide information.

    Encoder settings are looked up in ``enc_hparams`` first and fall back to
    the global ``hparams``; glide settings and the output size always come
    from the global ``hparams``.
    """

    def __init__(self, enc_hparams: dict):
        super().__init__()

        def get_hparam(key):
            # Local override wins; the global value is the fallback.
            fallback = hparams.get(key)
            return enc_hparams.get(key, fallback)

        hidden_size = get_hparam("hidden_size")
        self.note_midi_embed = Linear(1, hidden_size)
        self.note_dur_embed = Linear(1, hidden_size)
        self.use_glide_embed = hparams["use_glide_embed"]
        self.glide_embed_scale = hparams["glide_embed_scale"]
        if self.use_glide_embed:
            num_glide_embeddings = len(hparams["glide_types"]) + 1
            self.note_glide_embed = Embedding(num_glide_embeddings, hidden_size, padding_idx=0)
        self.encoder = FastSpeech2Encoder(
            hidden_size=hidden_size,
            num_layers=get_hparam("enc_layers"),
            ffn_kernel_size=get_hparam("enc_ffn_kernel_size"),
            ffn_act=get_hparam("ffn_act"),
            dropout=get_hparam("dropout"),
            num_heads=get_hparam("num_heads"),
            use_pos_embed=get_hparam("use_pos_embed"),
            rel_pos=get_hparam("rel_pos"),
        )
        self.out_proj = Linear(hidden_size, hparams["hidden_size"])

    def forward(self, note_midi, note_rest, note_dur, glide=None):
        """
        :param note_midi: float32 [B, T_n], -1: padding
        :param note_rest: bool [B, T_n]
        :param note_dur: int64 [B, T_n]
        :param glide: int64 [B, T_n]
        :return: [B, T_n, H]
        """
        # Rest notes contribute no MIDI embedding.
        not_rest = ~note_rest[:, :, None]
        midi_embed = self.note_midi_embed(note_midi[:, :, None]) * not_rest
        dur_embed = self.note_dur_embed(note_dur.astype(dtype="float32")[:, :, None])
        ornament_embed = 0
        if self.use_glide_embed:
            ornament_embed = ornament_embed + self.note_glide_embed(glide) * self.glide_embed_scale
        # Negative MIDI values mark padding positions.
        hidden = self.encoder(midi_embed, dur_embed + ornament_embed, padding_mask=note_midi < 0)
        return self.out_proj(hidden)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/__init__.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import pathlib
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
import yaml
|
19 |
+
|
20 |
+
from .nets import CascadedNet
|
21 |
+
|
22 |
+
|
23 |
+
class DotDict(dict):
    """``dict`` whose keys can also be read, set and deleted as attributes.

    Reading a missing key through attribute access returns ``None`` rather
    than raising. Values that are exactly of type ``dict`` are wrapped in a
    new ``DotDict`` on every read.
    """

    def __getattr__(self, name, *default):
        # Only reached when normal attribute lookup fails, so real dict
        # methods keep working.
        value = dict.get(self, name, *default)
        # Exact type check on purpose: subclasses of dict are returned as-is.
        if type(value) is dict:
            return DotDict(value)
        return value

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
|
30 |
+
|
31 |
+
|
32 |
+
def load_sep_model(model_path, device="cpu"):
    """Build a ``CascadedNet`` from a weights file and its sibling ``config.yaml``.

    :param model_path: path to the saved model weights; ``config.yaml`` is
        read from the same directory
    :param device: device passed to ``model.to``
    :return: the model with weights loaded, switched to eval mode
    """
    model_path = pathlib.Path(model_path)
    config_file = model_path.with_name("config.yaml")
    # Explicit encoding so the config is decoded the same way on every
    # platform instead of depending on the locale default.
    with open(config_file, "r", encoding="utf-8") as config:
        args = yaml.safe_load(config)
    args = DotDict(args)
    model = CascadedNet(args.n_fft, args.hop_length, args.n_out, args.n_out_lstm, True, is_mono=args.is_mono)
    model.to(device)
    model.set_state_dict(state_dict=paddle.load(path=str(model_path)))
    model.eval()
    return model
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/layers.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
import paddle_aux
|
19 |
+
|
20 |
+
|
21 |
+
def crop_center(h1, h2):
    """Center-crop ``h1`` along axis 3 to the axis-3 size of ``h2``.

    Returns ``h1`` itself when the sizes already match.

    :raises ValueError: if ``h1`` is smaller than ``h2`` along axis 3
    """
    shape1 = tuple(h1.shape)
    shape2 = tuple(h2.shape)
    width1, width2 = shape1[3], shape2[3]
    if width1 == width2:
        return h1
    if width1 < width2:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
    # Split the surplus evenly; an odd surplus leaves the extra element on the right.
    start = (width1 - width2) // 2
    return h1[:, :, :, start : start + width2]
|
32 |
+
|
33 |
+
|
34 |
+
class Conv2DBNActiv(paddle.nn.Layer):
    """Bias-free 2-D convolution followed by batch norm and an activation.

    ``activ`` is an activation *class*; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=paddle.nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        convolution = paddle.nn.Conv2D(
            in_channels=nin,
            out_channels=nout,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            dilation=dilation,
            bias_attr=False,
        )
        batch_norm = paddle.nn.BatchNorm2D(num_features=nout)
        activation = activ()
        self.conv = paddle.nn.Sequential(convolution, batch_norm, activation)

    def forward(self, x):
        """Apply convolution, batch norm and activation in sequence."""
        pipeline = self.conv
        return pipeline(x)
|
53 |
+
|
54 |
+
|
55 |
+
class Encoder(paddle.nn.Layer):
    """Two stacked ``Conv2DBNActiv`` blocks.

    Only the first block uses the given ``stride``; the second always uses
    stride 1.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=paddle.nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

    def forward(self, x):
        """Run ``x`` through ``conv1`` and then ``conv2``."""
        return self.conv2(self.conv1(x))
|
65 |
+
|
66 |
+
|
67 |
+
class Decoder(paddle.nn.Layer):
    """Upsample by 2, optionally merge a skip connection, then convolve.

    The ``stride`` argument is accepted but the convolution always uses
    stride 1.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=paddle.nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        if dropout:
            self.dropout = paddle.nn.Dropout2D(p=0.1)
        else:
            self.dropout = None

    def forward(self, x, skip=None):
        """Upsample ``x`` bilinearly and concatenate the center-cropped ``skip`` on axis 1 if given."""
        upsampled = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # Crop the skip tensor along axis 3 to match the upsampled tensor.
            cropped_skip = crop_center(skip, upsampled)
            upsampled = paddle.concat(x=[upsampled, cropped_skip], axis=1)
        out = self.conv1(upsampled)
        if self.dropout is None:
            return out
        return self.dropout(out)
|
82 |
+
|
83 |
+
|
84 |
+
class Mean(paddle.nn.Layer):
    """Layer wrapper around ``Tensor.mean`` along a fixed axis.

    Args:
        dim: axis (or axes) to reduce over.
        keepdims: whether the reduced axis is kept with size 1.
    """

    def __init__(self, dim, keepdims=False):
        super(Mean, self).__init__()
        self.dim = dim
        self.keepdims = keepdims

    def forward(self, x):
        """Return the mean of ``x`` over ``self.dim``."""
        # Paddle's documented keyword is ``keepdim`` (singular); the original
        # ``keepdims=`` spelling is the NumPy/PyTorch-alias form.
        return x.mean(axis=self.dim, keepdim=self.keepdims)
|
92 |
+
|
93 |
+
|
94 |
+
class ASPPModule(paddle.nn.Layer):
    """Five parallel branches concatenated on channels and fused by a 1x1 conv.

    Branches: a mean over axis -2 followed by a 1x1 conv (tiled back to the
    input height), a plain 1x1 conv, and three 3x3 convs whose padding and
    dilation are taken from ``dilations[0..2]``. With ``dropout`` true, a
    ``Dropout2D(p=0.1)`` is applied to the fused output.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=paddle.nn.ReLU, dropout=False):
        super().__init__()

        def dilated_branch(rate):
            # 3x3 conv with padding == dilation == rate.
            return Conv2DBNActiv(nin, nout, 3, 1, rate, rate, activ=activ)

        pooled_conv = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        self.conv1 = paddle.nn.Sequential(Mean(dim=-2, keepdims=True), pooled_conv)
        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        self.conv3 = dilated_branch(dilations[0])
        self.conv4 = dilated_branch(dilations[1])
        self.conv5 = dilated_branch(dilations[2])
        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
        self.dropout = paddle.nn.Dropout2D(p=0.1) if dropout else None

    def forward(self, x):
        """Compute all five branches on 4-D ``x`` and fuse them."""
        _, _, height, _ = tuple(x.shape)
        # The pooled branch has size 1 on axis 2; tile it back to the input height.
        pooled = self.conv1(x).tile(repeat_times=[1, 1, height, 1])
        branches = [pooled]
        for branch in (self.conv2, self.conv3, self.conv4, self.conv5):
            branches.append(branch(x))
        fused = self.bottleneck(paddle.concat(x=branches, axis=1))
        if self.dropout is None:
            return fused
        return self.dropout(fused)
|
117 |
+
|
118 |
+
|
119 |
+
class LSTMModule(paddle.nn.Layer):
    """Collapse channels with a 1x1 conv, run a bidirectional LSTM, project back.

    Args:
        nin_conv: number of input channels of the 4-D input.
        nin_lstm: LSTM input size; must equal the input's axis-2 size
            (``nbins``) for the final reshape to succeed.
        nout_lstm: total LSTM output size (each direction gets half).
    """

    def __init__(self, nin_conv, nin_lstm, nout_lstm):
        super(LSTMModule, self).__init__()
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
        self.lstm = paddle.nn.LSTM(
            input_size=nin_lstm, hidden_size=nout_lstm // 2, time_major=True, direction="bidirect"
        )
        self.dense = paddle.nn.Sequential(
            paddle.nn.Linear(in_features=nout_lstm, out_features=nin_lstm),
            paddle.nn.BatchNorm1D(num_features=nin_lstm),
            paddle.nn.ReLU(),
        )

    def forward(self, x):
        """Map ``x`` of shape (N, C, nbins, nframes) to (N, 1, nbins, nframes)."""
        N, _, nbins, nframes = tuple(x.shape)
        h = self.conv(x)[:, 0]  # single output channel dropped: (N, nbins, nframes)
        h = h.transpose(perm=[2, 0, 1])  # time-major for the LSTM: (nframes, N, nbins)
        h, _ = self.lstm(h)
        # Shapes are passed as lists: native ``Tensor.reshape`` takes a single
        # ``shape`` argument, so the varargs form only works when patched.
        h = self.dense(h.reshape([-1, tuple(h.shape)[-1]]))
        h = h.reshape([nframes, N, 1, nbins])
        h = h.transpose(perm=[1, 2, 3, 0])
        return h
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/hnsep/vr/nets.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
|
19 |
+
from . import layers
|
20 |
+
|
21 |
+
|
22 |
+
class BaseNet(paddle.nn.Layer):
    """Encoder/decoder network with an ASPP bottleneck and an LSTM branch.

    Five encoder stages feed an ``ASPPModule``; three decoders consume skip
    connections from ``enc4``..``enc2``. The ``LSTMModule`` output is
    concatenated on channels before the last decoder, which uses ``enc1`` as
    its skip input.
    """

    def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
        super().__init__()
        width = nout
        self.enc1 = layers.Conv2DBNActiv(nin, width, 3, 1, 1)
        self.enc2 = layers.Encoder(width, width * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(width * 2, width * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(width * 4, width * 6, 3, 2, 1)
        self.enc5 = layers.Encoder(width * 6, width * 8, 3, 2, 1)
        self.aspp = layers.ASPPModule(width * 8, width * 8, dilations, dropout=True)
        # Decoder input widths are (skip channels + incoming channels).
        self.dec4 = layers.Decoder(width * (6 + 8), width * 6, 3, 1, 1)
        self.dec3 = layers.Decoder(width * (4 + 6), width * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(width * (2 + 4), width * 2, 3, 1, 1)
        self.lstm_dec2 = layers.LSTMModule(width * 2, nin_lstm, nout_lstm)
        # "+ 1" accounts for the single-channel LSTM branch concatenated in forward.
        self.dec1 = layers.Decoder(width * (1 + 2) + 1, width * 1, 3, 1, 1)

    def forward(self, x):
        """Run the full encoder -> ASPP -> decoder pass on ``x``."""
        skip1 = self.enc1(x)
        skip2 = self.enc2(skip1)
        skip3 = self.enc3(skip2)
        skip4 = self.enc4(skip3)
        bottom = self.enc5(skip4)

        out = self.aspp(bottom)
        out = self.dec4(out, skip4)
        out = self.dec3(out, skip3)
        out = self.dec2(out, skip2)
        lstm_feat = self.lstm_dec2(out)
        out = paddle.concat(x=[out, lstm_feat], axis=1)
        return self.dec1(out, skip1)
|
50 |
+
|
51 |
+
|
52 |
+
class CascadedNet(paddle.nn.Layer):
    """Three-stage cascaded mask estimator built from ``BaseNet`` stages.

    Stages 1 and 2 process the lower and upper halves of the frequency axis
    (axis 2) separately; stage 3 processes the full band together with the
    stage-1 and stage-2 outputs. ``forward`` returns a mask that ``predict``
    and ``predict_from_audio`` multiply with the input spectrogram.

    Args:
        n_fft: FFT size; also determines ``max_bin`` and ``output_bin``.
        hop_length: STFT hop length.
        nout: base channel width of the stages.
        nout_lstm: LSTM output size used by the stages.
        is_complex: treat the input as complex (real and imaginary parts are
            concatenated on the channel axis) and emit a complex mask.
        is_mono: halve the number of input/output channels.
    """

    def __init__(self, n_fft, hop_length, nout=32, nout_lstm=128, is_complex=False, is_mono=False):
        super(CascadedNet, self).__init__()
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.is_complex = is_complex
        self.is_mono = is_mono
        # Non-persistable buffer: the window is not saved in the state dict.
        self.register_buffer(
            name="window",
            tensor=paddle.audio.functional.get_window("hann", n_fft).astype("float32"),
            persistable=False,
        )
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1
        self.nin_lstm = self.max_bin // 2
        # Number of frames trimmed from each side by predict / predict_mask.
        self.offset = 64
        nin = 4 if is_complex else 2
        if is_mono:
            nin = nin // 2
        self.stg1_low_band_net = paddle.nn.Sequential(
            BaseNet(nin, nout // 2, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0)
        )
        self.stg1_high_band_net = BaseNet(nin, nout // 4, self.nin_lstm // 2, nout_lstm // 2)
        self.stg2_low_band_net = paddle.nn.Sequential(
            BaseNet(nout // 4 + nin, nout, self.nin_lstm // 2, nout_lstm),
            layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
        )
        self.stg2_high_band_net = BaseNet(nout // 4 + nin, nout // 2, self.nin_lstm // 2, nout_lstm // 2)
        self.stg3_full_band_net = BaseNet(3 * nout // 4 + nin, nout, self.nin_lstm, nout_lstm)
        self.out = paddle.nn.Conv2D(in_channels=nout, out_channels=nin, kernel_size=1, bias_attr=False)
        self.aux_out = paddle.nn.Conv2D(in_channels=3 * nout // 4, out_channels=nin, kernel_size=1, bias_attr=False)

    def forward(self, x):
        """Estimate a mask for the 4-D spectrogram ``x``.

        Only the first ``max_bin`` entries of axis 2 are processed; the mask
        is then padded (``replicate`` mode) by ``output_bin - mask.shape[2]``.
        """
        if self.is_complex:
            x = paddle.concat(x=[x.real(), x.imag()], axis=1)
        x = x[:, :, : self.max_bin]
        bandw = tuple(x.shape)[2] // 2
        l1_in = x[:, :, :bandw]
        h1_in = x[:, :, bandw:]
        # Stage 1: independent low/high band estimates.
        l1 = self.stg1_low_band_net(l1_in)
        h1 = self.stg1_high_band_net(h1_in)
        aux1 = paddle.concat(x=[l1, h1], axis=2)
        # Stage 2: each band sees its input plus its stage-1 output.
        l2_in = paddle.concat(x=[l1_in, l1], axis=1)
        h2_in = paddle.concat(x=[h1_in, h1], axis=1)
        l2 = self.stg2_low_band_net(l2_in)
        h2 = self.stg2_high_band_net(h2_in)
        aux2 = paddle.concat(x=[l2, h2], axis=2)
        # Stage 3: full band with both auxiliary outputs on the channel axis.
        f3_in = paddle.concat(x=[x, aux1, aux2], axis=1)
        f3 = self.stg3_full_band_net(f3_in)
        if self.is_complex:
            mask = self.out(f3)
            # First half of the channels is the real part, second half the imaginary part.
            if self.is_mono:
                mask = paddle.complex(real=mask[:, :1], imag=mask[:, 1:])
            else:
                mask = paddle.complex(real=mask[:, :2], imag=mask[:, 2:])
            mask = self.bounded_mask(mask)
        else:
            mask = paddle.nn.functional.sigmoid(x=self.out(f3))
        mask = paddle.nn.functional.pad(
            x=mask, pad=(0, 0, 0, self.output_bin - tuple(mask.shape)[2]), mode="replicate", pad_from_left_axis=False
        )
        return mask

    def bounded_mask(self, mask, eps=1e-08):
        """Rescale ``mask`` so its magnitude becomes ``tanh(|mask|)``; ``eps`` avoids division by zero."""
        mask_mag = paddle.abs(x=mask)
        mask = paddle.nn.functional.tanh(x=mask_mag) * mask / (mask_mag + eps)
        return mask

    def predict_mask(self, x):
        """Return ``forward(x)`` with ``offset`` entries trimmed from both ends of axis 3."""
        mask = self.forward(x)
        if self.offset > 0:
            mask = mask[:, :, :, self.offset : -self.offset]
            assert tuple(mask.shape)[3] > 0
        return mask

    def predict(self, x):
        """Return ``x * forward(x)`` with ``offset`` entries trimmed from both ends of axis 3."""
        mask = self.forward(x)
        pred = x * mask
        if self.offset > 0:
            pred = pred[:, :, :, self.offset : -self.offset]
            assert tuple(pred.shape)[3] > 0
        return pred

    def _stft(self, x):
        """Run the STFT shared by ``audio2spec`` and ``predict_from_audio`` on a (batch, time) signal."""
        return paddle.signal.stft(
            x,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            return_complex=True,
            window=self.window,
            pad_mode="constant",
        )

    def audio2spec(self, x, use_pad=False):
        """Convert audio ``x`` of shape (B, C, T) to a spectrogram of shape (B, C, bins, frames).

        With ``use_pad`` the signal is padded first; the padding amount is
        derived from the frame count rounded up to a multiple of 32.
        """
        B, C, T = tuple(x.shape)
        # Shapes are passed as lists: native ``Tensor.reshape`` takes a single
        # ``shape`` argument, so the varargs form only works when patched.
        x = x.reshape([B * C, T])
        if use_pad:
            n_frames = T // self.hop_length + 1
            T_pad = (32 * ((n_frames - 1) // 32 + 1) - 1) * self.hop_length - T
            nl_pad = T_pad // 2 // self.hop_length
            Tl_pad = nl_pad * self.hop_length
            x = paddle.nn.functional.pad(x=x, pad=(Tl_pad, T_pad - Tl_pad), pad_from_left_axis=False)
        spec = self._stft(x)
        spec = spec.reshape([B, C, tuple(spec.shape)[-2], tuple(spec.shape)[-1]])
        return spec

    def spec2audio(self, x):
        """Invert a spectrogram ``x`` of shape (B, C, N, T) back to audio of shape (B, C, samples)."""
        B, C, N, T = tuple(x.shape)
        x = x.reshape([-1, N, T])
        x = paddle.signal.istft(x=x, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window)
        x = x.reshape([B, C, -1])
        return x

    def predict_from_audio(self, x):
        """Pad, transform, mask and invert audio ``x`` of shape (B, C, T); returns shape (B, C, T)."""
        B, C, T = tuple(x.shape)
        x = x.reshape([B * C, T])
        n_frames = T // self.hop_length + 1
        # NOTE(review): this rounding differs from audio2spec, which uses
        # (n_frames - 1) // 32 + 1 — kept as in the original; confirm intent.
        T_pad = (32 * (n_frames // 32 + 1) - 1) * self.hop_length - T
        nl_pad = T_pad // 2 // self.hop_length
        Tl_pad = nl_pad * self.hop_length
        x = paddle.nn.functional.pad(x=x, pad=(Tl_pad, T_pad - Tl_pad), pad_from_left_axis=False)
        spec = self._stft(x)
        spec = spec.reshape([B, C, tuple(spec.shape)[-2], tuple(spec.shape)[-1]])
        mask = self.forward(spec)
        spec_pred = spec * mask
        spec_pred = spec_pred.reshape([B * C, tuple(spec.shape)[-2], tuple(spec.shape)[-1]])
        x_pred = paddle.signal.istft(x=spec_pred, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window)
        # Drop the left padding and keep exactly the original T samples.
        x_pred = x_pred[:, Tl_pad : Tl_pad + T]
        x_pred = x_pred.reshape([B, C, T])
        return x_pred
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/env.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
|
16 |
+
class AttrDict(dict):
    """A dictionary with attribute-style access.

    Attribute reads, writes and deletes are mapped onto the dictionary's
    items, so ``d.key`` and ``d["key"]`` are interchangeable.

    A missing attribute raises ``AttributeError`` (not ``KeyError``) so that
    ``hasattr``, ``getattr(obj, name, default)``, ``copy.deepcopy`` and
    ``pickle`` — which all probe for optional attributes — work correctly.
    Item access (``d["missing"]``) still raises ``KeyError``.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)

    def __getstate__(self):
        # Return a list, not a dict_items view: views cannot be pickled.
        return list(self.__dict__.items())

    def __setstate__(self, items):
        for key, val in items:
            self.__dict__[key] = val

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, dict.__repr__(self))

    def __setitem__(self, key, value):
        return super(AttrDict, self).__setitem__(key, value)

    def __getitem__(self, name):
        return super(AttrDict, self).__getitem__(name)

    def __delitem__(self, name):
        return super(AttrDict, self).__delitem__(name)

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails; translate the
        # KeyError into the exception type the attribute protocol expects.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = __setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def copy(self):
        """Return a shallow copy that is itself an ``AttrDict``."""
        return AttrDict(self)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/models.py
ADDED
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import json
|
16 |
+
import pathlib
|
17 |
+
import sys
|
18 |
+
import numpy as np
|
19 |
+
import paddle
|
20 |
+
import paddle.nn.functional as F
|
21 |
+
|
22 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
23 |
+
from paddle.nn.utils import remove_weight_norm, weight_norm
|
24 |
+
|
25 |
+
from .env import AttrDict
|
26 |
+
from .utils import get_padding, init_weights
|
27 |
+
|
28 |
+
LRELU_SLOPE = 0.1
|
29 |
+
|
30 |
+
|
31 |
+
def load_model(model_path: pathlib.Path):
    """Build a ``Generator`` from a checkpoint and its sibling ``config.json``.

    Args:
        model_path: path to the checkpoint file; ``config.json`` is read from
            the same directory.

    Returns:
        ``(generator, h)``: the generator in eval mode with weight norm
        removed, and the configuration as an ``AttrDict``.
    """
    config_file = model_path.with_name("config.json")
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(config_file, encoding="utf-8") as f:
        json_config = json.load(f)
    h = AttrDict(json_config)
    generator = Generator(h)
    cp_dict = paddle.load(path=str(model_path))
    generator.set_state_dict(state_dict=cp_dict["generator"])
    generator.eval()
    generator.remove_weight_norm()
    # Drop the reference to the loaded checkpoint dict once the weights are copied.
    del cp_dict
    return generator, h
|
44 |
+
|
45 |
+
|
46 |
+
class ResBlock1(paddle.nn.Layer):
    """Residual block of three (dilated conv, plain conv) pairs.

    ``convs1`` holds weight-normalised Conv1D layers using ``dilation[0..2]``;
    ``convs2`` holds three more with dilation 1. Each pair is applied with a
    leaky ReLU before each conv and a residual connection around the pair.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h

        def normed_conv(rate):
            # Weight-normalised Conv1D that keeps the channel count.
            return paddle.nn.utils.weight_norm(
                layer=paddle.nn.Conv1D(
                    in_channels=channels,
                    out_channels=channels,
                    kernel_size=kernel_size,
                    stride=1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs1 = paddle.nn.LayerList(sublayers=[normed_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)
        self.convs2 = paddle.nn.LayerList(sublayers=[normed_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        """Apply the three residual conv pairs to ``x``."""
        for dilated, plain in zip(self.convs1, self.convs2):
            branch = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
            branch = dilated(branch)
            branch = paddle.nn.functional.leaky_relu(x=branch, negative_slope=LRELU_SLOPE)
            branch = plain(branch)
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every conv in both lists."""
        for group in (self.convs1, self.convs2):
            for conv in group:
                paddle.nn.utils.remove_weight_norm(layer=conv)
|
135 |
+
|
136 |
+
|
137 |
+
class ResBlock2(paddle.nn.Layer):
    """Residual block of two weight-normalised dilated Conv1D layers.

    Each conv uses ``dilation[0]`` / ``dilation[1]`` respectively and is
    preceded by a leaky ReLU, with a residual connection around it.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h

        def normed_conv(rate):
            # Weight-normalised Conv1D that keeps the channel count.
            return paddle.nn.utils.weight_norm(
                layer=paddle.nn.Conv1D(
                    in_channels=channels,
                    out_channels=channels,
                    kernel_size=kernel_size,
                    stride=1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs = paddle.nn.LayerList(sublayers=[normed_conv(dilation[idx]) for idx in range(2)])
        self.convs.apply(init_weights)

    def forward(self, x):
        """Apply both residual conv steps to ``x``."""
        for conv in self.convs:
            branch = conv(paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every conv."""
        for conv in self.convs:
            paddle.nn.utils.remove_weight_norm(layer=conv)
|
177 |
+
|
178 |
+
|
179 |
+
class SineGen(paddle.nn.Layer):
    """Sine-wave source generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    """

    def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # Fundamental plus overtones.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a tensor shaped like ``f0`` that is 1 where ``f0`` exceeds the voiced threshold, else 0."""
        uv = paddle.ones_like(x=f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def _f02sine(self, f0, upp):
        """Generate sine waves for the fundamental and its overtones.

        f0: (batchsize, length, 1) as produced by ``forward``
        upp: integer upsampling factor (output samples per F0 step)
        Returns a tensor of shape (batchsize, length * upp, dim).
        """
        # Phase increment (in cycles) at each of the ``upp`` sub-steps of every F0 step.
        rad = f0 / self.sampling_rate * paddle.arange(start=1, end=upp + 1, dtype="float32")
        # Phase at the end of each step, wrapped into [-0.5, 0.5).
        step_end = rad[..., -1:].astype(dtype="float32") + 0.5
        rad2 = paddle.mod(x=step_end, y=paddle.to_tensor(1.0, dtype=step_end.dtype)) - 0.5
        rad_acc = rad2.cumsum(axis=1).mod(y=paddle.to_tensor(1.0)).to(f0)
        # Shift the accumulated phase right by one step (zero for the first
        # step) so each step starts where the previous one ended.
        rad_shifted = paddle.concat([paddle.zeros_like(rad_acc[:, :1]), rad_acc[:, :-1]], axis=1)
        rad += rad_shifted
        # Shapes are passed as lists: native ``Tensor.reshape`` takes a single
        # ``shape`` argument, so the varargs form only works when patched.
        rad = rad.reshape([tuple(f0.shape)[0], -1, 1])
        # Scale the phase by the harmonic index 1..dim (float32 to match ``rad``).
        harmonics = paddle.arange(start=1, end=self.dim + 1, dtype="float32").reshape([1, 1, -1])
        rad = paddle.multiply(x=rad, y=harmonics)

        # Random initial phase per overtone; the fundamental keeps phase 0.
        rand_ini = paddle.rand(shape=[1, 1, self.dim])
        rand_ini[..., 0] = 0
        rad += rand_ini
        sines = paddle.sin(x=2 * np.pi * rad)
        return sines

    @paddle.no_grad()
    def forward(self, f0, upp):
        """Return noisy sine waves for ``f0``, upsampled by ``upp``.

        f0: F0 contour; a trailing axis is added here, so presumably
            (batchsize, length) — TODO confirm against callers.
            f0 for unvoiced steps should be 0.
        upp: integer upsampling factor.
        Returns sine_waves of shape (batchsize, length * upp, dim).
        """
        f0 = f0.unsqueeze(axis=-1)
        sine_waves = self._f02sine(f0, upp) * self.sine_amp
        uv = (f0 > self.voiced_threshold).astype(dtype="float32")
        # Upsample the voiced/unvoiced flag to the waveform rate.
        uv = F.interpolate(uv.transpose([0, 2, 1]), scale_factor=upp, mode="linear", data_format="NCW").transpose(
            [0, 2, 1]
        )
        # Voiced regions get noise_std; unvoiced regions get sine_amp / 3.
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * paddle.randn(shape=sine_waves.shape, dtype=sine_waves.dtype)
        sine_waves = sine_waves * uv + noise
        return sine_waves
|
262 |
+
|
263 |
+
|
264 |
+
class SourceModuleHnNSF(paddle.nn.Layer):
    """Source module for hn-NSF.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshold=0)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
    voiced_threshold: threshold to set U/V given F0 (default: 0)

    ``forward`` returns a single tensor: the sine waves from ``SineGen``
    merged to one channel by a linear layer followed by tanh.
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
        # Merge fundamental + harmonics (harmonic_num + 1 channels) into one.
        self.l_linear = paddle.nn.Linear(in_features=harmonic_num + 1, out_features=1)
        self.l_tanh = paddle.nn.Tanh()

    def forward(self, x, upp):
        """Generate sine waves for ``x`` upsampled by ``upp`` and merge them into one channel."""
        harmonics = self.l_sin_gen(x, upp)
        merged = self.l_linear(harmonics)
        return self.l_tanh(merged)
|
294 |
+
|
295 |
+
|
296 |
+
class Generator(paddle.nn.Layer):
    """Upsampling generator that mixes a harmonic source signal into every stage.

    ``h`` is a configuration object read by attribute; this class uses
    ``resblock_kernel_sizes``, ``resblock_dilation_sizes``, ``upsample_rates``,
    ``upsample_kernel_sizes``, ``upsample_initial_channel``, ``num_mels``,
    ``sampling_rate`` and ``resblock``.
    """

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.m_source = SourceModuleHnNSF(sampling_rate=h.sampling_rate, harmonic_num=8)
        self.noise_convs = paddle.nn.LayerList()
        self.conv_pre = paddle.nn.utils.weight_norm(
            layer=paddle.nn.Conv1D(
                in_channels=h.num_mels, out_channels=h.upsample_initial_channel, kernel_size=7, stride=1, padding=3
            )
        )
        resblock = ResBlock1 if h.resblock == "1" else ResBlock2
        self.ups = paddle.nn.LayerList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            # Channel count halves at every upsampling stage.
            c_cur = h.upsample_initial_channel // 2 ** (i + 1)
            self.ups.append(
                paddle.nn.utils.weight_norm(
                    layer=paddle.nn.Conv1DTranspose(
                        in_channels=h.upsample_initial_channel // 2**i,
                        out_channels=c_cur,
                        kernel_size=k,
                        stride=u,
                        padding=(k - u) // 2,
                    )
                )
            )
            if i + 1 < len(h.upsample_rates):
                # Stride the source conv by the product of the remaining upsample rates.
                stride_f0 = int(np.prod(h.upsample_rates[i + 1 :]))
                self.noise_convs.append(
                    paddle.nn.Conv1D(
                        in_channels=1,
                        out_channels=c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                # Last stage: no remaining upsampling, so a 1x1 conv is used.
                self.noise_convs.append(paddle.nn.Conv1D(in_channels=1, out_channels=c_cur, kernel_size=1))
        self.resblocks = paddle.nn.LayerList()
        ch = h.upsample_initial_channel
        for _ in range(len(self.ups)):
            ch //= 2
            for k, d in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, k, d))
        self.conv_post = paddle.nn.utils.weight_norm(
            layer=paddle.nn.Conv1D(in_channels=ch, out_channels=1, kernel_size=7, stride=1, padding=3)
        )
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        # Total upsampling factor across all stages.
        self.upp = int(np.prod(h.upsample_rates))

    def forward(self, x, f0):
        """Generate the output signal from features ``x`` and contour ``f0``.

        The source module is evaluated exactly once; its output is transposed
        (axes 1 and 2 swapped) before being fed to the per-stage source convs.
        """
        har_source = self.m_source(f0, self.upp)
        # Previously the source module was run a second time just to read
        # ``.ndim``; reuse the tensor that was already computed.
        har_source = har_source.transpose(perm=paddle_aux.transpose_aux_func(har_source.ndim, 1, 2))
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = paddle.nn.functional.leaky_relu(x=x, negative_slope=LRELU_SLOPE)
            x = self.ups[i](x)
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            # Average the outputs of this stage's residual blocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        # Default negative slope here (not LRELU_SLOPE), as in the original.
        x = paddle.nn.functional.leaky_relu(x=x)
        x = self.conv_post(x)
        x = paddle.nn.functional.tanh(x=x)
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all upsampling, residual, pre and post convs."""
        print("Removing weight norm...")
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/nvSTFT.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import os
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
|
19 |
+
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
20 |
+
import numpy as np
|
21 |
+
from librosa.filters import mel as librosa_mel_fn
|
22 |
+
|
23 |
+
|
24 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-05):
    """Return ``log(max(x, clip_val) * C)`` element-wise (NumPy)."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)
|
26 |
+
|
27 |
+
|
28 |
+
def dynamic_range_decompression(x, C=1):
    """Return ``exp(x) / C`` element-wise (NumPy)."""
    expanded = np.exp(x)
    return expanded / C
|
30 |
+
|
31 |
+
|
32 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-05):
    """Return ``log(clip(x, min=clip_val) * C)`` for a Paddle tensor ``x``."""
    floored = paddle.clip(x=x, min=clip_val)
    scaled = floored * C
    return paddle.log(x=scaled)
|
34 |
+
|
35 |
+
|
36 |
+
def dynamic_range_decompression_torch(x, C=1):
    """Tensor counterpart of ``dynamic_range_decompression`` using Paddle ops.

    Args:
        x: Paddle tensor of log-compressed values.
        C: The multiplier that was applied before the logarithm.

    Returns:
        A tensor holding ``exp(x) / C``.
    """
    expanded = paddle.exp(x=x)
    return expanded / C
|
38 |
+
|
39 |
+
|
40 |
+
class STFT:
    """Log-mel spectrogram extractor built on ``paddle.signal.stft``.

    The mel filter bank is computed once with librosa in ``__init__`` and kept
    as a float32 tensor on ``self.device``.

    Args:
        sr: Sampling rate handed to the mel filter-bank builder.
        n_mels: Number of mel bands.
        n_fft: FFT size used when ``keyshift`` is 0.
        win_size: Window length used when ``keyshift`` is 0.
        hop_length: Hop size used when ``speed`` is 1.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank.
        clip_val: Floor applied before the logarithm.
        device: Device string for the tensors created here. When ``None``,
            ``"gpu"`` is chosen if Paddle reports at least one CUDA device,
            otherwise ``"cpu"``.
    """

    def __init__(
        self,
        sr=22050,
        n_mels=80,
        n_fft=1024,
        win_size=1024,
        hop_length=256,
        fmin=20,
        fmax=11025,
        clip_val=1e-05,
        device=None,
    ):
        self.target_sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val

        if device is None:
            has_gpu = paddle.device.cuda.device_count() >= 1
            device = "gpu" if has_gpu else "cpu"
        self.device = device

        filter_bank = librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
        self.mel_basis = paddle.to_tensor(data=filter_bank).astype(dtype="float32").to(device)

    def get_mel(self, y, keyshift=0, speed=1, center=False):
        """Compute the log-mel spectrogram of ``y``.

        Args:
            y: Waveform tensor. Assumed to be 2-D (batch, samples), since an
                axis is inserted at position 1 for padding and removed again
                afterwards -- TODO confirm against callers.
            keyshift: Shift in semitones; FFT and window sizes are scaled by
                ``2 ** (keyshift / 12)``.
            speed: Factor applied to the hop length.
            center: Forwarded to ``paddle.signal.stft``.

        Returns:
            The mel-projected magnitude spectrogram after log compression
            with ``self.clip_val`` as the floor.
        """
        pitch_ratio = 2 ** (keyshift / 12)
        scaled_n_fft = int(np.round(self.n_fft * pitch_ratio))
        scaled_win = int(np.round(self.win_size * pitch_ratio))
        scaled_hop = int(np.round(self.hop_length * speed))

        # Report (but do not reject) samples outside [-1, 1].
        lowest = paddle.min(x=y)
        if lowest < -1.0:
            print("min value is ", lowest)
        highest = paddle.max(x=y)
        if highest > 1.0:
            print("max value is ", highest)

        window = paddle.audio.functional.get_window("hann", scaled_win).astype("float32").to(self.device)

        # Reflect-pad manually by the window/hop overlap, split across both ends.
        overlap = scaled_win - scaled_hop
        padded = paddle.nn.functional.pad(
            x=y.unsqueeze(axis=1),
            pad=(overlap // 2, (overlap + 1) // 2),
            mode="reflect",
            pad_from_left_axis=False,
        )
        padded = padded.squeeze(axis=1)

        spec = paddle.signal.stft(
            padded,
            scaled_n_fft,
            hop_length=scaled_hop,
            win_length=scaled_win,
            window=window,
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
        ).abs()

        if keyshift != 0:
            # Bring the frequency axis back to the unshifted bin count and
            # rescale magnitudes by the window-length ratio.
            target_bins = self.n_fft // 2 + 1
            current_bins = spec.shape[1]
            if current_bins < target_bins:
                spec = paddle.nn.functional.pad(
                    x=spec, pad=(0, 0, 0, target_bins - current_bins), pad_from_left_axis=False
                )
            spec = spec[:, :target_bins, :] * self.win_size / scaled_win

        mel_spec = paddle.matmul(x=self.mel_basis, y=spec)
        return dynamic_range_compression_torch(mel_spec, clip_val=self.clip_val)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/nsf_hifigan/utils.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import matplotlib
|
16 |
+
|
17 |
+
matplotlib.use("Agg")
|
18 |
+
|
19 |
+
|
20 |
+
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw the weights of convolution-like layers from a normal distribution.

    A layer is treated as a convolution when its class name contains the
    substring ``"Conv"``; every other layer is left untouched.

    Args:
        m: Layer object exposing ``weight.data.normal_``.
        mean: Mean passed to ``normal_``.
        std: Standard deviation passed to ``normal_``.
    """
    if "Conv" not in m.__class__.__name__:
        return
    m.weight.data.normal_(mean, std)
|
24 |
+
|
25 |
+
|
26 |
+
def get_padding(kernel_size, dilation=1):
    """Return the padding derived from a kernel size and dilation.

    Args:
        kernel_size: Size of the convolution kernel.
        dilation: Dilation factor of the convolution.

    Returns:
        ``(kernel_size * dilation - dilation) / 2`` truncated to an ``int``.
    """
    dilated_extent = kernel_size * dilation - dilation
    return int(dilated_extent / 2)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/pm.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from paddlemix.models.diffsinger.basics.base_pe import BasePE
|
16 |
+
from paddlemix.models.diffsinger.utils.binarizer_utils import get_pitch_parselmouth
|
17 |
+
|
18 |
+
|
19 |
+
class ParselmouthPE(BasePE):
    """Pitch extractor that delegates to ``get_pitch_parselmouth``."""

    def get_pitch(self, waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False):
        """Forward every argument unchanged to ``get_pitch_parselmouth``.

        Args:
            waveform: Audio passed positionally to the helper.
            samplerate: Forwarded as ``samplerate``.
            length: Forwarded as ``length``.
            hop_size: Forwarded as ``hop_size`` (keyword-only).
            f0_min: Forwarded as ``f0_min``.
            f0_max: Forwarded as ``f0_max``.
            speed: Forwarded as ``speed``.
            interp_uv: Forwarded as ``interp_uv``.

        Returns:
            Whatever ``get_pitch_parselmouth`` returns.
        """
        options = dict(
            samplerate=samplerate,
            length=length,
            hop_size=hop_size,
            f0_min=f0_min,
            f0_max=f0_max,
            speed=speed,
            interp_uv=interp_uv,
        )
        return get_pitch_parselmouth(waveform, **options)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from .constants import *
|
16 |
+
from .inference import RMVPE
|
17 |
+
from .model import E2E0
|
18 |
+
from .spec import MelSpectrogram
|
19 |
+
from .utils import to_local_average_f0, to_viterbi_f0
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/constants.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
# Sampling rate handed to the RMVPE mel extractor.
SAMPLE_RATE = 16000
# Number of output classes (pitch bins) of the network.
N_CLASS = 360
# Number of mel bands of the input spectrogram.
N_MELS = 128
# Frequency bounds passed to the mel extractor as mel_fmin / mel_fmax.
MEL_FMIN = 30
MEL_FMAX = 8000
# Window length passed to the mel extractor.
WINDOW_LENGTH = 1024
# Offset in cents added to ``bin_index * 20`` when decoding a bin to a pitch.
CONST = 1997.379408437619
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/deepunet.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import paddle
|
16 |
+
|
17 |
+
from .constants import N_MELS
|
18 |
+
|
19 |
+
|
20 |
+
class ConvBlockRes(paddle.nn.Layer):
    """Two 3x3 conv + batch-norm + ReLU stages with a residual connection.

    When ``in_channels`` differs from ``out_channels`` the skip path goes
    through a 1x1 convolution; otherwise the input is added unchanged.

    Args:
        in_channels: Channel count of the input feature map.
        out_channels: Channel count produced by both convolutions.
        momentum: The batch-norm layers receive ``1 - momentum``.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()
        conv_options = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias_attr=False)
        bn_momentum = 1 - momentum
        stages = [
            paddle.nn.Conv2D(in_channels=in_channels, out_channels=out_channels, **conv_options),
            paddle.nn.BatchNorm2D(num_features=out_channels, momentum=bn_momentum),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(in_channels=out_channels, out_channels=out_channels, **conv_options),
            paddle.nn.BatchNorm2D(num_features=out_channels, momentum=bn_momentum),
            paddle.nn.ReLU(),
        ]
        # Passed positionally so the sublayers keep their index-based names.
        self.conv = paddle.nn.Sequential(*stages)

        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = paddle.nn.Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1))

    def forward(self, x):
        """Return ``conv(x)`` plus the (optionally projected) input."""
        main_path = self.conv(x)
        skip_path = self.shortcut(x) if self.is_shortcut else x
        return main_path + skip_path
|
56 |
+
|
57 |
+
|
58 |
+
class ResEncoderBlock(paddle.nn.Layer):
    """Stack of ``ConvBlockRes`` layers optionally followed by average pooling.

    Args:
        in_channels: Channel count entering the first block.
        out_channels: Channel count produced by every block.
        kernel_size: Pooling kernel size, or ``None`` to disable pooling.
        n_blocks: Number of blocks applied in ``forward``.
        momentum: Forwarded to each ``ConvBlockRes``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
        super().__init__()
        self.n_blocks = n_blocks
        self.kernel_size = kernel_size

        # The first block changes the channel count; the rest keep it.
        channel_plan = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (n_blocks - 1)
        self.conv = paddle.nn.LayerList()
        for src_channels, dst_channels in channel_plan:
            self.conv.append(ConvBlockRes(src_channels, dst_channels, momentum))

        if kernel_size is not None:
            self.pool = paddle.nn.AvgPool2D(kernel_size=kernel_size, exclusive=False)

    def forward(self, x):
        """Apply the blocks in order.

        Returns:
            ``x`` alone when pooling is disabled, otherwise the tuple
            ``(x, pooled_x)``.
        """
        for block_idx in range(self.n_blocks):
            x = self.conv[block_idx](x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
|
77 |
+
|
78 |
+
|
79 |
+
class ResDecoderBlock(paddle.nn.Layer):
    """Transposed-conv upsampling followed by residual blocks on a skip concat.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count after upsampling and after every block.
        stride: Stride of the transposed convolution. A stride of ``(1, 2)``
            selects output padding ``(0, 1)``; anything else uses ``(1, 1)``.
        n_blocks: Number of residual blocks applied in ``forward``.
        momentum: Forwarded to the residual blocks; the batch-norm after the
            transposed convolution receives ``1 - momentum``.
    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        self.n_blocks = n_blocks

        if stride == (1, 2):
            output_padding = (0, 1)
        else:
            output_padding = (1, 1)

        upsample = paddle.nn.Conv2DTranspose(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=output_padding,
            bias_attr=False,
        )
        norm = paddle.nn.BatchNorm2D(num_features=out_channels, momentum=1 - momentum)
        self.conv1 = paddle.nn.Sequential(upsample, norm, paddle.nn.ReLU())

        # The first block consumes the upsampled map concatenated with the
        # skip tensor, hence twice the channels.
        self.conv2 = paddle.nn.LayerList()
        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
        for _ in range(n_blocks - 1):
            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))

    def forward(self, x, concat_tensor):
        """Upsample ``x``, concatenate ``concat_tensor`` on axis 1, refine."""
        upsampled = self.conv1(x)
        x = paddle.concat(x=(upsampled, concat_tensor), axis=1)
        for block_idx in range(self.n_blocks):
            x = self.conv2[block_idx](x)
        return x
|
108 |
+
|
109 |
+
|
110 |
+
class Encoder(paddle.nn.Layer):
    """Batch-norm followed by a chain of pooled ``ResEncoderBlock`` stages.

    The channel count doubles and the tracked size halves after every stage.
    ``latent_channels`` records ``[channels, size]`` per stage, while
    ``out_channel`` and ``out_size`` hold the values after the last update.

    Args:
        in_channels: Channel count of the input.
        in_size: Size value tracked (and halved) per stage.
        n_encoders: Number of encoder stages.
        kernel_size: Pooling kernel size of every stage.
        n_blocks: Residual blocks per stage.
        out_channels: Channel count of the first stage.
        momentum: Batch-norm momentum parameter (layers receive ``1 - momentum``).
    """

    def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = paddle.nn.BatchNorm2D(num_features=in_channels, momentum=1 - momentum)
        self.layers = paddle.nn.LayerList()
        self.latent_channels = []

        stage_in, stage_out, size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            self.layers.append(ResEncoderBlock(stage_in, stage_out, kernel_size, n_blocks, momentum=momentum))
            self.latent_channels.append([stage_out, size])
            stage_in, stage_out, size = stage_out, stage_out * 2, size // 2

        self.out_size = size
        self.out_channel = stage_out

    def forward(self, x):
        """Run all stages.

        Returns:
            A tuple ``(x, concat_tensors)`` where ``concat_tensors`` lists the
            un-pooled output of every stage in order.
        """
        skip_tensors = []
        x = self.bn(x)
        for stage_idx in range(self.n_encoders):
            skip, x = self.layers[stage_idx](x)
            skip_tensors.append(skip)
        return x, skip_tensors
|
133 |
+
|
134 |
+
|
135 |
+
class Intermediate(paddle.nn.Layer):
    """Chain of un-pooled ``ResEncoderBlock`` layers between encoder and decoder.

    Args:
        in_channels: Channel count entering the first layer.
        out_channels: Channel count produced by every layer.
        n_inters: Number of layers applied in ``forward``.
        n_blocks: Residual blocks per layer.
        momentum: Forwarded to every layer.
    """

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        self.n_inters = n_inters
        self.layers = paddle.nn.LayerList()
        # ``None`` as kernel size disables pooling in each block.
        self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
        for _ in range(self.n_inters - 1):
            self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))

    def forward(self, x):
        """Apply the layers in order and return the result."""
        for layer_idx in range(self.n_inters):
            x = self.layers[layer_idx](x)
        return x
|
148 |
+
|
149 |
+
|
150 |
+
class Decoder(paddle.nn.Layer):
    """Chain of ``ResDecoderBlock`` stages, halving the channels at each one.

    Args:
        in_channels: Channel count entering the first stage.
        n_decoders: Number of decoder stages.
        stride: Stride forwarded to every stage.
        n_blocks: Residual blocks per stage.
        momentum: Forwarded to every stage.
    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = paddle.nn.LayerList()
        self.n_decoders = n_decoders
        stage_in = in_channels
        for _ in range(self.n_decoders):
            stage_out = stage_in // 2
            self.layers.append(ResDecoderBlock(stage_in, stage_out, stride, n_blocks, momentum))
            stage_in = stage_out

    def forward(self, x, concat_tensors):
        """Decode ``x``, consuming ``concat_tensors`` from last to first."""
        for stage_idx in range(self.n_decoders):
            skip = concat_tensors[-1 - stage_idx]
            x = self.layers[stage_idx](x, skip)
        return x
|
164 |
+
|
165 |
+
|
166 |
+
class TimbreFilter(paddle.nn.Layer):
    """One channel-preserving ``ConvBlockRes`` per latent representation.

    Args:
        latent_rep_channels: Iterable whose items expose the channel count at
            index 0; one block is created per item.
    """

    def __init__(self, latent_rep_channels):
        super().__init__()
        self.layers = paddle.nn.LayerList()
        for latent_rep in latent_rep_channels:
            channels = latent_rep[0]
            self.layers.append(ConvBlockRes(channels, channels))

    def forward(self, x_tensors):
        """Apply the i-th block to ``x_tensors[i]`` and return the results as a list."""
        return [layer(x_tensors[idx]) for idx, layer in enumerate(self.layers)]
|
178 |
+
|
179 |
+
|
180 |
+
class DeepUnet0(paddle.nn.Layer):
    """U-Net assembled from ``Encoder``, ``Intermediate`` and ``Decoder``.

    A ``TimbreFilter`` is also constructed as ``self.tf`` but is not used in
    ``forward``.

    Args:
        kernel_size: Pooling kernel of the encoder and stride of the decoder.
        n_blocks: Residual blocks per stage.
        en_de_layers: Number of encoder stages and of decoder stages.
        inter_layers: Number of intermediate layers.
        in_channels: Channel count of the input.
        en_out_channels: Channel count of the first encoder stage.
    """

    def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
        super().__init__()
        self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels)
        bottleneck_channels = self.encoder.out_channel
        self.intermediate = Intermediate(bottleneck_channels // 2, bottleneck_channels, inter_layers, n_blocks)
        self.tf = TimbreFilter(self.encoder.latent_channels)
        self.decoder = Decoder(bottleneck_channels, en_de_layers, kernel_size, n_blocks)

    def forward(self, x):
        """Encode, pass through the intermediate layers, and decode with the skips."""
        encoded, skip_tensors = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, skip_tensors)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/inference.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import paddle
|
17 |
+
from basics.base_pe import BasePE
|
18 |
+
from torchaudio.transforms import Resample
|
19 |
+
from utils.infer_utils import resample_align_curve
|
20 |
+
from utils.pitch_utils import interp_f0
|
21 |
+
|
22 |
+
from .constants import *
|
23 |
+
from .model import E2E0
|
24 |
+
from .spec import MelSpectrogram
|
25 |
+
from .utils import to_local_average_f0, to_viterbi_f0
|
26 |
+
|
27 |
+
|
28 |
+
class RMVPE(BasePE):
    """Pitch extractor wrapping the ``E2E0`` network and its mel front-end.

    Args:
        model_path: Path of a checkpoint whose ``"model"`` entry holds the
            network state dict.
        hop_length: Hop size, in samples at ``SAMPLE_RATE``, of the mel
            extractor; it also defines the spacing of the predicted frames.
    """

    def __init__(self, model_path, hop_length=160):
        self.resample_kernel = {}
        self.device = "cuda" if paddle.device.cuda.device_count() >= 1 else "cpu"
        # Paddle names its GPU device "gpu"; ``self.device`` keeps its
        # original value for existing readers of the attribute.
        self._paddle_device = self.device.replace("cuda", "gpu")
        # Seconds between consecutive predicted frames.
        self._frame_period = hop_length / SAMPLE_RATE

        # ``eval()`` and ``to()`` are called as separate statements so the
        # code does not depend on ``eval()`` returning the layer.
        self.model = E2E0(4, 1, (2, 2))
        self.model.eval()
        self.model.to(self._paddle_device)
        ckpt = paddle.load(path=str(model_path))
        self.model.set_state_dict(state_dict=ckpt["model"])

        self.mel_extractor = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX)
        self.mel_extractor.to(self._paddle_device)

    @paddle.no_grad()
    def mel2hidden(self, mel):
        """Run the network on ``mel`` and return one output per input frame.

        The last axis is zero-padded up to the next multiple of 32 before the
        forward pass, and the output is cut back to the original frame count.
        """
        n_frames = tuple(mel.shape)[-1]
        padded_frames = 32 * ((n_frames - 1) // 32 + 1)
        mel = paddle.nn.functional.pad(
            x=mel, pad=(0, padded_frames - n_frames), mode="constant", pad_from_left_axis=False
        )
        hidden = self.model(mel)
        return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03, use_viterbi=False):
        """Convert network output to f0 via Viterbi or local-average decoding."""
        if use_viterbi:
            return to_viterbi_f0(hidden, thred=thred)
        return to_local_average_f0(hidden, thred=thred)

    def infer_from_audio(self, audio, sample_rate=16000, thred=0.03, use_viterbi=False):
        """Estimate f0 for a single waveform.

        Args:
            audio: Waveform convertible by ``paddle.to_tensor``.
            sample_rate: Sampling rate of ``audio``; anything other than
                ``SAMPLE_RATE`` goes through a cached resampling kernel.
            thred: Threshold forwarded to the decoder.
            use_viterbi: Select Viterbi instead of local-average decoding.

        Returns:
            The f0 curve produced by :meth:`decode`.
        """
        audio = paddle.to_tensor(data=audio).astype(dtype="float32").unsqueeze(axis=0).to(self._paddle_device)
        if sample_rate == SAMPLE_RATE:
            audio_res = audio
        else:
            # NOTE(review): ``Resample`` is imported from torchaudio while
            # ``audio`` is a Paddle tensor -- confirm this path works.
            key_str = str(sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(sample_rate, SAMPLE_RATE, lowpass_filter_width=128)
            self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.device)
            audio_res = self.resample_kernel[key_str](audio)
        mel = self.mel_extractor(audio_res, center=True)
        hidden = self.mel2hidden(mel)
        return self.decode(hidden, thred=thred, use_viterbi=use_viterbi)

    def get_pitch(self, waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False):
        """Extract f0 and the unvoiced mask, aligned to ``length`` frames.

        Args:
            waveform: Audio passed to :meth:`infer_from_audio`.
            samplerate: Sampling rate of ``waveform``.
            length: Number of frames in the returned curves.
            hop_size: Target hop size in samples at ``samplerate``.
            f0_min: Accepted for interface compatibility; not used here.
            f0_max: Accepted for interface compatibility; not used here.
            speed: Factor applied to ``hop_size``.
            interp_uv: When false, f0 is zeroed at unvoiced frames.

        Returns:
            A tuple ``(f0_res, uv_res)``.
        """
        f0 = self.infer_from_audio(waveform, sample_rate=samplerate)
        uv = f0 == 0
        f0, uv = interp_f0(f0, uv)
        hop_size = int(np.round(hop_size * speed))
        time_step = hop_size / samplerate
        # Source spacing follows the configured hop length (0.01 s for the
        # default of 160 samples) instead of a hard-coded constant.
        f0_res = resample_align_curve(f0, self._frame_period, time_step, length)
        uv_res = resample_align_curve(uv.astype(np.float32), self._frame_period, time_step, length) > 0.5
        if not interp_uv:
            f0_res[uv_res] = 0
        return f0_res, uv_res
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/model.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
import paddle_aux
|
19 |
+
|
20 |
+
from .constants import *
|
21 |
+
from .deepunet import DeepUnet0
|
22 |
+
from .seq import BiGRU
|
23 |
+
|
24 |
+
|
25 |
+
class E2E0(paddle.nn.Layer):
    """U-Net feature extractor followed by a classification head.

    Args:
        n_blocks: Residual blocks per U-Net stage.
        n_gru: Number of GRU layers in the head; a falsy value selects a
            plain linear head instead.
        kernel_size: Forwarded to ``DeepUnet0``.
        en_de_layers: Forwarded to ``DeepUnet0``.
        inter_layers: Forwarded to ``DeepUnet0``.
        in_channels: Forwarded to ``DeepUnet0``.
        en_out_channels: Forwarded to ``DeepUnet0`` and used as the input
            channel count of the 3x3 convolution that follows it.
    """

    def __init__(
        self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16
    ):
        super(E2E0, self).__init__()
        self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
        self.cnn = paddle.nn.Conv2D(in_channels=en_out_channels, out_channels=3, kernel_size=(3, 3), padding=(1, 1))
        if n_gru:
            self.fc = paddle.nn.Sequential(
                BiGRU(3 * N_MELS, 256, n_gru),
                paddle.nn.Linear(in_features=512, out_features=N_CLASS),
                paddle.nn.Dropout(p=0.25),
                paddle.nn.Sigmoid(),
            )
        else:
            self.fc = paddle.nn.Sequential(
                paddle.nn.Linear(in_features=3 * N_MELS, out_features=N_CLASS),
                paddle.nn.Dropout(p=0.25),
                paddle.nn.Sigmoid(),
            )

    def forward(self, mel):
        """Map a mel spectrogram to per-frame class activations.

        The last two axes of ``mel`` are swapped and a channel axis is
        inserted at position 1 before the U-Net. The convolution output then
        has axes 1 and 2 swapped and its last two axes flattened for the head.
        """
        mel = mel.transpose(perm=paddle_aux.transpose_aux_func(mel.ndim, -1, -2)).unsqueeze(axis=1)
        # Run the U-Net and convolution once; the previous code repeated the
        # whole forward pass just to read ``ndim`` for the permutation.
        features = self.cnn(self.unet(mel))
        perm = paddle_aux.transpose_aux_func(features.ndim, 1, 2)
        x = features.transpose(perm=perm).flatten(start_axis=-2)
        return self.fc(x)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/seq.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import paddle
|
16 |
+
|
17 |
+
|
18 |
+
class BiGRU(paddle.nn.Layer):
    """Bidirectional, batch-major GRU that returns only the output sequence.

    Args:
        input_features: Size of each input vector.
        hidden_features: Hidden size of the GRU.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_features, hidden_features, num_layers):
        super().__init__()
        self.gru = paddle.nn.GRU(
            input_size=input_features,
            hidden_size=hidden_features,
            num_layers=num_layers,
            time_major=False,
            direction="bidirect",
        )

    def forward(self, x):
        """Return the first element of the GRU result, dropping the rest."""
        result = self.gru(x)
        return result[0]
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/spec.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import paddle
|
17 |
+
from librosa.filters import mel
|
18 |
+
|
19 |
+
|
20 |
+
class MelSpectrogram(paddle.nn.Layer):
    """Log-mel spectrogram layer built on ``paddle.signal.stft``.

    The mel filter bank is computed with librosa (HTK formula) and registered
    as the buffer ``mel_basis``. Hann windows are cached per key shift and
    device in ``self.hann_window``.

    Args:
        n_mel_channels: Number of mel bands.
        sampling_rate: Sampling rate used to build the filter bank.
        win_length: Window length when ``keyshift`` is 0.
        hop_length: Hop size when ``speed`` is 1.
        n_fft: FFT size; defaults to ``win_length`` when ``None``.
        mel_fmin: Lowest frequency of the filter bank.
        mel_fmax: Highest frequency of the filter bank.
        clamp: Floor applied to the mel energies before the logarithm.
    """

    def __init__(
        self, n_mel_channels, sampling_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-05
    ):
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        self.hann_window = {}
        mel_basis = mel(sr=sampling_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True)
        mel_basis = paddle.to_tensor(data=mel_basis).astype(dtype="float32")
        self.register_buffer(name="mel_basis", tensor=mel_basis)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Compute the log-mel spectrogram of ``audio``.

        Args:
            audio: Waveform tensor passed to ``paddle.signal.stft``.
            keyshift: Shift in semitones; FFT and window sizes are scaled by
                ``2 ** (keyshift / 12)``.
            speed: Factor applied to the hop length.
            center: Forwarded to ``paddle.signal.stft``.

        Returns:
            ``log(clip(mel_basis @ |STFT|, min=clamp))``.
        """
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))

        keyshift_key = str(keyshift) + "_" + str(audio.place)
        if keyshift_key not in self.hann_window:
            window = paddle.audio.functional.get_window("hann", win_length_new)
            # Match the audio dtype so the STFT runs in a single precision
            # (the window's default dtype may differ from the audio's).
            self.hann_window[keyshift_key] = window.astype(audio.dtype).to(audio.place)

        # ``paddle.signal.stft`` has no ``return_complex`` argument; its
        # output is taken as complex here and reduced to magnitudes below.
        fft = paddle.signal.stft(
            audio,
            n_fft=n_fft_new,
            hop_length=hop_length_new,
            win_length=win_length_new,
            window=self.hann_window[keyshift_key],
            center=center,
        )
        magnitude = fft.abs()

        if keyshift != 0:
            # Bring the frequency axis back to the unshifted bin count and
            # rescale magnitudes by the window-length ratio.
            size = self.n_fft // 2 + 1
            resize = magnitude.shape[1]
            if resize < size:
                magnitude = paddle.nn.functional.pad(
                    x=magnitude, pad=(0, 0, 0, size - resize), pad_from_left_axis=False
                )
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new

        mel_output = paddle.matmul(x=self.mel_basis, y=magnitude)
        log_mel_spec = paddle.log(x=paddle.clip(x=mel_output, min=self.clamp))
        return log_mel_spec
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/pe/rmvpe/utils.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import librosa
|
18 |
+
import numpy as np
|
19 |
+
import paddle
|
20 |
+
import paddle_aux
|
21 |
+
|
22 |
+
from .constants import *
|
23 |
+
|
24 |
+
|
25 |
+
def to_local_average_f0(hidden, center=None, thred=0.03):
    """Decode class activations to f0 by a weighted average around a centre bin.

    For every frame, the activations of the bins in ``[center - 4, center + 5)``
    (clipped to the valid range) weight the bins' cent values
    ``bin * 20 + CONST``. The averaged cents are converted with
    ``10 * 2 ** (cents / 1200)``. Frames whose maximum activation is below
    ``thred`` get an f0 of 0.

    Args:
        hidden: Activation tensor; assumed to be (1, frames, N_CLASS), since
            axis 2 is reduced and axis 0 is squeezed -- TODO confirm.
        center: Optional tensor of centre bins broadcastable against
            ``hidden``; defaults to the per-frame argmax.
        thred: Voicing threshold on the maximum activation.

    Returns:
        A NumPy array holding the f0 curve.
    """
    idx = paddle.arange(end=N_CLASS)[None, None, :]
    idx_cents = idx * 20 + CONST
    if center is None:
        center = paddle.argmax(x=hidden, axis=2, keepdim=True)
    start = paddle.clip(x=center - 4, min=0)
    end = paddle.clip(x=center + 5, max=N_CLASS)
    idx_mask = (idx >= start) & (idx < end)
    weights = hidden * idx_mask
    product_sum = paddle.sum(x=weights * idx_cents, axis=2)
    weight_sum = paddle.sum(x=weights, axis=2)
    # Adding the zero-mask turns a 0 denominator into 1, avoiding 0 / 0.
    cents = product_sum / (weight_sum + (weight_sum == 0))
    f0 = 10 * 2 ** (cents / 1200)
    # Use the native reduction: the torch-style ``max(dim=2)[0]`` only works
    # if ``Tensor.max`` has been patched to accept ``dim`` and return a tuple.
    uv = paddle.max(x=hidden, axis=2) < thred
    f0 = f0 * ~uv
    return f0.squeeze(axis=0).cpu().numpy()
|
41 |
+
|
42 |
+
|
43 |
+
def to_viterbi_f0(hidden, thred=0.03):
    """Decode class activations to f0 along a Viterbi path.

    A triangular transition matrix (weight ``max(30 - |i - j|, 0)``, rows
    normalised to sum to 1) is built on first use and cached as an attribute
    of this function. The per-frame activations are normalised over the bins,
    the best path is found with ``librosa.sequence.viterbi``, and the path is
    used as the centre for ``to_local_average_f0``.

    Args:
        hidden: Activation tensor; axis 0 is squeezed before decoding.
        thred: Voicing threshold forwarded to ``to_local_average_f0``.

    Returns:
        The result of ``to_local_average_f0`` for the Viterbi centres.
    """
    if not hasattr(to_viterbi_f0, "transition"):
        cols, rows = np.meshgrid(range(N_CLASS), range(N_CLASS))
        bin_distance = abs(cols - rows)
        weights = np.maximum(30 - bin_distance, 0)
        to_viterbi_f0.transition = weights / weights.sum(axis=1, keepdims=True)

    # Bins on axis 0, frames on axis 1; normalise each frame over its bins.
    prob = hidden.squeeze(axis=0).cpu().numpy().T
    prob = prob / prob.sum(axis=0)

    best_path = librosa.sequence.viterbi(prob, to_viterbi_f0.transition).astype(np.int64)
    center = paddle.to_tensor(data=best_path).unsqueeze(axis=0).unsqueeze(axis=-1).to(hidden.place)
    return to_local_average_f0(hidden, center=center, thred=thred)
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/modules/toplevel.py
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from typing import Dict
|
16 |
+
|
17 |
+
import paddle
|
18 |
+
|
19 |
+
import paddlemix.models.diffsinger.modules.compat as compat
|
20 |
+
from paddlemix.models.diffsinger.basics.base_module import CategorizedModule
|
21 |
+
from paddlemix.models.diffsinger.modules.aux_decoder import AuxDecoderAdaptor
|
22 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
23 |
+
NormalInitEmbedding as Embedding,
|
24 |
+
)
|
25 |
+
from paddlemix.models.diffsinger.modules.commons.common_layers import (
|
26 |
+
XavierUniformInitLinear as Linear,
|
27 |
+
)
|
28 |
+
from paddlemix.models.diffsinger.modules.core import (
|
29 |
+
GaussianDiffusion,
|
30 |
+
MultiVarianceDiffusion,
|
31 |
+
MultiVarianceRectifiedFlow,
|
32 |
+
PitchDiffusion,
|
33 |
+
PitchRectifiedFlow,
|
34 |
+
RectifiedFlow,
|
35 |
+
)
|
36 |
+
from paddlemix.models.diffsinger.modules.fastspeech.acoustic_encoder import (
|
37 |
+
FastSpeech2Acoustic,
|
38 |
+
)
|
39 |
+
from paddlemix.models.diffsinger.modules.fastspeech.param_adaptor import (
|
40 |
+
ParameterAdaptorModule,
|
41 |
+
)
|
42 |
+
from paddlemix.models.diffsinger.modules.fastspeech.tts_modules import (
|
43 |
+
LengthRegulator,
|
44 |
+
RhythmRegulator,
|
45 |
+
)
|
46 |
+
from paddlemix.models.diffsinger.modules.fastspeech.variance_encoder import (
|
47 |
+
FastSpeech2Variance,
|
48 |
+
MelodyEncoder,
|
49 |
+
)
|
50 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
51 |
+
|
52 |
+
|
53 |
+
class ShallowDiffusionOutput:
    """Plain container pairing an auxiliary-decoder output with a diffusion output.

    Either field may be ``None`` when the corresponding branch was not run.
    """

    def __init__(self, *, aux_out=None, diff_out=None):
        # Both arguments are keyword-only and stored unchanged.
        self.aux_out, self.diff_out = aux_out, diff_out
|
57 |
+
|
58 |
+
|
59 |
+
class DiffSingerAcoustic(CategorizedModule, ParameterAdaptorModule):
    """Acoustic model: a FastSpeech2Acoustic encoder feeding a diffusion decoder.

    The decoder is ``GaussianDiffusion`` when ``hparams["diffusion_type"]`` is
    ``"ddpm"`` (the default) and ``RectifiedFlow`` when it is ``"reflow"``.
    When ``hparams["use_shallow_diffusion"]`` is truthy an additional
    ``AuxDecoderAdaptor`` is built and used alongside the diffusion decoder.
    """

    @property
    def category(self):
        """Category tag of this module; always ``"acoustic"``."""
        return "acoustic"

    def __init__(self, vocab_size, out_dims):
        """Build the encoder, the optional auxiliary decoder and the diffusion decoder.

        Args:
            vocab_size: forwarded to ``FastSpeech2Acoustic``.
            out_dims: forwarded as ``out_dims`` to the auxiliary decoder and the diffusion decoder.

        Raises:
            NotImplementedError: if ``hparams["diffusion_type"]`` is neither ``"ddpm"`` nor ``"reflow"``.
        """
        # Both bases are initialised explicitly rather than through super().
        CategorizedModule.__init__(self)
        ParameterAdaptorModule.__init__(self)
        self.fs2 = FastSpeech2Acoustic(vocab_size=vocab_size)
        self.use_shallow_diffusion = hparams.get("use_shallow_diffusion", False)
        self.shallow_args = hparams.get("shallow_diffusion_args", {})
        if self.use_shallow_diffusion:
            # These attributes only exist when shallow diffusion is enabled.
            self.train_aux_decoder = self.shallow_args["train_aux_decoder"]
            self.train_diffusion = self.shallow_args["train_diffusion"]
            self.aux_decoder_grad = self.shallow_args["aux_decoder_grad"]
            self.aux_decoder = AuxDecoderAdaptor(
                in_dims=hparams["hidden_size"],
                out_dims=out_dims,
                num_feats=1,
                spec_min=hparams["spec_min"],
                spec_max=hparams["spec_max"],
                aux_decoder_arch=self.shallow_args["aux_decoder_arch"],
                aux_decoder_args=self.shallow_args["aux_decoder_args"],
            )
        self.diffusion_type = hparams.get("diffusion_type", "ddpm")
        self.backbone_type = compat.get_backbone_type(hparams)
        self.backbone_args = compat.get_backbone_args(hparams, self.backbone_type)
        if self.diffusion_type == "ddpm":
            self.diffusion = GaussianDiffusion(
                out_dims=out_dims,
                num_feats=1,
                timesteps=hparams["timesteps"],
                k_step=hparams["K_step"],
                backbone_type=self.backbone_type,
                backbone_args=self.backbone_args,
                spec_min=hparams["spec_min"],
                spec_max=hparams["spec_max"],
            )
        elif self.diffusion_type == "reflow":
            self.diffusion = RectifiedFlow(
                out_dims=out_dims,
                num_feats=1,
                t_start=hparams["T_start"],
                time_scale_factor=hparams["time_scale_factor"],
                backbone_type=self.backbone_type,
                backbone_args=self.backbone_args,
                spec_min=hparams["spec_min"],
                spec_max=hparams["spec_max"],
            )
        else:
            raise NotImplementedError(self.diffusion_type)

    def forward(
        self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, spk_embed_id=None, gt_mel=None, infer=True, **kwargs
    ) -> ShallowDiffusionOutput:
        """Run the encoder and then the decoder(s), in inference or training mode.

        Args:
            txt_tokens, mel2ph, f0, key_shift, speed, spk_embed_id, **kwargs: forwarded to the encoder.
            gt_mel: ground-truth spectrogram; passed to the diffusion decoder as ``gt_spec``
                in training, and optionally used as the diffusion source at inference.
            infer: selects the inference path when truthy, the training path otherwise.

        Returns:
            ShallowDiffusionOutput whose ``aux_out`` / ``diff_out`` are ``None`` for any
            branch that was not executed.
        """
        condition = self.fs2(
            txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs
        )
        if infer:
            if self.use_shallow_diffusion:
                aux_mel_pred = self.aux_decoder(condition, infer=True)
                # Zero every position whose mel2ph entry is not > 0
                # (presumably padding frames -- TODO confirm).
                aux_mel_pred *= (mel2ph > 0).astype(dtype="float32")[:, :, None]
                # Start diffusion from the ground truth only when it is supplied
                # and shallow_args["val_gt_start"] is truthy.
                if gt_mel is not None and self.shallow_args["val_gt_start"]:
                    src_mel = gt_mel
                else:
                    src_mel = aux_mel_pred
            else:
                aux_mel_pred = src_mel = None
            mel_pred = self.diffusion(condition, src_spec=src_mel, infer=True)
            mel_pred *= (mel2ph > 0).astype(dtype="float32")[:, :, None]
            return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred)
        elif self.use_shallow_diffusion:
            if self.train_aux_decoder:
                # Numerically equal to `condition`; the detached term carries no gradient,
                # so presumably the gradient reaching the encoder through the auxiliary
                # decoder is scaled by aux_decoder_grad -- TODO confirm.
                aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
                aux_out = self.aux_decoder(aux_cond, infer=False)
            else:
                aux_out = None
            if self.train_diffusion:
                diff_out = self.diffusion(condition, gt_spec=gt_mel, infer=False)
            else:
                diff_out = None
            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
        else:
            # Training without shallow diffusion: only the diffusion decoder runs.
            aux_out = None
            diff_out = self.diffusion(condition, gt_spec=gt_mel, infer=False)
            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
|
145 |
+
|
146 |
+
|
147 |
+
class DiffSingerVariance(CategorizedModule, ParameterAdaptorModule):
    """Variance model predicting durations and, optionally, pitch and further variances.

    A ``FastSpeech2Variance`` encoder produces phoneme-level features and a
    duration prediction. Depending on ``hparams["predict_pitch"]`` and on
    ``self.predict_variances`` (provided by ``ParameterAdaptorModule``), a pitch
    predictor and a multi-variance predictor are built on top; their concrete
    classes are selected by ``hparams["diffusion_type"]`` (``"ddpm"`` or ``"reflow"``).
    """

    @property
    def category(self):
        """Category tag of this module; always ``"variance"``."""
        return "variance"

    def __init__(self, vocab_size):
        """Build the encoder, regulators and the enabled predictors.

        Args:
            vocab_size: forwarded to ``FastSpeech2Variance``.

        Raises:
            ValueError: pitch prediction is enabled and the diffusion type is unknown.
            NotImplementedError: variance prediction is enabled and the diffusion type is unknown.
        """
        # Both bases are initialised explicitly rather than through super().
        CategorizedModule.__init__(self)
        ParameterAdaptorModule.__init__(self)
        self.predict_dur = hparams["predict_dur"]
        self.predict_pitch = hparams["predict_pitch"]
        self.use_spk_id = hparams["use_spk_id"]
        if self.use_spk_id:
            self.spk_embed = Embedding(hparams["num_spk"], hparams["hidden_size"])
        self.fs2 = FastSpeech2Variance(vocab_size=vocab_size)
        self.rr = RhythmRegulator()
        self.lr = LengthRegulator()
        self.diffusion_type = hparams.get("diffusion_type", "ddpm")
        if self.predict_pitch:
            # use_melody_encoder (and the embeddings below) only exist when pitch is predicted.
            self.use_melody_encoder = hparams.get("use_melody_encoder", False)
            if self.use_melody_encoder:
                self.melody_encoder = MelodyEncoder(enc_hparams=hparams["melody_encoder_args"])
                self.delta_pitch_embed = Linear(1, hparams["hidden_size"])
            else:
                self.base_pitch_embed = Linear(1, hparams["hidden_size"])
            self.pitch_retake_embed = Embedding(2, hparams["hidden_size"])
            pitch_hparams = hparams["pitch_prediction_args"]
            self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams)
            self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type)
            if self.diffusion_type == "ddpm":
                self.pitch_predictor = PitchDiffusion(
                    vmin=pitch_hparams["pitd_norm_min"],
                    vmax=pitch_hparams["pitd_norm_max"],
                    cmin=pitch_hparams["pitd_clip_min"],
                    cmax=pitch_hparams["pitd_clip_max"],
                    repeat_bins=pitch_hparams["repeat_bins"],
                    timesteps=hparams["timesteps"],
                    k_step=hparams["K_step"],
                    backbone_type=self.pitch_backbone_type,
                    backbone_args=self.pitch_backbone_args,
                )
            elif self.diffusion_type == "reflow":
                self.pitch_predictor = PitchRectifiedFlow(
                    vmin=pitch_hparams["pitd_norm_min"],
                    vmax=pitch_hparams["pitd_norm_max"],
                    cmin=pitch_hparams["pitd_clip_min"],
                    cmax=pitch_hparams["pitd_clip_max"],
                    repeat_bins=pitch_hparams["repeat_bins"],
                    time_scale_factor=hparams["time_scale_factor"],
                    backbone_type=self.pitch_backbone_type,
                    backbone_args=self.pitch_backbone_args,
                )
            else:
                raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
        if self.predict_variances:
            self.pitch_embed = Linear(1, hparams["hidden_size"])
            # One scalar-to-hidden projection per predicted variance.
            self.variance_embeds = paddle.nn.LayerDict(
                sublayers={v_name: Linear(1, hparams["hidden_size"]) for v_name in self.variance_prediction_list}
            )
            if self.diffusion_type == "ddpm":
                self.variance_predictor = self.build_adaptor(cls=MultiVarianceDiffusion)
            elif self.diffusion_type == "reflow":
                self.variance_predictor = self.build_adaptor(cls=MultiVarianceRectifiedFlow)
            else:
                raise NotImplementedError(self.diffusion_type)

    def forward(
        self,
        txt_tokens,
        midi,
        ph2word,
        ph_dur=None,
        word_dur=None,
        mel2ph=None,
        note_midi=None,
        note_rest=None,
        note_dur=None,
        note_glide=None,
        mel2note=None,
        base_pitch=None,
        pitch=None,
        pitch_expr=None,
        pitch_retake=None,
        variance_retake: Dict[str, paddle.Tensor] = None,
        spk_id=None,
        infer=True,
        **kwargs
    ):
        """Predict durations and, when enabled, pitch and variances.

        Args:
            txt_tokens, midi, ph2word, ph_dur, word_dur: forwarded to the encoder.
            mel2ph: frame-to-phoneme index; when ``None`` and ``word_dur`` is given it is
                derived from the predicted durations and padded to ``base_pitch``'s length.
            note_midi, note_rest, note_dur, note_glide, mel2note: melody-encoder inputs,
                used only when the melody encoder is enabled.
            base_pitch, pitch: pitch curves; ``pitch - base_pitch`` is the pitch-predictor
                target in training.
            pitch_expr: optional weight blending the two retake embeddings.
            pitch_retake: boolean mask; ``None`` is treated as "all True".
            variance_retake: per-variance boolean masks keyed by variance name.
            spk_id: speaker ids, embedded unless mix embeddings are given via
                ``kwargs["ph_spk_mix_embed"]`` and ``kwargs["spk_mix_embed"]``.
            infer: inference mode when truthy, training mode otherwise.
            **kwargs: also forwarded to ``collect_variance_inputs``.

        Returns:
            Tuple ``(dur_pred_out, pitch_pred_out, variances_pred_out)``. On the two early
            returns the third element is ``{}`` when ``infer`` is truthy, else ``None``.
        """
        if self.use_spk_id:
            ph_spk_mix_embed = kwargs.get("ph_spk_mix_embed")
            spk_mix_embed = kwargs.get("spk_mix_embed")
            # Pre-mixed speaker embeddings take precedence, but only if both are present.
            if ph_spk_mix_embed is not None and spk_mix_embed is not None:
                ph_spk_embed = ph_spk_mix_embed
                spk_embed = spk_mix_embed
            else:
                ph_spk_embed = spk_embed = self.spk_embed(spk_id)[:, None, :]
        else:
            ph_spk_embed = spk_embed = None
        encoder_out, dur_pred_out = self.fs2(
            txt_tokens,
            midi=midi,
            ph2word=ph2word,
            ph_dur=ph_dur,
            word_dur=word_dur,
            spk_embed=ph_spk_embed,
            infer=infer,
        )
        if not self.predict_pitch and not self.predict_variances:
            # The conditional applies to the third tuple element only.
            return dur_pred_out, None, {} if infer else None
        if mel2ph is None and word_dur is not None:
            dur_pred_align = self.rr(dur_pred_out, ph2word, word_dur)
            mel2ph = self.lr(dur_pred_align)
            mel2ph = paddle.nn.functional.pad(
                x=mel2ph, pad=[0, tuple(base_pitch.shape)[1] - tuple(mel2ph.shape)[1]], pad_from_left_axis=False
            )
        # Prepend one padded step along axis 1 so that index 0 gathers the padding row.
        encoder_out = paddle.nn.functional.pad(x=encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False)
        # Fixed: the key must be the string "hidden_size"; the bare name was undefined (NameError).
        mel2ph_ = mel2ph[..., None].tile(repeat_times=[1, 1, hparams["hidden_size"]])
        condition = paddle.take_along_axis(arr=encoder_out, axis=1, indices=mel2ph_, broadcast=False)
        if self.use_spk_id:
            condition += spk_embed
        if self.predict_pitch:
            if self.use_melody_encoder:
                melody_encoder_out = self.melody_encoder(note_midi, note_rest, note_dur, glide=note_glide)
                melody_encoder_out = paddle.nn.functional.pad(
                    x=melody_encoder_out, pad=[0, 0, 1, 0], pad_from_left_axis=False
                )
                # Fixed: same undefined-name bug as for mel2ph_ above.
                mel2note_ = mel2note[..., None].tile(repeat_times=[1, 1, hparams["hidden_size"]])
                melody_condition = paddle.take_along_axis(
                    arr=melody_encoder_out, axis=1, indices=mel2note_, broadcast=False
                )
                pitch_cond = condition + melody_condition
            else:
                # Clone so the in-place additions below do not modify `condition`,
                # which is reused for the variance branch.
                pitch_cond = condition.clone()
            retake_unset = pitch_retake is None
            if retake_unset:
                pitch_retake = paddle.ones_like(x=mel2ph, dtype="bool")
            if pitch_expr is None:
                pitch_retake_embed = self.pitch_retake_embed(pitch_retake.astype(dtype="int64"))
            else:
                # Blend the "retake" and "keep" embeddings by pitch_expr, restricted to retaken positions.
                retake_true_embed = self.pitch_retake_embed(paddle.ones(shape=[1, 1], dtype="int64"))
                retake_false_embed = self.pitch_retake_embed(paddle.zeros(shape=[1, 1], dtype="int64"))
                pitch_expr = (pitch_expr * pitch_retake)[:, :, None]
                pitch_retake_embed = pitch_expr * retake_true_embed + (1.0 - pitch_expr) * retake_false_embed
            pitch_cond += pitch_retake_embed
            if self.use_melody_encoder:
                if retake_unset:
                    delta_pitch_in = paddle.zeros_like(x=base_pitch)
                else:
                    # Expose the known pitch delta only where nothing is retaken.
                    delta_pitch_in = (pitch - base_pitch) * ~pitch_retake
                pitch_cond += self.delta_pitch_embed(delta_pitch_in[:, :, None])
            else:
                if not retake_unset:
                    # Use base_pitch on retaken positions and the given pitch elsewhere.
                    base_pitch = base_pitch * pitch_retake + pitch * ~pitch_retake
                pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
            if infer:
                pitch_pred_out = self.pitch_predictor(pitch_cond, infer=True)
            else:
                pitch_pred_out = self.pitch_predictor(pitch_cond, pitch - base_pitch, infer=False)
        else:
            pitch_pred_out = None
        if not self.predict_variances:
            return dur_pred_out, pitch_pred_out, {} if infer else None
        if pitch is None:
            pitch = base_pitch + pitch_pred_out
        var_cond = condition + self.pitch_embed(pitch[:, :, None])
        variance_inputs = self.collect_variance_inputs(**kwargs)
        if variance_retake is not None:
            # Embed each given variance, zeroed on positions that are being retaken.
            variance_embeds = [
                (self.variance_embeds[v_name](v_input[:, :, None]) * ~variance_retake[v_name][:, :, None])
                for v_name, v_input in zip(self.variance_prediction_list, variance_inputs)
            ]
            var_cond += paddle.stack(x=variance_embeds, axis=-1).sum(axis=-1)
        variance_outputs = self.variance_predictor(var_cond, variance_inputs, infer=infer)
        if infer:
            variances_pred_out = self.collect_variance_outputs(variance_outputs)
        else:
            variances_pred_out = variance_outputs
        return dur_pred_out, pitch_pred_out, variances_pred_out
|
VLMEvalKit_old/PaddleMIX/paddlemix/models/diffsinger/utils/__init__.py
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import pathlib
|
18 |
+
import re
|
19 |
+
import time
|
20 |
+
import types
|
21 |
+
from collections import OrderedDict
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
import paddle
|
25 |
+
|
26 |
+
from paddlemix.models.diffsinger.basics.base_module import CategorizedModule
|
27 |
+
from paddlemix.models.diffsinger.utils import paddle_aux
|
28 |
+
from paddlemix.models.diffsinger.utils.hparams import hparams
|
29 |
+
|
30 |
+
def tensors_to_scalars(metrics):
    """Return a new dict in which tensor values are replaced by ``.item()`` results.

    Values whose type is exactly ``dict`` are converted recursively; all other
    values are copied over unchanged.
    """

    def _convert(value):
        if isinstance(value, paddle.Tensor):
            value = value.item()
        return tensors_to_scalars(value) if type(value) is dict else value

    return {name: _convert(value) for name, value in metrics.items()}
|
39 |
+
|
40 |
+
|
41 |
+
def collate_nd(values, pad_value=0, max_len=None):
    """
    Pad a list of Nd tensors on their first dimension and stack them into a (N+1)d tensor.

    The padded length is ``max_len`` when given, otherwise the largest first
    dimension among ``values``; unused positions hold ``pad_value``.
    """
    padded_len = max(v.shape[0] for v in values) if max_len is None else max_len
    trailing_dims = tuple(values[0].shape)[1:]
    stacked = paddle.full(
        shape=(len(values), padded_len, *trailing_dims), fill_value=pad_value, dtype=values[0].dtype
    )
    for row, item in enumerate(values):
        # Copy each item into the leading part of its row; the rest keeps pad_value.
        stacked[row, : len(item), ...] = item
    return stacked
|
50 |
+
|
51 |
+
|
52 |
+
def random_continuous_masks(*shape: int, dim: int, device: (str | (paddle.CPUPlace, paddle.CUDAPlace, str)) = "cpu"):  # type: ignore
    """Build a boolean mask that is True on one random contiguous index range along ``dim``.

    Two random integers in ``[0, shape[dim]]`` are drawn per slice, sorted along ``dim``
    and used as ``start`` / ``end``; the mask is ``start <= idx < end``.

    Args:
        *shape: sizes of the mask dimensions.
        dim: axis along which the contiguous range is drawn.
        device: NOTE(review): never referenced in the body, so it currently has no effect.

    Returns:
        The boolean mask (presumably of shape ``shape`` after broadcasting -- TODO confirm).
    """
    # NOTE(review): the tuple below pairs a sorted draw with the argsort of a *second*,
    # independent draw; only element [0] (the sorted values) is used, so the argsort
    # branch is computed and discarded.
    # NOTE(review): `.split(1, dim=dim)` and the var-args `.reshape(...)` further down use
    # torch-style calling conventions; presumably these rely on Tensor patches installed
    # by `paddle_aux` -- verify against that module.
    start, end = (
        paddle.sort(
            x=paddle.randint(
                low=0, high=shape[dim] + 1, shape=(*shape[:dim], 2, *((1,) * (len(shape) - dim - 1)))
            ).expand(shape=[*((-1,) * (dim + 1)), *shape[dim + 1 :]]),
            axis=dim,
        ),
        paddle.argsort(
            x=paddle.randint(
                low=0, high=shape[dim] + 1, shape=(*shape[:dim], 2, *((1,) * (len(shape) - dim - 1)))
            ).expand(shape=[*((-1,) * (dim + 1)), *shape[dim + 1 :]]),
            axis=dim,
        ),
    )[0].split(1, dim=dim)
    # Index ramp 0..shape[dim]-1 laid out along `dim`, size 1 on every other axis.
    idx = paddle.arange(start=0, end=shape[dim], dtype="int64").reshape(
        *((1,) * dim), shape[dim], *((1,) * (len(shape) - dim - 1))
    )
    masks = (idx >= start) & (idx < end)
    return masks
|
72 |
+
|
73 |
+
|
74 |
+
def _is_batch_full(batch, num_frames, max_batch_frames, max_batch_size):
|
75 |
+
if len(batch) == 0:
|
76 |
+
return 0
|
77 |
+
if len(batch) == max_batch_size:
|
78 |
+
return 1
|
79 |
+
if num_frames > max_batch_frames:
|
80 |
+
return 1
|
81 |
+
return 0
|
82 |
+
|
83 |
+
|
84 |
+
def batch_by_size(indices, num_frames_fn, max_batch_frames=80000, max_batch_size=48, required_batch_size_multiple=1):
    """
    Yield mini-batches of indices bucketed by size. Batches may contain
    sequences of different lengths.

    Args:
        indices (List[int]): ordered list of dataset indices
        num_frames_fn (callable): function that returns the number of frames at
            a given index
        max_batch_frames (int, optional): max number of frames in each batch
            (default: 80000).
        max_batch_size (int, optional): max number of sentences in each
            batch (default: 48).
        required_batch_size_multiple: require the batch size to be multiple
            of a given number

    Returns:
        A list of batches, each a list of indices.
    """
    multiple = required_batch_size_multiple
    if isinstance(indices, types.GeneratorType):
        # Generators have no len(); materialise them first.
        indices = np.fromiter(indices, dtype=np.int64, count=-1)

    longest = 0  # longest sample among the pending ones, including the current one
    pending_lens = []
    pending = []
    finished = []
    for position in range(len(indices)):
        idx = indices[position]
        frames = num_frames_fn(idx)
        pending_lens.append(frames)
        longest = max(longest, frames)
        assert (
            longest <= max_batch_frames
        ), "sentence at index {} of size {} exceeds max_batch_samples limit of {}!".format(
            idx, longest, max_batch_frames
        )
        # Cost of the batch if the current sample were added, with every item padded to `longest`.
        padded_frames = (len(pending) + 1) * longest
        if _is_batch_full(pending, padded_frames, max_batch_frames, max_batch_size):
            # Flush the largest prefix whose size is a multiple of `multiple`
            # (or the whole remainder when the batch is smaller than that).
            cut = max(multiple * (len(pending) // multiple), len(pending) % multiple)
            finished.append(pending[:cut])
            pending = pending[cut:]
            pending_lens = pending_lens[cut:]
            longest = max(pending_lens) if len(pending_lens) > 0 else 0
        pending.append(idx)
    if len(pending) > 0:
        finished.append(pending)
    return finished
|
128 |
+
|
129 |
+
|
130 |
+
def make_positions(tensor, padding_idx):
    """Replace non-padding symbols with their position numbers.

    Position numbers begin at padding_idx+1. Padding symbols are ignored.
    """
    pad_value = paddle.to_tensor(padding_idx)
    non_pad = tensor.not_equal(y=pad_value).astype(dtype="int32")
    # Running count of non-padding entries along axis 1, zeroed again on padding.
    running_count = paddle.cumsum(x=non_pad, axis=1).astype(dtype=non_pad.dtype)
    positions = (running_count * non_pad).astype(dtype="int64")
    return positions + padding_idx
|
137 |
+
|
138 |
+
|
139 |
+
def softmax(x, dim):
    """Apply softmax to ``x`` along axis ``dim``, requesting ``float32`` as dtype."""
    softmax_fn = paddle.nn.functional.softmax
    return softmax_fn(x=x, axis=dim, dtype="float32")
|
141 |
+
|
142 |
+
|
143 |
+
def unpack_dict_to_list(samples):
    """Split a batched dict into a list of per-sample dicts.

    The batch size is taken from ``samples["outputs"].shape[0]``. For every
    sample index ``i`` each value is indexed with ``[i]``; entries that cannot
    be indexed that way (scalars, ``None``, too-short sequences, ...) are left
    out of that sample's dict on a best-effort basis.

    Args:
        samples: mapping that contains an ``"outputs"`` entry exposing ``.shape``.

    Returns:
        A list with one dict per sample.
    """
    samples_ = []
    bsz = samples.get("outputs").shape[0]
    for i in range(bsz):
        res = {}
        for k, v in samples.items():
            try:
                res[k] = v[i]
            except Exception:
                # Deliberately best-effort, but no longer a bare ``except``:
                # KeyboardInterrupt / SystemExit must still propagate.
                continue
        samples_.append(res)
    return samples_
|
155 |
+
|
156 |
+
|
157 |
+
def filter_kwargs(dict_to_filter, kwarg_obj):
    """Keep only the entries of ``dict_to_filter`` that ``kwarg_obj`` accepts by name.

    If the callable's signature has a ``**kwargs`` parameter, a shallow copy of
    the whole dict is returned. Otherwise the result contains the keys matching
    positional-or-keyword or keyword-only parameters, in signature order.
    """
    import inspect

    params = list(inspect.signature(kwarg_obj).parameters.values())
    for param in params:
        if param.kind == param.VAR_KEYWORD:
            # The callable takes arbitrary keywords, so nothing needs filtering.
            return dict_to_filter.copy()

    nameable_kinds = (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
    accepted = [param.name for param in params if param.kind in nameable_kinds]
    return {name: dict_to_filter[name] for name in accepted if name in dict_to_filter}
|
172 |
+
|
173 |
+
|
174 |
+
def load_ckpt(
    cur_model,
    ckpt_base_dir,
    ckpt_steps=None,
    prefix_in_ckpt="model",
    ignored_prefixes=None,
    key_in_ckpt="state_dict",
    strict=True,
    device="cpu",
):
    """Locate a checkpoint file, load it with ``paddle.load`` and apply it to ``cur_model``.

    Args:
        cur_model: model receiving the weights through ``set_state_dict``.
        ckpt_base_dir: a checkpoint file, or a directory containing
            ``model_ckpt_steps_<N>.ckpt`` files.
        ckpt_steps: when given (and ``ckpt_base_dir`` is not a file), load exactly
            ``model_ckpt_steps_<ckpt_steps>.ckpt``; otherwise the file with the
            largest step number in the directory is used.
        prefix_in_ckpt: when not ``None``, keep only keys starting with
            ``"<prefix>."`` and strip that prefix.
        ignored_prefixes: key prefixes to drop; defaults to
            ``["model.fs2.encoder.embed_tokens"]``. Only applied when
            ``prefix_in_ckpt`` is not ``None``.
        key_in_ckpt: key of the state dict inside the loaded checkpoint; ``None``
            means the loaded object itself is the state dict.
        strict: when falsy, entries whose shape differs from the model's parameter
            of the same name are reported and removed before loading.
        device: NOTE(review): never referenced in the body, so it currently has no effect.

    Raises:
        AssertionError: if no matching checkpoint file is found in the directory.
    """
    if ignored_prefixes is None:
        ignored_prefixes = ["model.fs2.encoder.embed_tokens"]
    if not isinstance(ckpt_base_dir, pathlib.Path):
        ckpt_base_dir = pathlib.Path(ckpt_base_dir)
    # checkpoint_path is a candidate list here; the last element is selected below.
    if ckpt_base_dir.is_file():
        checkpoint_path = [ckpt_base_dir]
    elif ckpt_steps is not None:
        checkpoint_path = [ckpt_base_dir / f"model_ckpt_steps_{int(ckpt_steps)}.ckpt"]
    else:
        base_dir = ckpt_base_dir
        # Sort numerically by the first run of digits in the file name.
        checkpoint_path = sorted(
            [
                ckpt_file
                for ckpt_file in base_dir.iterdir()
                if ckpt_file.is_file() and re.fullmatch("model_ckpt_steps_\\d+\\.ckpt", ckpt_file.name)
            ],
            key=lambda x: int(re.search("\\d+", x.name).group(0)),
        )
    assert len(checkpoint_path) > 0, f"| ckpt not found in {ckpt_base_dir}."
    checkpoint_path = checkpoint_path[-1]
    ckpt_loaded = paddle.load(path=str(checkpoint_path))
    if isinstance(cur_model, CategorizedModule):
        # Let the model check the checkpoint's "category" entry (may be None if absent).
        cur_model.check_category(ckpt_loaded.get("category"))
    if key_in_ckpt is None:
        state_dict = ckpt_loaded
    else:
        state_dict = ckpt_loaded[key_in_ckpt]
    if prefix_in_ckpt is not None:
        # Keep "<prefix>.xxx" keys that match no ignored prefix, renamed to "xxx".
        # The ignored prefixes are matched against the original, unstripped key.
        state_dict = OrderedDict(
            {
                k[len(prefix_in_ckpt) + 1 :]: v
                for k, v in state_dict.items()
                if k.startswith(f"{prefix_in_ckpt}.")
                if all(not k.startswith(p) for p in ignored_prefixes)
            }
        )
    if not strict:
        cur_model_state_dict = cur_model.state_dict()
        unmatched_keys = []
        for key, param in state_dict.items():
            if key in cur_model_state_dict:
                new_param = cur_model_state_dict[key]
                if tuple(new_param.shape) != tuple(param.shape):
                    unmatched_keys.append(key)
                    print("| Unmatched keys: ", key, tuple(new_param.shape), tuple(param.shape))
        # Deleted in a second pass to avoid mutating the dict while iterating it.
        for key in unmatched_keys:
            del state_dict[key]
    cur_model.set_state_dict(state_dict=state_dict)
    # Name used only in the log line below.
    shown_model_name = "state dict"
    if prefix_in_ckpt is not None:
        shown_model_name = f"'{prefix_in_ckpt}'"
    elif key_in_ckpt is not None:
        shown_model_name = f"'{key_in_ckpt}'"
    print(f"| load {shown_model_name} from '{checkpoint_path}'.")
|
238 |
+
|
239 |
+
|
240 |
+
def remove_padding(x, padding_idx=0):
    """Drop padding entries from a 1-D or 2-D array.

    For 1-D input, elements equal to ``padding_idx`` are removed. For 2-D input,
    rows whose sum of absolute values equals ``padding_idx`` are removed.
    ``None`` is passed through unchanged.
    """
    if x is None:
        return None
    rank = len(tuple(x.shape))
    assert rank in [1, 2]
    if rank == 1:
        return x[x != padding_idx]
    if rank == 2:
        row_magnitude = np.abs(x).sum(-1)
        return x[row_magnitude != padding_idx]
    return None
|
248 |
+
|
249 |
+
|
250 |
+
class Timer:
    """Context manager that accumulates wall-clock time per name.

    Totals are kept in the class-level ``timer_map`` and are therefore shared
    by every ``Timer`` instance using the same name.
    """

    # name -> accumulated seconds
    timer_map = {}

    def __init__(self, name, print_time=False):
        # Register the name with a zero total on first use.
        Timer.timer_map.setdefault(name, 0)
        self.name = name
        self.print_time = print_time

    def __enter__(self):
        self.t = time.time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed = time.time() - self.t
        Timer.timer_map[self.name] += elapsed
        if self.print_time:
            print(self.name, Timer.timer_map[self.name])
|
266 |
+
|
267 |
+
|
268 |
+
def print_arch(model, model_name="model"):
    """Print ``model`` to stdout, preceded by a header containing ``model_name``."""
    header = f"| {model_name} Arch: "
    print(header, model)
|
270 |
+
|
271 |
+
|
272 |
+
def num_params(model, print_out=True, model_name="model"):
    """Count the elements of all parameters with ``stop_gradient`` unset, in millions.

    Args:
        model: object exposing ``parameters()``.
        print_out: when truthy, also print the count.
        model_name: label used in the printed line.

    Returns:
        The parameter count divided by 1,000,000.
    """
    trainable = [p for p in model.parameters() if not p.stop_gradient]
    total_elements = sum(np.prod(tuple(p.shape)) for p in trainable)
    parameters = total_elements / 1000000
    if print_out:
        print(f"| {model_name} Trainable Parameters: %.3fM" % parameters)
    return parameters
|
278 |
+
|
279 |
+
|
280 |
+
def build_object_from_class_name(cls_str, parent_cls, *args, **kwargs):
    """Import the class named by the dotted path ``cls_str`` and instantiate it.

    Positional arguments are passed through unchanged; keyword arguments are
    first reduced by ``filter_kwargs`` to those the class accepts. When
    ``parent_cls`` is not ``None`` the class is asserted to be a subclass of it.
    """
    import importlib

    module_path, _, class_name = cls_str.rpartition(".")
    target_cls = getattr(importlib.import_module(module_path), class_name)
    if parent_cls is not None:
        assert issubclass(target_cls, parent_cls), f"| {target_cls} is not subclass of {parent_cls}."
    accepted_kwargs = filter_kwargs(kwargs, target_cls)
    return target_cls(*args, **accepted_kwargs)
|
289 |
+
|
290 |
+
|
291 |
+
def build_lr_scheduler_from_config(optimizer, scheduler_args):
    """Instantiate an LR scheduler, including nested schedulers, from a config dict.

    Every dict in ``scheduler_args`` that has a ``"cls"`` key is replaced by an
    instance of that class (built with ``optimizer`` injected as a keyword
    argument); the top-level object is then built from
    ``scheduler_args["scheduler_cls"]``.

    Raises:
        ValueError: a nested ``cls`` equals the ChainedScheduler path while the
            top-level ``scheduler_cls`` equals the SequentialLR path.
    """
    # Imported locally; used as the required base class for every scheduler built here.
    from paddle.optimizer.lr import LRScheduler as LRScheduler

    def helper(params):
        """Recursively resolve lists and dicts, instantiating dicts that carry "cls"."""
        if isinstance(params, list):
            return [helper(s) for s in params]
        elif isinstance(params, dict):
            # Resolve children first so nested schedulers exist before their parent.
            resolved = {k: helper(v) for k, v in params.items()}
            if "cls" in resolved:
                # NOTE(review): these literals are torch class paths; confirm they are
                # still the values used in this port's configs.
                if (
                    resolved["cls"] == "torch.optim.lr_scheduler.ChainedScheduler"
                    and scheduler_args["scheduler_cls"] == "torch.optim.lr_scheduler.SequentialLR"
                ):
                    raise ValueError(f"ChainedScheduler cannot be part of a SequentialLR.")
                resolved["optimizer"] = optimizer
                obj = build_object_from_class_name(resolved["cls"], LRScheduler, **resolved)
                return obj
            return resolved
        else:
            return params

    resolved = helper(scheduler_args)
    resolved["optimizer"] = optimizer
    return build_object_from_class_name(scheduler_args["scheduler_cls"], LRScheduler, **resolved)
|
317 |
+
|
318 |
+
|
319 |
+
def simulate_lr_scheduler(optimizer_args, scheduler_args, step_count, num_param_groups=1):
    """Step a freshly built scheduler ``step_count`` times and return its state dict.

    An optimizer is built from ``optimizer_args["optimizer_cls"]`` over
    ``num_param_groups`` parameter groups, each holding one parameter created
    from an empty tensor, so no real model is needed.

    Args:
        optimizer_args: optimizer config; must contain ``"optimizer_cls"`` and ``"lr"``.
        scheduler_args: scheduler config passed to ``build_lr_scheduler_from_config``.
        step_count: number of ``scheduler.step()`` calls to perform.
        num_param_groups: number of parameter groups to create.

    Returns:
        ``scheduler.state_dict()`` after stepping.
    """
    optimizer = build_object_from_class_name(
        optimizer_args["optimizer_cls"],
        paddle.optimizer.Optimizer,
        [
            {
                "params": paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.to_tensor([])),
                "initial_lr": optimizer_args["lr"],
            }
            for _ in range(num_param_groups)
        ],
        **optimizer_args,
    )
    scheduler = build_lr_scheduler_from_config(optimizer, scheduler_args)
    # NOTE(review): sets a private counter on scheduler.optimizer; presumably carried over
    # from the torch original to mark the optimizer as already stepped -- confirm that the
    # paddle scheduler exposes `.optimizer` and reads `_step_count`.
    scheduler.optimizer._step_count = 1
    for _ in range(step_count):
        scheduler.step()
    return scheduler.state_dict()
|
337 |
+
|
338 |
+
|
339 |
+
def remove_suffix(string: str, suffix: str):
    """Return ``string`` without a trailing ``suffix``, if it ends with one.

    Args:
        string: text to trim.
        suffix: suffix to remove; an empty suffix leaves ``string`` unchanged.

    Returns:
        The trimmed string, or ``string`` itself when it does not end with ``suffix``.
    """
    # An empty suffix must be special-cased: every string "ends with" "", and
    # string[:-0] is string[:0], which would wrongly return "".
    if suffix and string.endswith(suffix):
        string = string[: -len(suffix)]
    return string
|