Spaces:
Running
Running
Yiwen Zhao
committed on
Commit
·
fb76561
1
Parent(s):
5ec9f02
unify configs & add readme
Browse files- .gitignore +1 -0
- README.md +11 -0
- svs_utils.py +33 -33
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
*cache*
|
|
|
|
|
|
| 1 |
*cache*
|
| 2 |
+
*.wav
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Singing Dialogue System
|
| 2 |
+
|
| 3 |
+
Currently supports Japanese and Chinese singing conversation.
|
| 4 |
+
* Requires an ESPnet environment
|
| 5 |
+
* The pretrained SVS model will be downloaded to ``./cache/``
|
| 6 |
+
* Modify configs at ``./svs_utils.py#L326``
|
| 7 |
+
|
| 8 |
+
```
|
| 9 |
+
cd SingingSDS
|
| 10 |
+
python svs_utils.py
|
| 11 |
+
```
|
svs_utils.py
CHANGED
|
@@ -11,7 +11,9 @@ import torch
|
|
| 11 |
import numpy as np
|
| 12 |
import random
|
| 13 |
import json
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# the code below should be in app.py rather than svs_utils.py
|
| 17 |
# espnet_model_dict = {
|
|
@@ -165,7 +167,7 @@ def svs_get_batch(model_path, answer_text, lang, random_gen=True):
|
|
| 165 |
"text": phns_str,
|
| 166 |
}
|
| 167 |
|
| 168 |
-
print(batch)
|
| 169 |
return batch
|
| 170 |
|
| 171 |
|
|
@@ -319,49 +321,47 @@ def load_song_database():
|
|
| 319 |
|
| 320 |
|
| 321 |
if __name__ == "__main__":
|
| 322 |
-
import argparse
|
| 323 |
|
| 324 |
# -------- demo code for generating audio from a randomly selected song ---------#
|
| 325 |
config = argparse.Namespace(
|
| 326 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
| 327 |
cache_dir="cache",
|
| 328 |
-
device="cpu"
|
| 329 |
-
melody_source="random_select.take_lyric_continuation"
|
| 330 |
lang="zh",
|
| 331 |
)
|
| 332 |
|
| 333 |
# load model
|
| 334 |
model = svs_warmup(config)
|
| 335 |
|
| 336 |
-
|
| 337 |
-
song2note_lengths, song_db = load_song_database()
|
| 338 |
|
| 339 |
-
|
| 340 |
-
phrase_length, metadata = estimate_sentence_length(None, config, song2note_lengths)
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
config.model_path, answer_text, config.lang
|
| 347 |
-
)
|
| 348 |
-
segment_iterator = song_segment_iterator(song_db, metadata)
|
| 349 |
-
batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels, config)
|
| 350 |
-
singer_embedding = np.load(singer_embeddings[config.model_path]["singer2 (female)"])
|
| 351 |
-
lid = np.array([langs[config.lang]])
|
| 352 |
-
output_dict = model(batch, lids=lid, spembs=singer_embedding)
|
| 353 |
-
wav_info = output_dict["wav"].cpu().numpy()
|
| 354 |
-
# write wav to output_retrieved.wav
|
| 355 |
-
import soundfile as sf
|
| 356 |
|
| 357 |
-
|
|
|
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import numpy as np
|
| 12 |
import random
|
| 13 |
import json
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import soundfile as sf
|
| 17 |
|
| 18 |
# the code below should be in app.py rather than svs_utils.py
|
| 19 |
# espnet_model_dict = {
|
|
|
|
| 167 |
"text": phns_str,
|
| 168 |
}
|
| 169 |
|
| 170 |
+
# print(batch)
|
| 171 |
return batch
|
| 172 |
|
| 173 |
|
|
|
|
| 321 |
|
| 322 |
|
| 323 |
if __name__ == "__main__":
|
|
|
|
| 324 |
|
| 325 |
# -------- demo code for generating audio from a randomly selected song ---------#
|
| 326 |
config = argparse.Namespace(
|
| 327 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
| 328 |
cache_dir="cache",
|
| 329 |
+
device="cuda", # "cpu"
|
| 330 |
+
melody_source="random_generate", # "random_select.take_lyric_continuation"
|
| 331 |
lang="zh",
|
| 332 |
)
|
| 333 |
|
| 334 |
# load model
|
| 335 |
model = svs_warmup(config)
|
| 336 |
|
| 337 |
+
answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
|
|
|
|
| 338 |
|
| 339 |
+
sample_rate = 44100
|
|
|
|
| 340 |
|
| 341 |
+
if config.melody_source.startswith("random_select"):
|
| 342 |
+
# load song database: jhansss/kising_score_segments
|
| 343 |
+
from datasets import load_dataset
|
| 344 |
+
song2note_lengths, song_db = load_song_database()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
| 346 |
+
# get song_name and phrase_length
|
| 347 |
+
phrase_length, metadata = estimate_sentence_length(None, config, song2note_lengths)
|
| 348 |
|
| 349 |
+
# next, the phrase_length info should be added to the LLM prompt, and the answer lyrics obtained from the LLM
|
| 350 |
+
# e.g. answer_text = "天气真好\n空气清新"
|
| 351 |
+
lyric_ls, sybs, labels = svs_text_preprocessor(
|
| 352 |
+
config.model_path, answer_text, config.lang
|
| 353 |
+
)
|
| 354 |
+
segment_iterator = song_segment_iterator(song_db, metadata)
|
| 355 |
+
batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels, config)
|
| 356 |
+
singer_embedding = np.load(singer_embeddings[config.model_path]["singer2 (female)"])
|
| 357 |
+
lid = np.array([langs[config.lang]])
|
| 358 |
+
output_dict = model(batch, lids=lid, spembs=singer_embedding)
|
| 359 |
+
wav_info = output_dict["wav"].cpu().numpy()
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
elif config.melody_source.startswith("random_generate"):
|
| 363 |
+
wav_info = svs_inference(config.model_path, model, answer_text, lang=config.lang, random_gen=True, fs=sample_rate)
|
| 364 |
+
|
| 365 |
+
# write wav to "<save_name>.wav", where save_name is the part of config.melody_source before the first "." (e.g. random_generate.wav)
|
| 366 |
+
save_name = config.melody_source.split('.')[0]
|
| 367 |
+
sf.write(f"{save_name}.wav", wav_info, samplerate=sample_rate)
|