Spaces:
Running
Running
Update configurations, including the Yaoyin character prompt, the default setup, and the SVS model choices
Browse files
- README.md +17 -35
- characters/Yaoyin.py +2 -2
- config/cli/yaoyin_test.yaml +4 -5
- evaluation/svs_eval.py +1 -1
- interface.py +1 -0
- modules/llm/gemma.py +1 -1
- modules/svs/espnet.py +22 -4
README.md
CHANGED
|
@@ -98,41 +98,23 @@ The system supports multiple preset characters:
|
|
| 98 |
|
| 99 |
```
|
| 100 |
SingingSDS/
|
| 101 |
-
├── cli.py
|
| 102 |
-
├──
|
| 103 |
-
├──
|
| 104 |
-
├──
|
| 105 |
-
├──
|
| 106 |
-
├──
|
| 107 |
-
│ ├──
|
| 108 |
-
│
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
│ └── utils/ # Utility modules
|
| 119 |
-
│ ├── g2p.py # Grapheme-to-phoneme conversion
|
| 120 |
-
│ ├── text_normalize.py # Text normalization
|
| 121 |
-
│ └── resources/ # Utility resources
|
| 122 |
-
├── characters/ # Character definitions
|
| 123 |
-
│ ├── base.py # Base character class
|
| 124 |
-
│ ├── Limei.py # Limei character definition
|
| 125 |
-
│ ├── Yaoyin.py # Yaoyin character definition
|
| 126 |
-
│ └── __init__.py # Character module initialization
|
| 127 |
-
├── evaluation/ # Evaluation modules
|
| 128 |
-
│ └── svs_eval.py # SVS evaluation metrics
|
| 129 |
-
├── data/ # Data directory
|
| 130 |
-
│ ├── kising/ # Kising dataset
|
| 131 |
-
│ └── touhou/ # Touhou dataset
|
| 132 |
-
├── resources/ # Project resources
|
| 133 |
-
├── data_handlers/ # Data handling utilities
|
| 134 |
-
├── assets/ # Static assets
|
| 135 |
-
└── tests/ # Test files
|
| 136 |
```
|
| 137 |
|
| 138 |
## Contributing
|
|
|
|
| 98 |
|
| 99 |
```
|
| 100 |
SingingSDS/
|
| 101 |
+
├── app.py, cli.py # Entry points (demo app & CLI)
|
| 102 |
+
├── pipeline.py # Main orchestration pipeline
|
| 103 |
+
├── interface.py # Gradio interface
|
| 104 |
+
├── characters/ # Virtual character definitions
|
| 105 |
+
├── modules/ # Core modules
|
| 106 |
+
│ ├── asr/ # ASR models (Whisper, Paraformer)
|
| 107 |
+
│ ├── llm/ # LLMs (Gemini, LLaMA, etc.)
|
| 108 |
+
│ ├── svs/ # Singing voice synthesis (ESPnet)
|
| 109 |
+
│ └── utils/ # G2P, text normalization, resources
|
| 110 |
+
├── config/ # YAML configuration files
|
| 111 |
+
├── data/ # Dataset metadata and length info
|
| 112 |
+
├── data_handlers/ # Parsers for KiSing, Touhou, etc.
|
| 113 |
+
├── evaluation/ # Evaluation metrics
|
| 114 |
+
├── resources/ # Singer embeddings, phoneme dicts, MIDI
|
| 115 |
+
├── assets/ # Character visuals
|
| 116 |
+
├── tests/ # Unit tests and sample audios
|
| 117 |
+
└── README.md, requirements.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
```
|
| 119 |
|
| 120 |
## Contributing
|
characters/Yaoyin.py
CHANGED
|
@@ -9,8 +9,8 @@ def get_character():
|
|
| 9 |
prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
|
| 10 |
你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
|
| 15 |
过往经历:
|
| 16 |
(1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
|
|
|
|
| 9 |
prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
|
| 10 |
你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
|
| 11 |
|
| 12 |
+
性格特征:洒脱自由、亲切随和、求知若渴、直率倔强
|
| 13 |
+
说话风格:语气轻快,说话随意,偶尔带点山野方言(如"哩""哟")。日常聊天直接、清楚。
|
| 14 |
人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
|
| 15 |
过往经历:
|
| 16 |
(1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
|
config/cli/yaoyin_test.yaml
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
-
asr_model: openai/whisper-
|
| 2 |
-
llm_model:
|
| 3 |
-
svs_model: espnet/
|
| 4 |
melody_source: sample-lyric-kising
|
| 5 |
language: mandarin
|
| 6 |
-
max_sentences: 1
|
| 7 |
prompt_template_character: Yaoyin
|
| 8 |
-
speaker:
|
| 9 |
cache_dir: .cache
|
| 10 |
|
| 11 |
track_latency: True
|
|
|
|
| 1 |
+
asr_model: openai/whisper-medium
|
| 2 |
+
llm_model: meta-llama/Llama-3.1-8B-Instruct
|
| 3 |
+
svs_model: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
|
| 4 |
melody_source: sample-lyric-kising
|
| 5 |
language: mandarin
|
|
|
|
| 6 |
prompt_template_character: Yaoyin
|
| 7 |
+
speaker: resources/singer/singer_embedding_ace-2.npy
|
| 8 |
cache_dir: .cache
|
| 9 |
|
| 10 |
track_latency: True
|
evaluation/svs_eval.py
CHANGED
|
@@ -11,7 +11,7 @@ from pathlib import Path
|
|
| 11 |
def init_singmos():
|
| 12 |
print("[Init] Loading SingMOS...")
|
| 13 |
return torch.hub.load(
|
| 14 |
-
"South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True
|
| 15 |
)
|
| 16 |
|
| 17 |
|
|
|
|
| 11 |
def init_singmos():
|
| 12 |
print("[Init] Loading SingMOS...")
|
| 13 |
return torch.hub.load(
|
| 14 |
+
"South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True, cache_dir="cache"
|
| 15 |
)
|
| 16 |
|
| 17 |
|
interface.py
CHANGED
|
@@ -202,6 +202,7 @@ class GradioInterface:
|
|
| 202 |
|
| 203 |
def update_melody_source(self, melody_source):
|
| 204 |
self.current_melody_source = melody_source
|
|
|
|
| 205 |
return gr.update(value=self.current_melody_source)
|
| 206 |
|
| 207 |
def update_voice(self, voice):
|
|
|
|
| 202 |
|
| 203 |
def update_melody_source(self, melody_source):
|
| 204 |
self.current_melody_source = melody_source
|
| 205 |
+
self.pipeline.set_melody_controller(melody_source)
|
| 206 |
return gr.update(value=self.current_melody_source)
|
| 207 |
|
| 208 |
def update_voice(self, voice):
|
modules/llm/gemma.py
CHANGED
|
@@ -9,7 +9,7 @@ from .registry import register_llm_model
|
|
| 9 |
hf_token = os.getenv("HF_TOKEN")
|
| 10 |
|
| 11 |
|
| 12 |
-
@register_llm_model("google/gemma-")
|
| 13 |
class GemmaLLM(AbstractLLMModel):
|
| 14 |
def __init__(
|
| 15 |
self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
|
|
|
|
| 9 |
hf_token = os.getenv("HF_TOKEN")
|
| 10 |
|
| 11 |
|
| 12 |
+
@register_llm_model("google/gemma-2-")
|
| 13 |
class GemmaLLM(AbstractLLMModel):
|
| 14 |
def __init__(
|
| 15 |
self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
|
modules/svs/espnet.py
CHANGED
|
@@ -17,6 +17,7 @@ class ESPNetSVS(AbstractSVSModel):
|
|
| 17 |
def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
|
| 18 |
from espnet2.bin.svs_inference import SingingGenerate
|
| 19 |
from espnet_model_zoo.downloader import ModelDownloader
|
|
|
|
| 20 |
if device == "auto":
|
| 21 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 22 |
self.device = device
|
|
@@ -35,7 +36,10 @@ class ESPNetSVS(AbstractSVSModel):
|
|
| 35 |
phoneme_mappers = {
|
| 36 |
"mandarin": pinyin_to_phonemes_opencpop,
|
| 37 |
}
|
| 38 |
-
elif self.model_id
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def mandarin_mapper(pinyin: str) -> list[str]:
|
| 41 |
phns = pinyin_to_phonemes_ace(pinyin)
|
|
@@ -53,7 +57,11 @@ class ESPNetSVS(AbstractSVSModel):
|
|
| 53 |
phoneme_mappers = {}
|
| 54 |
return phoneme_mappers
|
| 55 |
|
| 56 |
-
def _preprocess(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
if language not in self.phoneme_mappers:
|
| 58 |
raise ValueError(f"Unsupported language: {language} for {self.model_id}")
|
| 59 |
phoneme_mapper = self.phoneme_mappers[language]
|
|
@@ -88,6 +96,9 @@ class ESPNetSVS(AbstractSVSModel):
|
|
| 88 |
notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
|
| 89 |
phns.extend(phn_units)
|
| 90 |
pre_phn = phn_units[-1]
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
batch = {
|
| 93 |
"score": (
|
|
@@ -99,13 +110,20 @@ class ESPNetSVS(AbstractSVSModel):
|
|
| 99 |
return batch
|
| 100 |
|
| 101 |
def synthesize(
|
| 102 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
):
|
| 104 |
batch = self._preprocess(score, language)
|
| 105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
| 106 |
sid = np.array([int(speaker)])
|
| 107 |
output_dict = self.model(batch, sids=sid)
|
| 108 |
-
elif self.model_id
|
|
|
|
|
|
|
|
|
|
| 109 |
langs = {
|
| 110 |
"mandarin": 2,
|
| 111 |
"japanese": 1,
|
|
|
|
| 17 |
def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
|
| 18 |
from espnet2.bin.svs_inference import SingingGenerate
|
| 19 |
from espnet_model_zoo.downloader import ModelDownloader
|
| 20 |
+
|
| 21 |
if device == "auto":
|
| 22 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 23 |
self.device = device
|
|
|
|
| 36 |
phoneme_mappers = {
|
| 37 |
"mandarin": pinyin_to_phonemes_opencpop,
|
| 38 |
}
|
| 39 |
+
elif self.model_id in [
|
| 40 |
+
"espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
|
| 41 |
+
"espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
|
| 42 |
+
]:
|
| 43 |
|
| 44 |
def mandarin_mapper(pinyin: str) -> list[str]:
|
| 45 |
phns = pinyin_to_phonemes_ace(pinyin)
|
|
|
|
| 57 |
phoneme_mappers = {}
|
| 58 |
return phoneme_mappers
|
| 59 |
|
| 60 |
+
def _preprocess(
|
| 61 |
+
self,
|
| 62 |
+
score: list[tuple[float, float, str, int] | tuple[float, float, str, float]],
|
| 63 |
+
language: str,
|
| 64 |
+
):
|
| 65 |
if language not in self.phoneme_mappers:
|
| 66 |
raise ValueError(f"Unsupported language: {language} for {self.model_id}")
|
| 67 |
phoneme_mapper = self.phoneme_mappers[language]
|
|
|
|
| 96 |
notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
|
| 97 |
phns.extend(phn_units)
|
| 98 |
pre_phn = phn_units[-1]
|
| 99 |
+
# add a silence at the end
|
| 100 |
+
notes.append((notes[-1][1], notes[-1][1] + 0.2, "AP", 0, "AP"))
|
| 101 |
+
phns.append("AP")
|
| 102 |
|
| 103 |
batch = {
|
| 104 |
"score": (
|
|
|
|
| 110 |
return batch
|
| 111 |
|
| 112 |
def synthesize(
|
| 113 |
+
self,
|
| 114 |
+
score: list[tuple[float, float, str, float] | tuple[float, float, str, int]],
|
| 115 |
+
language: str,
|
| 116 |
+
speaker: str,
|
| 117 |
+
**kwargs,
|
| 118 |
):
|
| 119 |
batch = self._preprocess(score, language)
|
| 120 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
| 121 |
sid = np.array([int(speaker)])
|
| 122 |
output_dict = self.model(batch, sids=sid)
|
| 123 |
+
elif self.model_id in [
|
| 124 |
+
"espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
|
| 125 |
+
"espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
|
| 126 |
+
]:
|
| 127 |
langs = {
|
| 128 |
"mandarin": 2,
|
| 129 |
"japanese": 1,
|