jhansss committed on
Commit
48162db
·
1 Parent(s): 3c580bf

Update configurations, including the Yaoyin character prompt, default setup, and SVS model choices

Browse files
README.md CHANGED
@@ -98,41 +98,23 @@ The system supports multiple preset characters:
98
 
99
  ```
100
  SingingSDS/
101
- ├── cli.py # Command line interface
102
- ├── interface.py # Gradio interface
103
- ├── pipeline.py # Core processing pipeline
104
- ├── app.py # Web application entry
105
- ├── requirements.txt # Python dependencies
106
- ├── config/ # Configuration files
107
- │ ├── cli/ # CLI-specific configuration
108
- └── interface/ # Interface-specific configuration
109
- ├── modules/ # Core modules
110
- ├── asr.py # Speech recognition module
111
- ├── llm.py # Large language model module
112
- ├── melody.py # Melody control module
113
- ├── svs/ # Singing voice synthesis modules
114
- │ │ ├── base.py # Base SVS class
115
- │ │ ├── espnet.py # ESPnet SVS implementation
116
- │ │ ├── registry.py # SVS model registry
117
- │ │ └── __init__.py # SVS module initialization
118
- │ └── utils/ # Utility modules
119
- │ ├── g2p.py # Grapheme-to-phoneme conversion
120
- │ ├── text_normalize.py # Text normalization
121
- │ └── resources/ # Utility resources
122
- ├── characters/ # Character definitions
123
- │ ├── base.py # Base character class
124
- │ ├── Limei.py # Limei character definition
125
- │ ├── Yaoyin.py # Yaoyin character definition
126
- │ └── __init__.py # Character module initialization
127
- ├── evaluation/ # Evaluation modules
128
- │ └── svs_eval.py # SVS evaluation metrics
129
- ├── data/ # Data directory
130
- │ ├── kising/ # Kising dataset
131
- │ └── touhou/ # Touhou dataset
132
- ├── resources/ # Project resources
133
- ├── data_handlers/ # Data handling utilities
134
- ├── assets/ # Static assets
135
- └── tests/ # Test files
136
  ```
137
 
138
  ## Contributing
 
98
 
99
  ```
100
  SingingSDS/
101
+ ├── app.py, cli.py # Entry points (demo app & CLI)
102
+ ├── pipeline.py # Main orchestration pipeline
103
+ ├── interface.py # Gradio interface
104
+ ├── characters/ # Virtual character definitions
105
+ ├── modules/ # Core modules
106
+ │ ├── asr/ # ASR models (Whisper, Paraformer)
107
+ │ ├── llm/ # LLMs (Gemini, LLaMA, etc.)
108
+ │ ├── svs/ # Singing voice synthesis (ESPnet)
109
+ │ └── utils/ # G2P, text normalization, resources
110
+ ├── config/ # YAML configuration files
111
+ ├── data/ # Dataset metadata and length info
112
+ ├── data_handlers/ # Parsers for KiSing, Touhou, etc.
113
+ ├── evaluation/ # Evaluation metrics
114
+ ├── resources/ # Singer embeddings, phoneme dicts, MIDI
115
+ ├── assets/ # Character visuals
116
+ ├── tests/ # Unit tests and sample audios
117
+ └── README.md, requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  ```
119
 
120
  ## Contributing
characters/Yaoyin.py CHANGED
@@ -9,8 +9,8 @@ def get_character():
9
  prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
11
 
12
- 性格特征:洒脱自由、亲切随和、求知若渴、敏锐细腻
13
- 说话风格:语气轻快,偶尔带点山野方言(如"哩""哟");习惯用短歌或民谣表达想法。
14
  人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
15
  过往经历:
16
  (1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
 
9
  prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
11
 
12
+ 性格特征:洒脱自由、亲切随和、求知若渴、直率倔强
13
+ 说话风格:语气轻快,说话随意,偶尔带点山野方言(如"哩""哟")。日常聊天直接、清楚。
14
  人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
15
  过往经历:
16
  (1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
config/cli/yaoyin_test.yaml CHANGED
@@ -1,11 +1,10 @@
1
- asr_model: openai/whisper-small
2
- llm_model: google/gemma-2-2b
3
- svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
4
  melody_source: sample-lyric-kising
5
  language: mandarin
6
- max_sentences: 1
7
  prompt_template_character: Yaoyin
8
- speaker: 9
9
  cache_dir: .cache
10
 
11
  track_latency: True
 
1
+ asr_model: openai/whisper-medium
2
+ llm_model: meta-llama/Llama-3.1-8B-Instruct
3
+ svs_model: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
4
  melody_source: sample-lyric-kising
5
  language: mandarin
 
6
  prompt_template_character: Yaoyin
7
+ speaker: resources/singer/singer_embedding_ace-2.npy
8
  cache_dir: .cache
9
 
10
  track_latency: True
evaluation/svs_eval.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
11
  def init_singmos():
12
  print("[Init] Loading SingMOS...")
13
  return torch.hub.load(
14
- "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True
15
  )
16
 
17
 
 
11
  def init_singmos():
12
  print("[Init] Loading SingMOS...")
13
  return torch.hub.load(
14
+ "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True, cache_dir="cache"
15
  )
16
 
17
 
interface.py CHANGED
@@ -202,6 +202,7 @@ class GradioInterface:
202
 
203
  def update_melody_source(self, melody_source):
204
  self.current_melody_source = melody_source
 
205
  return gr.update(value=self.current_melody_source)
206
 
207
  def update_voice(self, voice):
 
202
 
203
  def update_melody_source(self, melody_source):
204
  self.current_melody_source = melody_source
205
+ self.pipeline.set_melody_controller(melody_source)
206
  return gr.update(value=self.current_melody_source)
207
 
208
  def update_voice(self, voice):
modules/llm/gemma.py CHANGED
@@ -9,7 +9,7 @@ from .registry import register_llm_model
9
  hf_token = os.getenv("HF_TOKEN")
10
 
11
 
12
- @register_llm_model("google/gemma-")
13
  class GemmaLLM(AbstractLLMModel):
14
  def __init__(
15
  self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
 
9
  hf_token = os.getenv("HF_TOKEN")
10
 
11
 
12
+ @register_llm_model("google/gemma-2-")
13
  class GemmaLLM(AbstractLLMModel):
14
  def __init__(
15
  self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
modules/svs/espnet.py CHANGED
@@ -17,6 +17,7 @@ class ESPNetSVS(AbstractSVSModel):
17
  def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
18
  from espnet2.bin.svs_inference import SingingGenerate
19
  from espnet_model_zoo.downloader import ModelDownloader
 
20
  if device == "auto":
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
  self.device = device
@@ -35,7 +36,10 @@ class ESPNetSVS(AbstractSVSModel):
35
  phoneme_mappers = {
36
  "mandarin": pinyin_to_phonemes_opencpop,
37
  }
38
- elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
 
 
 
39
 
40
  def mandarin_mapper(pinyin: str) -> list[str]:
41
  phns = pinyin_to_phonemes_ace(pinyin)
@@ -53,7 +57,11 @@ class ESPNetSVS(AbstractSVSModel):
53
  phoneme_mappers = {}
54
  return phoneme_mappers
55
 
56
- def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
 
 
 
 
57
  if language not in self.phoneme_mappers:
58
  raise ValueError(f"Unsupported language: {language} for {self.model_id}")
59
  phoneme_mapper = self.phoneme_mappers[language]
@@ -88,6 +96,9 @@ class ESPNetSVS(AbstractSVSModel):
88
  notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
89
  phns.extend(phn_units)
90
  pre_phn = phn_units[-1]
 
 
 
91
 
92
  batch = {
93
  "score": (
@@ -99,13 +110,20 @@ class ESPNetSVS(AbstractSVSModel):
99
  return batch
100
 
101
  def synthesize(
102
- self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
 
 
 
 
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
106
  sid = np.array([int(speaker)])
107
  output_dict = self.model(batch, sids=sid)
108
- elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
 
 
 
109
  langs = {
110
  "mandarin": 2,
111
  "japanese": 1,
 
17
  def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
18
  from espnet2.bin.svs_inference import SingingGenerate
19
  from espnet_model_zoo.downloader import ModelDownloader
20
+
21
  if device == "auto":
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  self.device = device
 
36
  phoneme_mappers = {
37
  "mandarin": pinyin_to_phonemes_opencpop,
38
  }
39
+ elif self.model_id in [
40
+ "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
41
+ "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
42
+ ]:
43
 
44
  def mandarin_mapper(pinyin: str) -> list[str]:
45
  phns = pinyin_to_phonemes_ace(pinyin)
 
57
  phoneme_mappers = {}
58
  return phoneme_mappers
59
 
60
+ def _preprocess(
61
+ self,
62
+ score: list[tuple[float, float, str, int] | tuple[float, float, str, float]],
63
+ language: str,
64
+ ):
65
  if language not in self.phoneme_mappers:
66
  raise ValueError(f"Unsupported language: {language} for {self.model_id}")
67
  phoneme_mapper = self.phoneme_mappers[language]
 
96
  notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
97
  phns.extend(phn_units)
98
  pre_phn = phn_units[-1]
99
+ # add a silence at the end
100
+ notes.append((notes[-1][1], notes[-1][1] + 0.2, "AP", 0, "AP"))
101
+ phns.append("AP")
102
 
103
  batch = {
104
  "score": (
 
110
  return batch
111
 
112
  def synthesize(
113
+ self,
114
+ score: list[tuple[float, float, str, float] | tuple[float, float, str, int]],
115
+ language: str,
116
+ speaker: str,
117
+ **kwargs,
118
  ):
119
  batch = self._preprocess(score, language)
120
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
121
  sid = np.array([int(speaker)])
122
  output_dict = self.model(batch, sids=sid)
123
+ elif self.model_id in [
124
+ "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
125
+ "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
126
+ ]:
127
  langs = {
128
  "mandarin": 2,
129
  "japanese": 1,