jhansss committed
Commit 192011d · 2 Parent(s): 2e81678 48162db

Merge branch 'refactor' into hf

README.md CHANGED
@@ -109,41 +109,23 @@ The system supports multiple preset characters:
 
 ```
 SingingSDS/
-├── cli.py                    # Command line interface
-├── interface.py              # Gradio interface
-├── pipeline.py               # Core processing pipeline
-├── app.py                    # Web application entry
-├── requirements.txt          # Python dependencies
-├── config/                   # Configuration files
-│   ├── cli/                  # CLI-specific configuration
-│   └── interface/            # Interface-specific configuration
-├── modules/                  # Core modules
-│   ├── asr.py                # Speech recognition module
-│   ├── llm.py                # Large language model module
-│   ├── melody.py             # Melody control module
-│   ├── svs/                  # Singing voice synthesis modules
-│   │   ├── base.py           # Base SVS class
-│   │   ├── espnet.py         # ESPnet SVS implementation
-│   │   ├── registry.py       # SVS model registry
-│   │   └── __init__.py       # SVS module initialization
-│   └── utils/                # Utility modules
-│       ├── g2p.py            # Grapheme-to-phoneme conversion
-│       ├── text_normalize.py # Text normalization
-│       └── resources/        # Utility resources
-├── characters/               # Character definitions
-│   ├── base.py               # Base character class
-│   ├── Limei.py              # Limei character definition
-│   ├── Yaoyin.py             # Yaoyin character definition
-│   └── __init__.py           # Character module initialization
-├── evaluation/               # Evaluation modules
-│   └── svs_eval.py           # SVS evaluation metrics
-├── data/                     # Data directory
-│   ├── kising/               # Kising dataset
-│   └── touhou/               # Touhou dataset
-├── resources/                # Project resources
-├── data_handlers/            # Data handling utilities
-├── assets/                   # Static assets
-└── tests/                    # Test files
+├── app.py, cli.py            # Entry points (demo app & CLI)
+├── pipeline.py               # Main orchestration pipeline
+├── interface.py              # Gradio interface
+├── characters/               # Virtual character definitions
+├── modules/                  # Core modules
+│   ├── asr/                  # ASR models (Whisper, Paraformer)
+│   ├── llm/                  # LLMs (Gemini, LLaMA, etc.)
+│   ├── svs/                  # Singing voice synthesis (ESPnet)
+│   └── utils/                # G2P, text normalization, resources
+├── config/                   # YAML configuration files
+├── data/                     # Dataset metadata and length info
+├── data_handlers/            # Parsers for KiSing, Touhou, etc.
+├── evaluation/               # Evaluation metrics
+├── resources/                # Singer embeddings, phoneme dicts, MIDI
+├── assets/                   # Character visuals
+├── tests/                    # Unit tests and sample audios
+└── README.md, requirements.txt
 ```
 
 ## Contributing
characters/Yaoyin.py CHANGED
@@ -10,7 +10,7 @@ def get_character():
     你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
 
     性格特征:洒脱自由、亲切随和、求知若渴、直率倔强
-    说话风格:语气轻快,说话随意,偶尔带点山野方言(如"哩""哟")。日常聊天直接、清楚.
+    说话风格:语气轻快,说话随意,偶尔带点山野方言(如"哩""哟")。日常聊天直接、清楚。
     人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
     过往经历:
     (1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
config/cli/yaoyin_test.yaml CHANGED
@@ -1,11 +1,10 @@
-asr_model: openai/whisper-small
-llm_model: google/gemma-2-2b
-svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+asr_model: openai/whisper-medium
+llm_model: meta-llama/Llama-3.1-8B-Instruct
+svs_model: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
 melody_source: sample-lyric-kising
 language: mandarin
-max_sentences: 1
 prompt_template_character: Yaoyin
-speaker: 9
+speaker: resources/singer/singer_embedding_ace-2.npy
 cache_dir: .cache
 
 track_latency: True
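
For quick verification, a minimal sketch of reading the updated config with PyYAML; the loading code is illustrative and not part of this commit, the field names come from the file above.

```python
import yaml

# Illustrative check of the updated CLI config (PyYAML usage is an assumption,
# not part of this commit); the speaker field now points to a singer-embedding
# .npy file instead of an integer speaker ID.
with open("config/cli/yaoyin_test.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["svs_model"])  # espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
print(cfg["speaker"])    # resources/singer/singer_embedding_ace-2.npy
```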
data_handlers/kising.py CHANGED
@@ -11,7 +11,7 @@ class KiSing(MelodyDatasetHandler):
         from datasets import load_dataset
 
         song_db = load_dataset(
-            "jhansss/kising_score_segments", cache_dir=cache_dir, split="train"
+            "espnet/kising_score_segments", cache_dir=cache_dir, split="train"
         ).to_pandas()
         song_db.set_index("segment_id", inplace=True)
         assert (
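
The only change here is the dataset ID: the KiSing score segments are now loaded from the `espnet` organization. A standalone usage sketch mirroring the handler's loading code (the cache path is illustrative):

```python
from datasets import load_dataset

# Load the relocated KiSing score segments the same way the handler does;
# ".cache" is an illustrative cache directory.
song_db = load_dataset(
    "espnet/kising_score_segments", cache_dir=".cache", split="train"
).to_pandas()
song_db.set_index("segment_id", inplace=True)
print(song_db.head())
```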
evaluation/svs_eval.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
 def init_singmos():
     print("[Init] Loading SingMOS...")
     return torch.hub.load(
-        "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True
+        "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True, cache_dir="cache"
     )
 
 
interface.py CHANGED
@@ -202,6 +202,7 @@ class GradioInterface:
 
     def update_melody_source(self, melody_source):
         self.current_melody_source = melody_source
+        self.pipeline.set_melody_controller(melody_source)
         return gr.update(value=self.current_melody_source)
 
     def update_voice(self, voice):
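
For context, a hypothetical sketch of the pipeline-side method the interface now calls, assuming the pipeline keeps a mapping from melody-source names to controller objects (class and attribute names below are illustrative, not this repo's `pipeline.py`):

```python
# Hypothetical pipeline-side counterpart of the new interface call; names are
# illustrative, not taken from pipeline.py.
class Pipeline:
    def __init__(self, melody_controllers: dict[str, object]):
        self.melody_controllers = melody_controllers
        self.melody_controller = None

    def set_melody_controller(self, melody_source: str) -> None:
        # Keep the pipeline in sync with the melody source chosen in the UI.
        self.melody_controller = self.melody_controllers[melody_source]
```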
modules/llm/gemma.py CHANGED
@@ -9,7 +9,7 @@ from .registry import register_llm_model
 hf_token = os.getenv("HF_TOKEN")
 
 
-@register_llm_model("google/gemma-")
+@register_llm_model("google/gemma-2-")
 class GemmaLLM(AbstractLLMModel):
     def __init__(
         self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
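
Narrowing the registry key from `google/gemma-` to `google/gemma-2-` presumably restricts this class to Gemma 2 model IDs. A minimal sketch of prefix-based registration under that assumption (illustrative, not the repo's `registry.py`):

```python
# Illustrative prefix-matching registry; not the repo's actual registry.py.
_LLM_REGISTRY: dict[str, type] = {}


def register_llm_model(prefix: str):
    def decorator(cls: type) -> type:
        _LLM_REGISTRY[prefix] = cls
        return cls
    return decorator


def resolve_llm(model_id: str) -> type:
    # Longest matching prefix wins, so "google/gemma-2-2b" resolves to the
    # class registered under "google/gemma-2-".
    matches = [p for p in _LLM_REGISTRY if model_id.startswith(p)]
    if not matches:
        raise KeyError(f"No registered LLM for {model_id}")
    return _LLM_REGISTRY[max(matches, key=len)]
```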
modules/svs/espnet.py CHANGED
@@ -17,6 +17,7 @@ class ESPNetSVS(AbstractSVSModel):
     def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
         from espnet2.bin.svs_inference import SingingGenerate
         from espnet_model_zoo.downloader import ModelDownloader
+
         if device == "auto":
             device = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = device
@@ -35,7 +36,10 @@ class ESPNetSVS(AbstractSVSModel):
             phoneme_mappers = {
                 "mandarin": pinyin_to_phonemes_opencpop,
             }
-        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+        elif self.model_id in [
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
+        ]:
 
             def mandarin_mapper(pinyin: str) -> list[str]:
                 phns = pinyin_to_phonemes_ace(pinyin)
@@ -53,7 +57,11 @@ class ESPNetSVS(AbstractSVSModel):
             phoneme_mappers = {}
         return phoneme_mappers
 
-    def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
+    def _preprocess(
+        self,
+        score: list[tuple[float, float, str, int] | tuple[float, float, str, float]],
+        language: str,
+    ):
         if language not in self.phoneme_mappers:
             raise ValueError(f"Unsupported language: {language} for {self.model_id}")
         phoneme_mapper = self.phoneme_mappers[language]
@@ -88,6 +96,9 @@ class ESPNetSVS(AbstractSVSModel):
             notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
             phns.extend(phn_units)
             pre_phn = phn_units[-1]
+        # add a silence at the end
+        notes.append((notes[-1][1], notes[-1][1] + 0.2, "AP", 0, "AP"))
+        phns.append("AP")
 
         batch = {
             "score": (
@@ -99,13 +110,20 @@ class ESPNetSVS(AbstractSVSModel):
         return batch
 
     def synthesize(
-        self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
+        self,
+        score: list[tuple[float, float, str, float] | tuple[float, float, str, int]],
+        language: str,
+        speaker: str,
+        **kwargs,
     ):
         batch = self._preprocess(score, language)
         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
             sid = np.array([int(speaker)])
             output_dict = self.model(batch, sids=sid)
-        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+        elif self.model_id in [
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
+        ]:
             langs = {
                 "mandarin": 2,
                 "japanese": 1,