jhansss committed
Commit 192011d · 2 Parent(s): 2e81678 48162db

Merge branch 'refactor' into hf

README.md CHANGED
@@ -109,41 +109,23 @@ The system supports multiple preset characters:
 
 ```
 SingingSDS/
-├── cli.py                    # Command line interface
-├── interface.py              # Gradio interface
-├── pipeline.py               # Core processing pipeline
-├── app.py                    # Web application entry
-├── requirements.txt          # Python dependencies
-├── config/                   # Configuration files
-│   ├── cli/                  # CLI-specific configuration
-│   └── interface/            # Interface-specific configuration
-├── modules/                  # Core modules
-│   ├── asr.py                # Speech recognition module
-│   ├── llm.py                # Large language model module
-│   ├── melody.py             # Melody control module
-│   ├── svs/                  # Singing voice synthesis modules
-│   │   ├── base.py           # Base SVS class
-│   │   ├── espnet.py         # ESPnet SVS implementation
-│   │   ├── registry.py       # SVS model registry
-│   │   └── __init__.py       # SVS module initialization
-│   └── utils/                # Utility modules
-│       ├── g2p.py            # Grapheme-to-phoneme conversion
-│       ├── text_normalize.py # Text normalization
-│       └── resources/        # Utility resources
-├── characters/               # Character definitions
-│   ├── base.py               # Base character class
-│   ├── Limei.py              # Limei character definition
-│   ├── Yaoyin.py             # Yaoyin character definition
-│   └── __init__.py           # Character module initialization
-├── evaluation/               # Evaluation modules
-│   └── svs_eval.py           # SVS evaluation metrics
-├── data/                     # Data directory
-│   ├── kising/               # Kising dataset
-│   └── touhou/               # Touhou dataset
-├── resources/                # Project resources
-├── data_handlers/            # Data handling utilities
-├── assets/                   # Static assets
-└── tests/                    # Test files
+├── app.py, cli.py            # Entry points (demo app & CLI)
+├── pipeline.py               # Main orchestration pipeline
+├── interface.py              # Gradio interface
+├── characters/               # Virtual character definitions
+├── modules/                  # Core modules
+│   ├── asr/                  # ASR models (Whisper, Paraformer)
+│   ├── llm/                  # LLMs (Gemini, LLaMA, etc.)
+│   ├── svs/                  # Singing voice synthesis (ESPnet)
+│   └── utils/                # G2P, text normalization, resources
+├── config/                   # YAML configuration files
+├── data/                     # Dataset metadata and length info
+├── data_handlers/            # Parsers for KiSing, Touhou, etc.
+├── evaluation/               # Evaluation metrics
+├── resources/                # Singer embeddings, phoneme dicts, MIDI
+├── assets/                   # Character visuals
+├── tests/                    # Unit tests and sample audios
+└── README.md, requirements.txt
 ```
 
 ## Contributing
characters/Yaoyin.py CHANGED
@@ -10,7 +10,7 @@ def get_character():
     你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
 
     性格特征:洒脱自由、亲切随和、求知若渴、直率倔强
-    说话风格:语气轻快,说话随意,偶尔带点山野方言(如"哩""哟")。日常聊天直接、清楚.
+    说话风格:语气轻快,说话随意,偶尔带点山野方言(如"哩""哟")。日常聊天直接、清楚。
     人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
     过往经历:
     (1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
config/cli/yaoyin_test.yaml CHANGED
@@ -1,11 +1,10 @@
-asr_model: openai/whisper-small
-llm_model: google/gemma-2-2b
-svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+asr_model: openai/whisper-medium
+llm_model: meta-llama/Llama-3.1-8B-Instruct
+svs_model: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
 melody_source: sample-lyric-kising
 language: mandarin
-max_sentences: 1
 prompt_template_character: Yaoyin
-speaker: 9
+speaker: resources/singer/singer_embedding_ace-2.npy
 cache_dir: .cache
 
 track_latency: True
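
For quick verification, a minimal sketch of reading the updated config with PyYAML; the loading code is illustrative and not part of this commit, the field names come from the file above.

```python
import yaml

# Illustrative check of the updated CLI config (PyYAML usage is an assumption,
# not part of this commit); the speaker field now points to a singer-embedding
# .npy file instead of an integer speaker ID.
with open("config/cli/yaoyin_test.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["svs_model"])  # espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
print(cfg["speaker"])    # resources/singer/singer_embedding_ace-2.npy
```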
data_handlers/kising.py CHANGED
@@ -11,7 +11,7 @@ class KiSing(MelodyDatasetHandler):
         from datasets import load_dataset
 
         song_db = load_dataset(
-            "jhansss/kising_score_segments", cache_dir=cache_dir, split="train"
+            "espnet/kising_score_segments", cache_dir=cache_dir, split="train"
         ).to_pandas()
         song_db.set_index("segment_id", inplace=True)
         assert (
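
The only change here is the dataset ID: the KiSing score segments are now loaded from the `espnet` organization. A standalone usage sketch mirroring the handler's loading code (the cache path is illustrative):

```python
from datasets import load_dataset

# Load the relocated KiSing score segments the same way the handler does;
# ".cache" is an illustrative cache directory.
song_db = load_dataset(
    "espnet/kising_score_segments", cache_dir=".cache", split="train"
).to_pandas()
song_db.set_index("segment_id", inplace=True)
print(song_db.head())
```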
evaluation/svs_eval.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
 def init_singmos():
     print("[Init] Loading SingMOS...")
     return torch.hub.load(
-        "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True
+        "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True, cache_dir="cache"
     )
 
 
interface.py CHANGED
@@ -202,6 +202,7 @@ class GradioInterface:
 
     def update_melody_source(self, melody_source):
         self.current_melody_source = melody_source
+        self.pipeline.set_melody_controller(melody_source)
         return gr.update(value=self.current_melody_source)
 
     def update_voice(self, voice):
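
For context, a hypothetical sketch of the pipeline-side method the interface now calls, assuming the pipeline keeps a mapping from melody-source names to controller objects (class and attribute names below are illustrative, not this repo's `pipeline.py`):

```python
# Hypothetical pipeline-side counterpart of the new interface call; names are
# illustrative, not taken from pipeline.py.
class Pipeline:
    def __init__(self, melody_controllers: dict[str, object]):
        self.melody_controllers = melody_controllers
        self.melody_controller = None

    def set_melody_controller(self, melody_source: str) -> None:
        # Keep the pipeline in sync with the melody source chosen in the UI.
        self.melody_controller = self.melody_controllers[melody_source]
```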
modules/llm/gemma.py CHANGED
@@ -9,7 +9,7 @@ from .registry import register_llm_model
 hf_token = os.getenv("HF_TOKEN")
 
 
-@register_llm_model("google/gemma-")
+@register_llm_model("google/gemma-2-")
 class GemmaLLM(AbstractLLMModel):
     def __init__(
         self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
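
Narrowing the registry key from `google/gemma-` to `google/gemma-2-` presumably restricts this class to Gemma 2 model IDs. A minimal sketch of prefix-based registration under that assumption (illustrative, not the repo's `registry.py`):

```python
# Illustrative prefix-matching registry; not the repo's actual registry.py.
_LLM_REGISTRY: dict[str, type] = {}


def register_llm_model(prefix: str):
    def decorator(cls: type) -> type:
        _LLM_REGISTRY[prefix] = cls
        return cls
    return decorator


def resolve_llm(model_id: str) -> type:
    # Longest matching prefix wins, so "google/gemma-2-2b" resolves to the
    # class registered under "google/gemma-2-".
    matches = [p for p in _LLM_REGISTRY if model_id.startswith(p)]
    if not matches:
        raise KeyError(f"No registered LLM for {model_id}")
    return _LLM_REGISTRY[max(matches, key=len)]
```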
modules/svs/espnet.py CHANGED
@@ -17,6 +17,7 @@ class ESPNetSVS(AbstractSVSModel):
     def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
         from espnet2.bin.svs_inference import SingingGenerate
         from espnet_model_zoo.downloader import ModelDownloader
+
         if device == "auto":
             device = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = device
@@ -35,7 +36,10 @@ class ESPNetSVS(AbstractSVSModel):
             phoneme_mappers = {
                 "mandarin": pinyin_to_phonemes_opencpop,
             }
-        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+        elif self.model_id in [
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
+        ]:
 
             def mandarin_mapper(pinyin: str) -> list[str]:
                 phns = pinyin_to_phonemes_ace(pinyin)
@@ -53,7 +57,11 @@ class ESPNetSVS(AbstractSVSModel):
             phoneme_mappers = {}
         return phoneme_mappers
 
-    def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
+    def _preprocess(
+        self,
+        score: list[tuple[float, float, str, int] | tuple[float, float, str, float]],
+        language: str,
+    ):
         if language not in self.phoneme_mappers:
             raise ValueError(f"Unsupported language: {language} for {self.model_id}")
         phoneme_mapper = self.phoneme_mappers[language]
@@ -88,6 +96,9 @@ class ESPNetSVS(AbstractSVSModel):
             notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
             phns.extend(phn_units)
             pre_phn = phn_units[-1]
+        # add a silence at the end
+        notes.append((notes[-1][1], notes[-1][1] + 0.2, "AP", 0, "AP"))
+        phns.append("AP")
 
         batch = {
             "score": (
@@ -99,13 +110,20 @@ class ESPNetSVS(AbstractSVSModel):
         return batch
 
     def synthesize(
-        self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
+        self,
+        score: list[tuple[float, float, str, float] | tuple[float, float, str, int]],
+        language: str,
+        speaker: str,
+        **kwargs,
     ):
         batch = self._preprocess(score, language)
         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
             sid = np.array([int(speaker)])
             output_dict = self.model(batch, sids=sid)
-        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+        elif self.model_id in [
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
+            "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg",
+        ]:
             langs = {
                 "mandarin": 2,
                 "japanese": 1,