admin committed on
Commit
b027a5c
·
1 Parent(s): ee49bef
Files changed (3) hide show
  1. app.py +57 -54
  2. requirements.txt +5 -3
  3. utils.py +50 -10
app.py CHANGED
@@ -9,18 +9,16 @@ import librosa.display
9
  import matplotlib.pyplot as plt
10
  from collections import Counter
11
  from model import EvalNet
12
- from utils import get_modelist, find_wav_files, embed_img
13
-
14
-
15
- TRANSLATE = {
16
- "m_bel": "Bel Canto, Male",
17
- "f_bel": "Bel Canto, Female",
18
- "m_folk": "Folk Singing, Male",
19
- "f_folk": "Folk Singing, Female",
20
- }
21
- CLASSES = list(TRANSLATE.keys())
22
- TEMP_DIR = "./__pycache__/tmp"
23
- SAMPLE_RATE = 22050
24
 
25
 
26
  def wav2mel(audio_path: str, width=1.6, topdb=40):
@@ -99,34 +97,38 @@ def most_common_element(input_list: list):
99
 
100
 
101
  def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
102
- if os.path.exists(folder_path):
103
- shutil.rmtree(folder_path)
 
 
 
104
 
105
- if not wav_path:
106
- return None, "Please input an audio!"
107
 
108
- spec = log_name.split("_")[-3]
109
- os.makedirs(folder_path, exist_ok=True)
110
- try:
111
  model = EvalNet(log_name, len(TRANSLATE)).model
112
  eval("wav2%s" % spec)(wav_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  except Exception as e:
115
- return None, f"{e}"
116
-
117
- outputs = []
118
- all_files = os.listdir(folder_path)
119
- for file_name in all_files:
120
- if file_name.lower().endswith(".jpg"):
121
- file_path = os.path.join(folder_path, file_name)
122
- input = embed_img(file_path)
123
- output: torch.Tensor = model(input)
124
- pred_id = torch.max(output.data, 1)[1]
125
- outputs.append(int(pred_id))
126
 
127
- max_count_item = most_common_element(outputs)
128
- shutil.rmtree(folder_path)
129
- return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]
130
 
131
 
132
  if __name__ == "__main__":
@@ -141,36 +143,37 @@ if __name__ == "__main__":
141
  gr.Interface(
142
  fn=infer,
143
  inputs=[
144
- gr.Audio(label="Upload a recording (>40dB)", type="filepath"),
145
- gr.Dropdown(choices=models, label="Select a model", value=models[0]),
146
  ],
147
  outputs=[
148
- gr.Textbox(label="Audio filename", show_copy_button=True),
149
- gr.Textbox(label="Singing method recognition", show_copy_button=True),
 
150
  ],
151
  examples=examples,
152
  cache_examples=False,
153
  allow_flagging="never",
154
- title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
155
  )
156
 
157
  gr.Markdown(
158
- """
159
- # Cite
160
- ```bibtex
161
- @article{Zhou-2025,
162
- author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
163
- title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
164
- journal = {Transactions of the International Society for Music Information Retrieval},
165
- volume = {8},
166
- number = {1},
167
- pages = {22--38},
168
- month = {Mar},
169
- year = {2025},
170
- url = {https://doi.org/10.5334/tismir.194},
171
- doi = {10.5334/tismir.194}
172
- }
173
- ```"""
174
  )
175
 
176
  demo.launch()
 
9
  import matplotlib.pyplot as plt
10
  from collections import Counter
11
  from model import EvalNet
12
+ from utils import (
13
+ get_modelist,
14
+ find_wav_files,
15
+ embed_img,
16
+ _L,
17
+ SAMPLE_RATE,
18
+ TEMP_DIR,
19
+ TRANSLATE,
20
+ CLASSES,
21
+ )
 
 
22
 
23
 
24
  def wav2mel(audio_path: str, width=1.6, topdb=40):
 
97
 
98
 
99
  def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
100
+ status = "Success"
101
+ filename = result = None
102
+ try:
103
+ if os.path.exists(folder_path):
104
+ shutil.rmtree(folder_path)
105
 
106
+ if not wav_path:
107
+ raise ValueError("请输入音频!")
108
 
109
+ spec = log_name.split("_")[-3]
110
+ os.makedirs(folder_path, exist_ok=True)
 
111
  model = EvalNet(log_name, len(TRANSLATE)).model
112
  eval("wav2%s" % spec)(wav_path)
113
+ outputs = []
114
+ all_files = os.listdir(folder_path)
115
+ for file_name in all_files:
116
+ if file_name.lower().endswith(".jpg"):
117
+ file_path = os.path.join(folder_path, file_name)
118
+ input = embed_img(file_path)
119
+ output: torch.Tensor = model(input)
120
+ pred_id = torch.max(output.data, 1)[1]
121
+ outputs.append(int(pred_id))
122
+
123
+ max_count_item = most_common_element(outputs)
124
+ shutil.rmtree(folder_path)
125
+ filename = os.path.basename(wav_path)
126
+ result = TRANSLATE[CLASSES[max_count_item]]
127
 
128
  except Exception as e:
129
+ status = f"{e}"
 
 
 
 
 
 
 
 
 
 
130
 
131
+ return status, filename, result
 
 
132
 
133
 
134
  if __name__ == "__main__":
 
143
  gr.Interface(
144
  fn=infer,
145
  inputs=[
146
+ gr.Audio(label=_L("上传录音 (>40dB)"), type="filepath"),
147
+ gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),
148
  ],
149
  outputs=[
150
+ gr.Textbox(label=_L("状态栏"), show_copy_button=True),
151
+ gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
152
+ gr.Textbox(label=_L("唱法识别"), show_copy_button=True),
153
  ],
154
  examples=examples,
155
  cache_examples=False,
156
  allow_flagging="never",
157
+ title=_L("建议录音时长保持在 5s 左右, 过长会影响识别效率"),
158
  )
159
 
160
  gr.Markdown(
161
+ f"# {_L('引用')}"
162
+ + """
163
+ ```bibtex
164
+ @article{Zhou-2025,
165
+ author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
166
+ title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
167
+ journal = {Transactions of the International Society for Music Information Retrieval},
168
+ volume = {8},
169
+ number = {1},
170
+ pages = {22--38},
171
+ month = {Mar},
172
+ year = {2025},
173
+ url = {https://doi.org/10.5334/tismir.194},
174
+ doi = {10.5334/tismir.194}
175
+ }
176
+ ```"""
177
  )
178
 
179
  demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
- torch
2
- pillow
 
 
3
  librosa
4
  matplotlib
5
- torchvision
 
1
+ torch==2.6.0+cu118
2
+ -f https://download.pytorch.org/whl/torch
3
+ torchvision==0.21.0+cu118
4
+ -f https://download.pytorch.org/whl/torchvision
5
  librosa
6
  matplotlib
7
+ modelscope[framework]==1.21.0
utils.py CHANGED
@@ -1,10 +1,53 @@
1
  import os
2
  import torch
3
  import torchvision.transforms as transforms
4
- from huggingface_hub import snapshot_download
 
5
  from PIL import Image
6
 
7
- MODEL_DIR = snapshot_download("ccmusic-database/bel_canto", cache_dir="./__pycache__")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def toCUDA(x):
@@ -27,19 +70,16 @@ def find_wav_files(folder_path=f"{MODEL_DIR}/examples"):
27
 
28
 
29
  def get_modelist(model_dir=MODEL_DIR, assign_model=""):
30
- try:
31
- entries = os.listdir(model_dir)
32
- except OSError as e:
33
- print(f"Cannot access {model_dir}: {e}")
34
- return
35
-
36
  output = []
37
- for entry in entries:
 
38
  full_path = os.path.join(model_dir, entry)
 
39
  if entry == ".git" or entry == "examples":
40
- print(f"Skip .git / examples dir: {full_path}")
41
  continue
42
 
 
43
  if os.path.isdir(full_path):
44
  model = os.path.basename(full_path)
45
  if assign_model and assign_model.lower() in model:
 
1
  import os
2
  import torch
3
  import torchvision.transforms as transforms
4
+ import huggingface_hub
5
+ import modelscope
6
  from PIL import Image
7
 
8
+
9
+ EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
10
+
11
+ ZH2EN = {
12
+ "上传录音 (>40dB)": "Upload a recording (>40dB)",
13
+ "选择模型": "Select a model",
14
+ "状态栏": "Status",
15
+ "音频文件名": "Audio filename",
16
+ "唱法识别": "Singing method recognition",
17
+ "建议录音时长保持在 5s 左右, 过长会影响识别效率": "It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
18
+ "引用": "Cite",
19
+ "男声 & 美声唱法": "Bel Canto, Male",
20
+ "女声 & 美声唱法": "Bel Canto, Female",
21
+ "男声 & 民族唱法": "Folk Singing, Male",
22
+ "女声 & 民族唱法": "Folk Singing, Female",
23
+ }
24
+
25
+ MODEL_DIR = (
26
+ huggingface_hub.snapshot_download(
27
+ "ccmusic-database/bel_canto",
28
+ cache_dir="./__pycache__",
29
+ )
30
+ if EN_US
31
+ else modelscope.snapshot_download(
32
+ "ccmusic-database/bel_canto",
33
+ cache_dir="./__pycache__",
34
+ )
35
+ )
36
+
37
+
38
+ def _L(zh_txt: str):
39
+ return ZH2EN[zh_txt] if EN_US else zh_txt
40
+
41
+
42
+ TRANSLATE = {
43
+ "m_bel": _L("男声 & 美声唱法"),
44
+ "f_bel": _L("女声 & 美声唱法"),
45
+ "m_folk": _L("男声 & 民族唱法"),
46
+ "f_folk": _L("女声 & 民族唱法"),
47
+ }
48
+ CLASSES = list(TRANSLATE.keys())
49
+ TEMP_DIR = "./__pycache__/tmp"
50
+ SAMPLE_RATE = 22050
51
 
52
 
53
  def toCUDA(x):
 
70
 
71
 
72
  def get_modelist(model_dir=MODEL_DIR, assign_model=""):
 
 
 
 
 
 
73
  output = []
74
+ for entry in os.listdir(model_dir):
75
+ # 获取完整路径
76
  full_path = os.path.join(model_dir, entry)
77
+ # Skip the '.git' and 'examples' folders
78
  if entry == ".git" or entry == "examples":
79
+ print(f"跳过 .git examples 文件夹: {full_path}")
80
  continue
81
 
82
+ # 检查条目是文件还是目录
83
  if os.path.isdir(full_path):
84
  model = os.path.basename(full_path)
85
  if assign_model and assign_model.lower() in model: