deep-div committed on
Commit
df4f8a7
·
verified ·
1 Parent(s): fa5d634

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -274
app.py CHANGED
@@ -1,274 +1,230 @@
1
- #!/usr/bin/env python3
2
- #
3
- # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
4
- #
5
- # See LICENSE for clarification regarding multiple authors
6
- #
7
- # Licensed under the Apache License, Version 2.0 (the "License");
8
- # you may not use this file except in compliance with the License.
9
- # You may obtain a copy of the License at
10
- #
11
- # http://www.apache.org/licenses/LICENSE-2.0
12
- #
13
- # Unless required by applicable law or agreed to in writing, software
14
- # distributed under the License is distributed on an "AS IS" BASIS,
15
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
- # See the License for the specific language governing permissions and
17
- # limitations under the License.
18
-
19
- # References:
20
- # https://gradio.app/docs/#dropdown
21
-
22
- import os
23
- import time
24
- import uuid
25
- from datetime import datetime
26
-
27
- import gradio as gr
28
- import soundfile as sf
29
-
30
- from model import get_pretrained_model, language_to_models
31
-
32
-
33
- def MyPrint(s):
34
- now = datetime.now()
35
- date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
36
- print(f"{date_time}: {s}")
37
-
38
-
39
- title = "# Next-gen Kaldi: Text-to-speech (TTS)"
40
-
41
- description = """
42
- This space shows how to convert text to speech with Next-gen Kaldi.
43
-
44
- It is running on CPU within a docker container provided by Hugging Face.
45
-
46
- See more information by visiting the following links:
47
-
48
- - <https://github.com/k2-fsa/sherpa-onnx>
49
-
50
- If you want to deploy it locally, please see
51
- <https://k2-fsa.github.io/sherpa/>
52
-
53
- If you want to use Android APKs, please see
54
- <https://k2-fsa.github.io/sherpa/onnx/tts/apk.html>
55
-
56
- If you want to use Android text-to-speech engine APKs, please see
57
- <https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html>
58
-
59
- If you want to download an all-in-one exe for Windows, please see
60
- <https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models>
61
-
62
- """
63
-
64
- # css style is copied from
65
- # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
66
- css = """
67
- .result {display:flex;flex-direction:column}
68
- .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
69
- .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
70
- .result_item_error {background-color:#ff7070;color:white;align-self:start}
71
- """
72
-
73
- examples = [
74
- [
75
- "Chinese (Mandarin, 普通话)",
76
- "csukuangfj/matcha-icefall-zh-baker|1 speaker",
77
- "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。",
78
- 0,
79
- 1.0,
80
- ],
81
- [
82
- "Chinese (Mandarin, 普通话)",
83
- "csukuangfj/vits-zh-hf-fanchen-wnj|1 speaker",
84
- "在一个阳光明媚的夏天,小马、小羊和小狗它们一块儿在广阔的草地上,嬉戏玩耍,这时小猴来了,还带着它心爱的足球活蹦乱跳地跑前、跑后教小马、小羊、小狗踢足球。",
85
- 0,
86
- 1.0,
87
- ],
88
- [
89
- "Chinese (Mandarin, 普通话)",
90
- "csukuangfj/vits-zh-hf-fanchen-C|187 speakers",
91
- '小米的使命是,始终坚持做"感动人心、价格厚道"的好产品,让全球每个人都能享受科技带来的美好生活。',
92
- 0,
93
- 1.0,
94
- ],
95
- ["Min-nan (闽南话)", "csukuangfj/vits-mms-nan", "ài piaǸ chiah ē iaN̂", 0, 1.0],
96
- ["Thai", "csukuangfj/vits-mms-tha", "ฉันรักคุณ", 0, 1.0],
97
- [
98
- "Chinese (Mandarin, 普通话)",
99
- "csukuangfj/sherpa-onnx-vits-zh-ll|5 speakers",
100
- "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。",
101
- 2,
102
- 1.0,
103
- ],
104
- ]
105
-
106
-
107
- def update_model_dropdown(language: str):
108
- if language in language_to_models:
109
- choices = language_to_models[language]
110
- return gr.Dropdown(
111
- choices=choices,
112
- value=choices[0],
113
- interactive=True,
114
- )
115
-
116
- raise ValueError(f"Unsupported language: {language}")
117
-
118
-
119
- def build_html_output(s: str, style: str = "result_item_success"):
120
- return f"""
121
- <div class='result'>
122
- <div class='result_item {style}'>
123
- {s}
124
- </div>
125
- </div>
126
- """
127
-
128
-
129
- def process(language: str, repo_id: str, text: str, sid: str, speed: float):
130
- MyPrint(f"Input text: {text}. sid: {sid}, speed: {speed}")
131
- sid = int(sid)
132
- tts = get_pretrained_model(repo_id, speed)
133
-
134
- start = time.time()
135
- audio = tts.generate(text, sid=sid)
136
- end = time.time()
137
-
138
- if len(audio.samples) == 0:
139
- raise ValueError(
140
- "Error in generating audios. Please read previous error messages."
141
- )
142
-
143
- duration = len(audio.samples) / audio.sample_rate
144
-
145
- elapsed_seconds = end - start
146
- rtf = elapsed_seconds / duration
147
-
148
- info = f"""
149
- Wave duration : {duration:.3f} s <br/>
150
- Processing time: {elapsed_seconds:.3f} s <br/>
151
- RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
152
- """
153
-
154
- MyPrint(info)
155
- MyPrint(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")
156
-
157
- filename = str(uuid.uuid4())
158
- filename = f"{filename}.wav"
159
- sf.write(
160
- filename,
161
- audio.samples,
162
- samplerate=audio.sample_rate,
163
- subtype="PCM_16",
164
- )
165
-
166
- return filename, build_html_output(info)
167
-
168
-
169
- demo = gr.Blocks(css=css)
170
-
171
-
172
- with demo:
173
- gr.Markdown(title)
174
- language_choices = list(language_to_models.keys())
175
-
176
- language_radio = gr.Radio(
177
- label="Language",
178
- choices=language_choices,
179
- value=language_choices[0],
180
- )
181
-
182
- model_dropdown = gr.Dropdown(
183
- choices=language_to_models[language_choices[0]],
184
- label="Select a model",
185
- value=language_to_models[language_choices[0]][0],
186
- )
187
-
188
- language_radio.change(
189
- update_model_dropdown,
190
- inputs=language_radio,
191
- outputs=model_dropdown,
192
- )
193
-
194
- with gr.Tabs():
195
- with gr.TabItem("Please input your text"):
196
- input_text = gr.Textbox(
197
- label="Input text",
198
- info="Your text",
199
- lines=3,
200
- placeholder="Please input your text here",
201
- )
202
-
203
- input_sid = gr.Textbox(
204
- label="Speaker ID",
205
- info="Speaker ID",
206
- lines=1,
207
- max_lines=1,
208
- value="0",
209
- placeholder="Speaker ID. Valid only for mult-speaker model",
210
- )
211
-
212
- input_speed = gr.Slider(
213
- minimum=0.1,
214
- maximum=10,
215
- value=1,
216
- step=0.1,
217
- label="Speed (larger->faster; smaller->slower)",
218
- )
219
-
220
- input_button = gr.Button("Submit")
221
-
222
- output_audio = gr.Audio(label="Output")
223
-
224
- output_info = gr.HTML(label="Info")
225
-
226
- gr.Examples(
227
- examples=examples,
228
- fn=process,
229
- inputs=[
230
- language_radio,
231
- model_dropdown,
232
- input_text,
233
- input_sid,
234
- input_speed,
235
- ],
236
- outputs=[
237
- output_audio,
238
- output_info,
239
- ],
240
- )
241
-
242
- input_button.click(
243
- process,
244
- inputs=[
245
- language_radio,
246
- model_dropdown,
247
- input_text,
248
- input_sid,
249
- input_speed,
250
- ],
251
- outputs=[
252
- output_audio,
253
- output_info,
254
- ],
255
- )
256
-
257
- gr.Markdown(description)
258
-
259
-
260
- def download_espeak_ng_data():
261
- os.system(
262
- """
263
- cd /tmp
264
- wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
265
- tar xf espeak-ng-data.tar.bz2
266
- """
267
- )
268
-
269
-
270
- if __name__ == "__main__":
271
- download_espeak_ng_data()
272
- formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
273
-
274
- demo.launch()
 
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import time
5
+ import uuid
6
+ from datetime import datetime
7
+
8
+ import gradio as gr
9
+ import soundfile as sf
10
+
11
+ from model import get_pretrained_model, language_to_models
12
+
13
+
14
def MyPrint(s):
    """Print ``s`` to stdout, prefixed with the current local timestamp.

    The prefix is formatted as ``YYYY-MM-DD HH:MM:SS.ffffff`` and is
    separated from the message by ``": "``.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    print("{}: {}".format(stamp, s))
18
+
19
+
20
# Markdown heading shown at the top of the page.
title = "# Next-gen Kaldi: Text-to-speech (TTS)"

# Markdown blurb shown at the bottom of the page.
description = """
This space shows how to convert text to speech with Next-gen Kaldi (Sherpa-ONNX).

Running on CPU in a Docker container from Hugging Face.

Useful links:

- <https://github.com/k2-fsa/sherpa-onnx>
- <https://k2-fsa.github.io/sherpa/>
- <https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models>
"""

# Styles for the result panel produced by build_html_output().
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

# Example rows in the order: language, model, text, speaker ID, speed.
# Every example uses English, speaker 0 and normal speed, so only the
# model and the sentence vary.
examples = [
    ["English", model, sentence, 0, 1.0]
    for model, sentence in (
        (
            "csukuangfj/vits-piper-en_US-libritts-rhea|1 speaker",
            "Hello! This is a test of the text-to-speech demo powered by Sherpa-ONNX.",
        ),
        (
            "csukuangfj/vits-piper-en_GB-alba|1 speaker",
            "Welcome to the world of AI voice generation. Enjoy the experience.",
        ),
        (
            "csukuangfj/vits-piper-en_US-lessac-medium|1 speaker",
            "Real-time voice synthesis helps developers build smarter applications.",
        ),
    )
]
64
+
65
+
66
def update_model_dropdown(language: str):
    """Build the model dropdown for the newly selected ``language``.

    The returned ``gr.Dropdown`` lists the models registered for the
    language in ``language_to_models``, pre-selects the first one and is
    interactive.

    Raises:
        ValueError: If ``language`` is not a key of ``language_to_models``.
    """
    if language not in language_to_models:
        raise ValueError(f"Unsupported language: {language}")

    models = language_to_models[language]
    return gr.Dropdown(choices=models, value=models[0], interactive=True)
76
+
77
+
78
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap ``s`` in the nested ``<div>`` markup used for the result panel.

    ``s`` is inserted verbatim (it is not HTML-escaped, so it may itself
    contain markup). ``style`` is added as an extra CSS class next to
    ``result_item`` on the inner ``<div>``.
    """
    markup = f"""
    <div class='result'>
        <div class='result_item {style}'>
          {s}
        </div>
    </div>
    """
    return markup
86
+
87
+
88
def process(language: str, repo_id: str, text: str, sid: str, speed: float):
    """Synthesize ``text`` into a WAV file and report timing statistics.

    Args:
        language: Language selected in the UI. It is not used by this
            function; it is accepted because the UI passes it along with
            the other inputs.
        repo_id: Model identifier handed to ``get_pretrained_model``.
        text: Text to synthesize. Must contain non-whitespace characters.
        sid: Speaker ID. Anything ``int()`` accepts (the text box supplies
            a string, the examples supply an int).
        speed: Speech speed handed to ``get_pretrained_model``.

    Returns:
        A ``(filename, html)`` tuple: the path of the generated ``.wav``
        file (written to the current working directory under a random
        UUID name) and an HTML snippet with duration, processing time and
        real-time factor.

    Raises:
        ValueError: If ``text`` is empty, ``sid`` is not an integer, or
            the model returns no audio samples.
    """
    # Reject bad input before loading a model, so the user gets a precise
    # message instead of a late, generic failure.
    if not text or not text.strip():
        raise ValueError("Input text is empty. Please input your text.")

    try:
        speaker_id = int(sid)
    except (TypeError, ValueError) as err:
        raise ValueError(f"Speaker ID must be an integer, got {sid!r}") from err

    MyPrint(f"Input text: {text}. sid: {sid}, speed: {speed}")
    sid = speaker_id
    tts = get_pretrained_model(repo_id, speed)

    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by wall-clock adjustments.
    start = time.perf_counter()
    audio = tts.generate(text, sid=sid)
    end = time.perf_counter()

    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate

    elapsed_seconds = end - start
    # Real-time factor: processing time per second of generated audio.
    rtf = elapsed_seconds / duration

    info = f"""
    Wave duration : {duration:.3f} s <br/>
    Processing time: {elapsed_seconds:.3f} s <br/>
    RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
    """

    MyPrint(info)
    MyPrint(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")

    # A random name keeps concurrent requests from overwriting each other.
    filename = f"{uuid.uuid4()}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    return filename, build_html_output(info)
126
+
127
+
128
# Top-level UI container; `css` styles the result panel.
demo = gr.Blocks(css=css)

with demo:
    gr.Markdown(title)
    language_choices = list(language_to_models.keys())

    # Prefer "English" as the initial language, but fall back to the first
    # available one so the page still builds (instead of raising KeyError)
    # when the model table has no "English" entry.
    default_language = (
        "English" if "English" in language_to_models else language_choices[0]
    )
    default_models = language_to_models[default_language]

    language_radio = gr.Radio(
        label="Language",
        choices=language_choices,
        value=default_language,
    )

    model_dropdown = gr.Dropdown(
        choices=default_models,
        label="Select a model",
        value=default_models[0],
    )

    # Refresh the model list whenever another language is selected.
    language_radio.change(
        update_model_dropdown,
        inputs=language_radio,
        outputs=model_dropdown,
    )

    with gr.Tabs():
        with gr.TabItem("Please input your text"):
            input_text = gr.Textbox(
                label="Input text",
                info="Your text",
                lines=3,
                placeholder="Please input your text here",
            )

            input_sid = gr.Textbox(
                label="Speaker ID",
                info="Speaker ID",
                lines=1,
                max_lines=1,
                value="0",
                placeholder="Speaker ID. Valid only for multi-speaker model",
            )

            input_speed = gr.Slider(
                minimum=0.1,
                maximum=10,
                value=1,
                step=0.1,
                label="Speed (larger->faster; smaller->slower)",
            )

            input_button = gr.Button("Submit")

            output_audio = gr.Audio(label="Output")

            output_info = gr.HTML(label="Info")

        # The examples and the Submit button drive process() with the same
        # components, so the lists are defined once. Their order matches
        # the parameter order of process() and its returned tuple.
        process_inputs = [
            language_radio,
            model_dropdown,
            input_text,
            input_sid,
            input_speed,
        ]
        process_outputs = [
            output_audio,
            output_info,
        ]

        gr.Examples(
            examples=examples,
            fn=process,
            inputs=process_inputs,
            outputs=process_outputs,
        )

        input_button.click(
            process,
            inputs=process_inputs,
            outputs=process_outputs,
        )

    gr.Markdown(description)
216
+
217
+
218
def download_espeak_ng_data():
    """Download the espeak-ng data archive and unpack it in ``/tmp``.

    Runs ``wget`` followed by ``tar`` with ``/tmp`` as the working
    directory. The step is best-effort: a failure is reported on stderr
    and the function returns without raising, so application start-up is
    not aborted. Unlike a plain shell script, extraction is skipped when
    the download fails.

    Returns:
        None
    """
    # Imported locally so the helper does not depend on the file's
    # top-level import block.
    import subprocess
    import sys

    work_dir = "/tmp"
    archive = "espeak-ng-data.tar.bz2"
    url = (
        "https://github.com/k2-fsa/sherpa-onnx/releases/download/"
        "tts-models/espeak-ng-data.tar.bz2"
    )

    commands = (
        # -O overwrites an existing archive; without it wget would save a
        # second copy as "<name>.1" and tar would unpack the stale file.
        ["wget", "-q", "-O", archive, url],
        ["tar", "xf", archive],
    )

    for cmd in commands:
        try:
            # Argument lists (no shell) avoid quoting problems, and cwd
            # replaces the unchecked `cd /tmp`.
            completed = subprocess.run(cmd, cwd=work_dir, check=False)
        except OSError as err:
            # The tool is missing or the working directory is unusable.
            print(f"Failed to run {cmd[0]}: {err}", file=sys.stderr)
            return

        if completed.returncode != 0:
            print(
                f"Command {' '.join(cmd)} exited with status "
                f"{completed.returncode}",
                file=sys.stderr,
            )
            return
226
+
227
+
228
def _main():
    """Fetch the espeak-ng data, then start the Gradio app."""
    # The data is downloaded before the UI starts serving requests.
    download_espeak_ng_data()
    demo.launch()


if __name__ == "__main__":
    _main()