Commit: update

Files changed:
- app.py (+125 -65)
- config.py (+6 -0)
- templates/index.html (+258 -234)
- utils/utils.py (+4 -0)
- voice.py (+95 -46)
app.py
CHANGED

@@ -16,7 +16,8 @@ app.config.from_pyfile("config.py")
 
 scheduler = APScheduler()
 scheduler.init_app(app)
-scheduler.start()
+if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
+    scheduler.start()
 
 logzero.loglevel(logging.WARNING)
 logger = logging.getLogger("vits-simple-api")

@@ -53,7 +54,8 @@ def require_api_key(func):
 @app.route('/', methods=["GET", "POST"])
 def index():
     kwargs = {
-        "speakers": tts.voice_speakers
+        "speakers": tts.voice_speakers,
+        "speakers_count": tts.speakers_count
     }
     return render_template("index.html", **kwargs)
 

@@ -77,6 +79,7 @@ def voice_vits_api():
             noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
             noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
             max = int(request.args.get("max", app.config.get("MAX", 50)))
+            use_streaming = request.args.get('streaming', False, type=bool)
         elif request.method == "POST":
             content_type = request.headers.get('Content-Type')
             if content_type == 'application/json':

@@ -91,6 +94,7 @@ def voice_vits_api():
                 noise = float(data.get("noise", app.config.get("NOISE", 0.667)))
                 noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
                 max = int(data.get("max", app.config.get("MAX", 50)))
+                use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[VITS] {e}")
         return make_response("parameter error", 400)

@@ -120,23 +124,37 @@ def voice_vits_api():
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
+    if use_streaming and format.upper() != "MP3":
+        format = "mp3"
+        logger.warning("Streaming response only supports MP3 format.")
+
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
+    task = {"text": text,
+            "id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "max": max,
+            "lang": lang,
+            "speaker_lang": speaker_lang}
+
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[VITS] {fname}")
+
+    if use_streaming:
+        audio = tts.stream_vits_infer(task, fname)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        t1 = time.time()
+        audio = tts.vits_infer(task, fname)
+        t2 = time.time()
+        logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/hubert-vits', methods=["POST"])

@@ -150,6 +168,7 @@ def voice_hubert_api():
         length = float(request.form.get("length", app.config.get("LENGTH", 1)))
         noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
         noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+        use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[hubert] {e}")
         return make_response("parameter error", 400)

@@ -168,18 +187,27 @@ def voice_hubert_api():
         return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
 
     file_type = f"audio/{format}"
+    task = {"id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)}
 
     t1 = time.time()
-                                   "format": format,
-                                   "length": length,
-                                   "noise": noise,
-                                   "noisew": noisew,
-                                   "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
+    audio = tts.hubert_vits_infer(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[hubert] {fname}")
     logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/w2v2-vits', methods=["GET", "POST"])

@@ -196,6 +224,7 @@ def voice_w2v2_api():
             noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
             max = int(request.args.get("max", app.config.get("MAX", 50)))
             emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
+            use_streaming = request.args.get('streaming', False, type=bool)
         elif request.method == "POST":
             content_type = request.headers.get('Content-Type')
             if content_type == 'application/json':

@@ -211,6 +240,7 @@ def voice_w2v2_api():
                 noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
                 max = int(data.get("max", app.config.get("MAX", 50)))
                 emotion = int(data.get("emotion", app.config.get("EMOTION", 0)))
+                use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[w2v2] {e}")
         return make_response(f"parameter error", 400)

@@ -241,24 +271,37 @@ def voice_w2v2_api():
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
+    if use_streaming and format.upper() != "MP3":
+        format = "mp3"
+        logger.warning("Streaming response only supports MP3 format.")
+
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
+    task = {"text": text,
+            "id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "max": max,
+            "lang": lang,
+            "emotion": emotion,
+            "speaker_lang": speaker_lang}
+
     t1 = time.time()
-                                 "id": id,
-                                 "format": format,
-                                 "length": length,
-                                 "noise": noise,
-                                 "noisew": noisew,
-                                 "max": max,
-                                 "lang": lang,
-                                 "emotion": emotion,
-                                 "speaker_lang": speaker_lang})
+    audio = tts.w2v2_vits_infer(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[W2V2] {fname}")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/conversion', methods=["POST"])

@@ -271,29 +314,35 @@ def vits_voice_conversion_api():
         original_id = int(request.form["original_id"])
         target_id = int(request.form["target_id"])
         format = request.form.get("format", voice.filename.split(".")[1])
+        use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[vits_voice_convertsion] {e}")
         return make_response("parameter error", 400)
 
+    logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
     fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
     voice.save(audio_path)
     file_type = f"audio/{format}"
+    task = {"audio_path": audio_path,
+            "original_id": original_id,
+            "target_id": target_id,
+            "format": format}
 
-    logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
     t1 = time.time()
-        output = tts.vits_voice_conversion({"audio_path": audio_path,
-                                            "original_id": original_id,
-                                            "target_id": target_id,
-                                            "format": format})
-    except Exception as e:
-        logger.info(f"[vits_voice_convertsion] {e}")
-        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+    audio = tts.vits_voice_conversion(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[Voice conversion] {fname}")
+    logger.info(f"[Voice conversion] finish in {(t2 - t1):.2f}s")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/ssml', methods=["POST"])

@@ -312,20 +361,24 @@ def ssml():
 
     logger.debug(ssml)
 
-    t1 = time.time()
-    try:
-        output, format = tts.create_ssml_infer_task(ssml)
-    except Exception as e:
-        logger.info(f"[ssml] {e}")
-        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
-    t2 = time.time()
-
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
 
+    t1 = time.time()
+    audio, format = tts.create_ssml_infer_task(ssml, fname)
+    t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[ssml] {fname}")
     logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
 
+    if eval(ssml.get('streaming', False)):
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/dimension-emotion', methods=["POST"])

@@ -333,6 +386,7 @@ def dimensional_emotion():
     if request.method == "POST":
         try:
             audio = request.files['upload']
+            use_streaming = request.form.get('streaming', False, type=bool)
         except Exception as e:
             logger.error(f"[dimensional_emotion] {e}")
             return make_response("parameter error", 400)

@@ -341,9 +395,15 @@ def dimensional_emotion():
 
     file_type = "application/octet-stream; charset=ascii"
     fname = os.path.splitext(audio.filename)[0] + ".npy"
+    audio = tts.get_dimensional_emotion_npy(content)
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/check', methods=["GET", "POST"])

@@ -400,7 +460,8 @@ def check():
 
 
 # regular cleaning
-@scheduler.task('interval', id='clean_task', seconds=
+@scheduler.task('interval', id='clean_task', seconds=app.config.get("CLEAN_INTERVAL_SECONDS", 3600),
+                misfire_grace_time=900)
 def clean_task():
     clean_folder(app.config["UPLOAD_FOLDER"])
     clean_folder(app.config["CACHE_PATH"])

@@ -409,4 +470,3 @@ def clean_task():
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False))  # publicly accessible
     # app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True)  # run locally / debug
-
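For reference, a minimal client for the reworked /voice/vits endpoint might look like the sketch below. The host, port (23456 is the PORT default in config.py), and speaker id 164 are assumptions for illustration, not part of the commit. One caveat worth noting: because the handlers parse the flag with Flask's type=bool, any non-empty value (including streaming=false) converts to True; omit the parameter entirely to get the buffered response.

    # Sketch of a client for /voice/vits; host, port, and id are assumptions.
    import requests

    base = "http://127.0.0.1:23456"

    # Buffered response: the server synthesizes everything, then returns one file.
    r = requests.get(f"{base}/voice/vits",
                     params={"text": "你好,こんにちは", "id": 164, "format": "mp3"})
    with open("out.mp3", "wb") as f:
        f.write(r.content)

    # Streaming response: MP3 chunks arrive as they are encoded.
    with requests.get(f"{base}/voice/vits",
                      params={"text": "你好,こんにちは", "id": 164, "streaming": "true"},
                      stream=True) as r:
        with open("stream.mp3", "wb") as f:
            for chunk in r.iter_content(chunk_size=4096):
                f.write(chunk)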
config.py
CHANGED

@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
 # Cache path
 CACHE_PATH = ABS_PATH + "/cache"
 
+# If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
+CLEAN_INTERVAL_SECONDS = 3600
+
+# Save audio to CACHE_PATH
+SAVE_AUDIO = False
+
 # zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
 LANGUAGE_AUTOMATIC_DETECT = []
 
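Both new keys are read through app.config, which app.py fills with app.config.from_pyfile("config.py") (Flask only picks up uppercase names). A minimal standalone sketch of the pattern, with the same fallbacks the handlers use:

    # Standalone sketch of how app.py consumes these keys (not project code).
    from flask import Flask

    app = Flask(__name__)
    app.config.from_pyfile("config.py")  # uppercase module constants become config keys

    # The defaults mirror the fallbacks used in app.py, so an older config.py
    # without these keys still works.
    if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
        print("periodic cache cleaning enabled")
    if app.config.get("SAVE_AUDIO", False):
        print("synthesized audio will be kept in", app.config.get("CACHE_PATH"))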
templates/index.html
CHANGED

@@ -1,237 +1,261 @@
 <!DOCTYPE html>
 <html lang="en">
+<head>
+    <meta charset="UTF-8"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+    <title>vits-simple-api</title>
+
+    <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
+</head>
+<body>
+<main style="margin: 0 auto; width: 1024px">
+    <h1>
+        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+           style="text-decoration: none; color: black"> vits-simple-api </a>
+    </h1>
+
+    <div>
+        <label>Docs:</label>
+        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+           style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
+    </div>
+    <div>
+        <label>Speakers list (JSON):</label>
+        <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
+           style="text-decoration: none; color: black">
+            https://artrajz-vits-simple-api.hf.space/voice/speakers
+        </a>
+    </div>
+    <div>
+        <label>Simple API call:</label>
+        <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
+           style="text-decoration: none; color: black">
+            https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
+        </a>
+    </div>
+
+    <!-- <div style="display: flex; justify-content: center; align-items: center"> -->
+    <div>
+        <form>
+            <div class="form-group">
+                <label>text</label>
+                <textarea class="form-control" id="inputText" rows="3" oninput="updateLink()">你好,こんにちは</textarea>
+            </div>
+            <div class="form-group">
+                <label>id</label>
+                <select class="form-control" id="inputId" oninput="updateLink()">
+                    {% for speaker in speakers["VITS"] %}
+                    {% if speaker["name"] == "雷电将军(雷神)" %}
+                    <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
+                        | {{ speaker["lang"] }}</option>
+                    {% else %}
+                    <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
+                        | {{ speaker["lang"] }}</option>
+                    {% endif %}
+                    {% endfor %}
+                </select>
+            </div>
+        </form>
+    </div>
+    <p>
+        <button class="btn btn-primary" type="button" data-toggle="collapse" data-target="#collapseExample"
+                aria-expanded="false" aria-controls="collapseExample">
+            Advanced
+        </button>
+        {% if speakers_count == 0 %}
+        <div style="color: red;">No models loaded</div>
+        {% endif %}
+    </p>
+    <div class="collapse" id="collapseExample">
+        <div class="card card-body">
+            <form>
+                <div class="form-group">
+                    <label>format</label>
+                    <select class="form-control" id="inputFormat" oninput="updateLink()">
+                        <option></option>
+                        <option>wav</option>
+                        <option>mp3</option>
+                        <option>ogg</option>
+                        <option>silk</option>
+                    </select>
+                </div>
+                <div class="form-group">
+                    <label>lang</label>
+                    <input type="text" class="form-control" id="inputLang" oninput="updateLink()" value=""
+                           placeholder="auto"/>
+                </div>
+                <div class="form-group">
+                    <label>length</label>
+                    <input type="text" class="form-control" id="inputLength" oninput="updateLink()" value=""
+                           placeholder="1"/>
+                </div>
+                <div class="form-group">
+                    <label>noise</label>
+                    <input type="text" class="form-control" id="inputNoise" oninput="updateLink()" value=""
+                           placeholder="0.33"/>
+                </div>
+                <div class="form-group">
+                    <label>noisew</label>
+                    <input type="text" class="form-control" id="inputNoisew" oninput="updateLink()" value=""
+                           placeholder="0.4"/>
+                </div>
+                <div class="form-group">
+                    <label>max</label>
+                    <input type="text" class="form-control" id="inputMax" oninput="updateLink()" value=""
+                           placeholder="50"/>
+                </div>
+            </form>
+        </div>
+    </div>
+
+    <div style="display: flex; justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
+        <button type="button" class="btn btn-outline-secondary" id="getAudio" style="margin-right: 10px">Generate in player</button>
+        <audio id="audioPlayer" controls>
+            <source src="" type="audio/mp3"/>
+            Your browser does not support the audio element.
+        </audio>
+        <div class="form-group form-check">
+            <input type="checkbox" id="streaming">
+            <label class="form-check-label">Streaming response</label>
+        </div>
+    </div>
+    <div>Automatic language detection: the detectable languages vary by speaker; dialects cannot be detected automatically.</div>
+    <div>Dialect models need the language specified manually, e.g. Cantonese requires the parameter lang=gd.</div>
+    <br/>
+
+    <h2>All models were collected from the internet. Thanks to the original model authors for their work!</h2>
+    <p>
+        Nene_Nanami_Rong_Tang:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        louise:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        Cantonese:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        shanghainese:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        w2v2-vits:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        vctk:
+        <a href="https://github.com/jaywalnut310/vits" rel="noreferrer" target="_blank">jaywalnut310/vits</a>
+    </p>
+    <p>
+        Bishojo Mangekyo:
+        <a href="https://github.com/Francis-Komizu/VITS" rel="noreferrer" target="_blank">Francis-Komizu/VITS</a>
+    </p>
+    <p>
+        genshin:
+        <a href="https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai" rel="noreferrer" target="_blank">zomehwh/vits-uma-genshin-honkai</a>
+    </p>
+    <p>
+        paimon:
+        <a href="https://github.com/zixiiu/Digital_Life_Server" rel="noreferrer" target="_blank">zixiiu/Digital_Life_Server</a>
+    </p>
+    <p>
+        vits_chinese:
+        <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
+    </p>
+
+</main>
+
+<script src="/static/js/jquery.slim.min.js"></script>
+<script src="/static/js/bootstrap.bundle.min.js"></script>
+
+<script>
+    function getProtocol() {
+        return 'https:' == location.protocol ? "https://" : "http://";
+    }
+
+    function getUrl() {
+        var url = window.location.host;
+        return url;
+    }
+
+    var baseUrl = getProtocol() + getUrl();
+
+    setBaseUrl();
+
+    function setBaseUrl() {
+        var text = document.getElementById("inputText").value;
+        var id = document.getElementById("inputId").value;
+
+        var vitsLink = document.getElementById("vitsLink");
+        var speakersLink = document.getElementById("speakersLink");
+
+        var vitsUrl = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        var speakersUrl = baseUrl + "/voice/speakers";
+
+        vitsLink.href = vitsUrl;
+        vitsLink.textContent = vitsUrl;
+
+        speakersLink.href = speakersUrl;
+        speakersLink.textContent = speakersUrl;
+    }
+
+    function getLink() {
+        var text = document.getElementById("inputText").value;
+        var id = document.getElementById("inputId").value;
+        var format = document.getElementById("inputFormat").value;
+        var lang = document.getElementById("inputLang").value;
+        var length = document.getElementById("inputLength").value;
+        var noise = document.getElementById("inputNoise").value;
+        var noisew = document.getElementById("inputNoisew").value;
+        var max = document.getElementById("inputMax").value;
+
+        var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        if (format != "") {
+            url += "&format=" + format;
+        }
+        if (lang != "") {
+            url += "&lang=" + lang;
+        }
+        if (length != "") {
+            url += "&length=" + length;
+        }
+        if (noise != "") {
+            url += "&noise=" + noise;
+        }
+        if (noisew != "") {
+            url += "&noisew=" + noisew;
+        }
+        if (max != "") {
+            url += "&max=" + max;
+        }
+        return url;
+    }
+
+    function updateLink() {
+        var url = getLink();
+        var link = document.getElementById("vitsLink");
+        link.href = url;
+        link.textContent = url;
+    }
+
+    function setAudioSource() {
+        var streaming = document.getElementById('streaming');
+        var url = getLink();
+        if (streaming.checked) {
+            url += '&streaming=true';
+        }
+
+        var audioPlayer = document.getElementById("audioPlayer");
+        audioPlayer.src = url;
+        audioPlayer.play();
+    }
+
+    var button = document.getElementById("getAudio");
+    button.addEventListener("click", function () {
+        setAudioSource();
+    });
+</script>
+</body>
 </html>
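Note that getLink() assembles the query string by plain concatenation, so text containing characters such as & or # would corrupt the URL. A client outside the browser can build the same link with explicit percent-encoding; this is an illustrative sketch, not part of the page:

    # Building the page's /voice/vits link with percent-encoding (sketch).
    from urllib.parse import urlencode

    base_url = "https://artrajz-vits-simple-api.hf.space"  # host shown on the page
    params = {"text": "你好,こんにちは", "id": 164, "streaming": "true"}
    print(f"{base_url}/voice/vits?{urlencode(params)}")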
utils/utils.py
CHANGED

@@ -89,3 +89,7 @@ def clean_folder(folder_path):
 # is none -> True, is not none -> False
 def check_is_none(s):
     return s is None or (isinstance(s, str) and str(s).isspace()) or str(s) == ""
+
+def save_audio(audio, path):
+    with open(path, "wb") as f:
+        f.write(audio)
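save_audio expects raw bytes rather than a file-like object, which is why the callers in voice.py unwrap their BytesIO buffers with .getvalue() first. A small usage sketch (the import path assumes the project root is on sys.path, and the byte string is a stand-in):

    # Usage sketch for save_audio.
    from io import BytesIO
    from utils.utils import save_audio

    buf = BytesIO(b"RIFF....WAVEfmt ")  # stand-in for an encoded audio buffer
    save_audio(buf.getvalue(), "cache/example.wav")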
voice.py
CHANGED

@@ -1,7 +1,6 @@
 import os
 import librosa
 import commons
-import sys
 import re
 import numpy as np
 import torch

@@ -156,7 +155,7 @@ class vits:
 
         return params
 
-    def
+    def get_tasks(self, voice):
         text = voice.get("text", None)
         speaker_id = voice.get("id", 0)
         length = voice.get("length", 1)

@@ -171,47 +170,57 @@ class vits:
         # Remove all redundant whitespace
         if text is not None: text = re.sub(r'\s+', ' ', text).strip()
 
-        # Pause 0.75 s to avoid abrupt joins when segments are synthesized separately and concatenated
-        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
         tasks = []
         if self.model_type == "vits":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
+                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
+                                              noise_scale=noise, noise_scale_w=noisew)
+                tasks.append(params)
-            audios = []
-            for task in tasks:
-                audios.append(self.infer(task))
-                if auto_break:
-                    audios.append(brk)
-
-            audio = np.concatenate(audios, axis=0)
 
         elif self.model_type == "hubert":
             params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
                                           noise_scale_w=noisew, audio_path=audio_path)
+            tasks.append(params)
 
         elif self.model_type == "w2v2":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
+                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
+                                              noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
+                tasks.append(params)
-            for task in tasks:
-                audios.append(self.infer(task))
-                if auto_break:
-                    audios.append(brk)
 
+        return tasks
+
+    def get_audio(self, voice, auto_break=False):
+        tasks = self.get_tasks(voice)
+        # Pause 0.75 s to avoid abrupt joins when segments are synthesized separately and concatenated
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        audios = []
+        for task in tasks:
+            if auto_break:
+                chunk = np.concatenate((self.infer(task), brk), axis=0)
+            else:
+                chunk = self.infer(task)
+            audios.append(chunk)
+
+        audio = np.concatenate(audios, axis=0)
         return audio
 
+    def get_stream_audio(self, voice, auto_break=False):
+        tasks = self.get_tasks(voice)
+
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        for task in tasks:
+            if auto_break:
+                chunk = np.concatenate((self.infer(task), brk), axis=0)
+            else:
+                chunk = self.infer(task)
+
+            yield chunk
+
     def voice_conversion(self, voice):
         audio_path = voice.get("audio_path")
         original_id = voice.get("original_id")

@@ -330,6 +339,14 @@ class TTS:
         else:
             raise ValueError("Unsupported time unit: {}".format(time_unit))
 
+    def generate_audio_chunks(self, audio):
+        chunk_size = 4096
+        while True:
+            chunk = audio.read(chunk_size)
+            if not chunk:
+                break
+            yield chunk
+
     def parse_ssml(self, ssml):
         root = ET.fromstring(ssml)
         format = root.attrib.get("format", "wav")

@@ -403,7 +420,7 @@ class TTS:
 
         return voice_tasks, format
 
-    def create_ssml_infer_task(self, ssml):
+    def create_ssml_infer_task(self, ssml, fname):
         voice_tasks, format = self.parse_ssml(ssml)
 
         audios = []

@@ -420,38 +437,66 @@ class TTS:
             audios.append(audio)
 
         audio = np.concatenate(audios, axis=0)
+        encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio, format
 
-    def vits_infer(self, voice):
+    def vits_infer(self, voice, fname):
        format = voice.get("format", "wav")
        voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
        voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
        audio = voice_obj.get_audio(voice, auto_break=True)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
+    def stream_vits_infer(self, voice, fname):
+        format = voice.get("format", "wav")
+        voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
+        voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
+        generator = voice_obj.get_stream_audio(voice, auto_break=True)
+        audio = BytesIO()
+        for chunk in generator:
+            encoded_audio = self.encode(sampling_rate, chunk, format)
+            for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
+                yield encoded_audio_chunk
+            if config.SAVE_AUDIO:
+                audio.write(encoded_audio.getvalue())
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(audio.getvalue(), path)
+
-    def
+    def hubert_vits_infer(self, voice, fname):
        format = voice.get("format", "wav")
        voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
        voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
        audio = voice_obj.get_audio(voice)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def w2v2_vits_infer(self, voice):
+    def w2v2_vits_infer(self, voice, fname):
        format = voice.get("format", "wav")
        voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
        voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
        audio = voice_obj.get_audio(voice, auto_break=True)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def vits_voice_conversion(self, voice):
+    def vits_voice_conversion(self, voice, fname):
        original_id = voice.get("original_id")
        target_id = voice.get("target_id")
        format = voice.get("format")

@@ -466,10 +511,14 @@ class TTS:
        voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
 
        voice_obj = self._voice_obj["VITS"][original_id][1]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
 
-        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        audio = voice_obj.voice_conversion(voice)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
     def get_dimensional_emotion_npy(self, audio):
         if self.dem is None:
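The streaming path is a chain of generators: get_stream_audio yields one synthesized segment at a time, stream_vits_infer encodes each segment and re-chunks it through generate_audio_chunks, and Flask streams whatever iterable make_response receives. A self-contained sketch of that shape, with stand-ins for the encode and inference steps (fake_encode and the segment list are placeholders, not project code):

    # Self-contained sketch of the generator chain behind stream_vits_infer.
    from io import BytesIO

    def generate_audio_chunks(audio, chunk_size=4096):
        # Same shape as TTS.generate_audio_chunks: drain a file-like object.
        while True:
            chunk = audio.read(chunk_size)
            if not chunk:
                break
            yield chunk

    def fake_encode(pcm: bytes) -> BytesIO:
        # Stand-in for TTS.encode, which returns an encoded, rewound buffer.
        return BytesIO(pcm)

    def stream(segments):
        for pcm in segments:            # stands in for get_stream_audio()
            encoded = fake_encode(pcm)  # encode one segment at a time
            yield from generate_audio_chunks(encoded)

    # Flask can wrap such a generator directly: make_response(stream(...)).
    for chunk in stream([b"a" * 10000, b"b" * 5000]):
        print(len(chunk))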