Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
- pyproject.toml +1 -1
- src/vocalizr/__main__.py +1 -0
- src/vocalizr/model.py +97 -99
- uv.lock +1 -1
pyproject.toml
CHANGED
@@ -5,7 +5,7 @@ description = "Voice Generator part of the Chatacter Backend"
|
|
5 |
readme = "README.md"
|
6 |
requires-python = ">=3.13, <3.14"
|
7 |
dependencies = [
|
8 |
-
"gradio[mcp]>=5.
|
9 |
"kokoro>=0.9.4",
|
10 |
"soundfile>=0.13.1",
|
11 |
"pip>=25.1.1",
|
|
|
5 |
readme = "README.md"
|
6 |
requires-python = ">=3.13, <3.14"
|
7 |
dependencies = [
|
8 |
+
"gradio[mcp]>=5.35.0",
|
9 |
"kokoro>=0.9.4",
|
10 |
"soundfile>=0.13.1",
|
11 |
"pip>=25.1.1",
|
src/vocalizr/__main__.py
CHANGED
@@ -15,6 +15,7 @@ def main() -> None:
|
|
15 |
show_api=True,
|
16 |
enable_monitoring=True,
|
17 |
show_error=True,
|
|
|
18 |
)
|
19 |
|
20 |
|
|
|
15 |
show_api=True,
|
16 |
enable_monitoring=True,
|
17 |
show_error=True,
|
18 |
+
pwa=True,
|
19 |
)
|
20 |
|
21 |
|
src/vocalizr/model.py
CHANGED
@@ -1,99 +1,97 @@
|
|
1 |
-
from typing import Any, Generator, Literal
|
2 |
-
|
3 |
-
from gradio import Error
|
4 |
-
from kokoro import KPipeline
|
5 |
-
from loguru import logger
|
6 |
-
from numpy import float32
|
7 |
-
from
|
8 |
-
from
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
:
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
:param
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
:param debug: Whether to enable debug mode. Defaults to False.
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
:
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
logger.exception("
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
first = False
|
99 |
-
yield 24000, zeros(1).numpy()
|
|
|
1 |
+
from typing import Any, Generator, Literal
|
2 |
+
|
3 |
+
from gradio import Error
|
4 |
+
from kokoro import KPipeline
|
5 |
+
from loguru import logger
|
6 |
+
from numpy import dtype, float32, ndarray
|
7 |
+
from soundfile import write
|
8 |
+
from torch import zeros
|
9 |
+
|
10 |
+
from vocalizr import AUDIO_FILE_PATH, PIPELINE
|
11 |
+
|
12 |
+
|
13 |
+
@logger.catch
def save_file_wav(audio: ndarray[tuple[float32], dtype[float32]]) -> None:
    """
    Write an audio array to ``AUDIO_FILE_PATH`` as a WAV file sampled at 24 kHz.

    The target path is fixed, so every call overwrites the previous file.

    If the write fails, the error is logged and wrapped in a ``RuntimeError``
    chained to the original exception. Note that the ``@logger.catch``
    decorator is applied with its defaults, which (per loguru's documented
    ``reraise=False`` default) log the exception and do *not* propagate it, so
    callers of this function will not see the ``RuntimeError``.

    :param ndarray[tuple[float32],dtype[float32]] audio: The audio samples to
        write to the file.

    :return: This function does not return a value.
    :rtype: None
    """
    try:
        logger.info(f"Saving audio to {AUDIO_FILE_PATH}")
        write(file=AUDIO_FILE_PATH, data=audio, samplerate=24000)
    except Exception as e:
        # Build the message once so the log record and the raised error agree.
        message = f"Failed to save audio to {AUDIO_FILE_PATH}: {e}"
        logger.exception(message)
        raise RuntimeError(message) from e
|
32 |
+
|
33 |
+
|
34 |
+
@logger.catch
def generate_audio_for_text(
    text: str,
    voice: str = "af_heart",
    speed: float = 1.0,
    save_file: bool = False,
    debug: bool = False,
    char_limit: int = -1,
) -> Generator[
    tuple[Literal[24000], ndarray[tuple[float32], dtype[float32]]]
    | tuple[int, ndarray],
    Any,
    None,
]:
    """
    Generate audio for ``text`` with the given voice and speed, chunk by chunk.

    Each chunk produced by ``PIPELINE`` is converted to a NumPy array and
    yielded together with the fixed sampling rate of 24,000 Hz. Directly after
    the first chunk, one extra single-sample array of zeros is yielded.

    :param str text: The input text to generate audio for. If ``char_limit`` is
        a positive value, the text is stripped and truncated to that many
        characters. Empty text is logged as an error and produces no audio.

    :param str voice: The voice profile to use for audio generation.
        Defaults to "af_heart".

    :param float speed: The speed modifier for audio generation. Defaults to 1.0.

    :param bool save_file: Whether to save the generated audio to a file. Defaults
        to False.

    :param bool debug: Whether to enable debug mode. Defaults to False.

    :param int char_limit: The maximum number of characters to include in the input
        text. Zero or a negative value (the default is -1) disables truncation.

    :return: A generator that yields tuples, where the first element is the
        fixed sampling rate of 24,000 Hz, and the second element is a NumPy
        array representing the generated audio data.
    :rtype: Generator[tuple[Literal[24000], NDArray[float32]], Any, None]
    """
    # logger.error rather than logger.exception: there is no active exception
    # here, so logger.exception would attach an empty, misleading traceback.
    if not text:
        logger.error("No text provided")
        # Nothing to synthesize; end the generator without calling the pipeline.
        return
    if len(text) < 4:
        # Short text is reported but still synthesized, as before.
        logger.error(f"Text too short: {text} with length {len(text)}")
    # Only a positive limit truncates. Comparing against the -1 sentinel alone
    # let 0 or other negative values slice characters off unexpectedly.
    if char_limit > 0:
        text = text.strip()[:char_limit]
    generator: Generator[KPipeline.Result, None, None] = PIPELINE(
        text=text, voice=voice, speed=speed
    )
    first = True
    for _, _, audio in generator:
        if audio is None or isinstance(audio, str):
            logger.error(f"Unexpected type (audio): {type(audio)}")
            raise Error(message=f"Unexpected type (audio): {type(audio)}")
        if debug:
            logger.info(f"Generating audio for '{text}'")
        audio_np: ndarray[tuple[float32], dtype[float32]] = audio.numpy()
        if save_file:
            if debug:
                logger.info(f"Saving audio file at {AUDIO_FILE_PATH}")
            # NOTE(review): save_file_wav always writes to AUDIO_FILE_PATH, so
            # with several chunks only the last one stays on disk - confirm
            # whether the chunks should be concatenated instead.
            save_file_wav(audio=audio_np)
        yield 24000, audio_np
        if first:
            # NOTE(review): placement inside the loop is reconstructed from a
            # view without indentation - confirm against the repository.
            first = False
            yield 24000, zeros(1).numpy()
|
|
|
|
uv.lock
CHANGED
@@ -1894,7 +1894,7 @@ dev = [
|
|
1894 |
|
1895 |
[package.metadata]
|
1896 |
requires-dist = [
|
1897 |
-
{ name = "gradio", extras = ["mcp"], specifier = ">=5.
|
1898 |
{ name = "kokoro", specifier = ">=0.9.4" },
|
1899 |
{ name = "pip", specifier = ">=25.1.1" },
|
1900 |
{ name = "soundfile", specifier = ">=0.13.1" },
|
|
|
1894 |
|
1895 |
[package.metadata]
|
1896 |
requires-dist = [
|
1897 |
+
{ name = "gradio", extras = ["mcp"], specifier = ">=5.35.0" },
|
1898 |
{ name = "kokoro", specifier = ">=0.9.4" },
|
1899 |
{ name = "pip", specifier = ">=25.1.1" },
|
1900 |
{ name = "soundfile", specifier = ">=0.13.1" },
|