Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import torch
|
2 |
-
import spaces
|
3 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
4 |
import gradio as gr
|
5 |
from snac import SNAC
|
@@ -38,40 +38,22 @@ def redistribute_codes(row):
|
|
38 |
audio_hat = snac_model.decode(codes)
|
39 |
return audio_hat.cpu()[0, 0]
|
40 |
|
41 |
-
# Load the SNAC model
|
42 |
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
|
43 |
|
44 |
-
# Load
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
},
|
50 |
-
"Ceylia": {
|
51 |
-
"tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia'),
|
52 |
-
"model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16).cuda()
|
53 |
-
},
|
54 |
-
"Cooper": {
|
55 |
-
"tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper'),
|
56 |
-
"model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16).cuda()
|
57 |
-
},
|
58 |
-
"Jim": {
|
59 |
-
"tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim'),
|
60 |
-
"model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim', torch_dtype=torch.bfloat16).cuda()
|
61 |
-
},
|
62 |
-
}
|
63 |
|
64 |
@spaces.GPU
|
65 |
-
def generate_audio(text, temperature, top_p, max_new_tokens
|
66 |
"""
|
67 |
-
Given input text
|
68 |
"""
|
69 |
-
|
70 |
-
|
71 |
-
tokenizer = chosen["tokenizer"]
|
72 |
-
model = chosen["model"]
|
73 |
-
|
74 |
-
prompt = f'<custom_token_3><|begin_of_text|>{text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
|
75 |
input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
|
76 |
|
77 |
with torch.no_grad():
|
@@ -91,47 +73,25 @@ def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
|
|
91 |
y_np = y_tensor.detach().cpu().numpy()
|
92 |
return (24000, y_np)
|
93 |
|
94 |
-
# Example texts with emotion tokens
|
95 |
-
example_texts = [
|
96 |
-
["Hi, my name is Alex. <laugh> It's a wonderful day! <chuckle> I love coding."],
|
97 |
-
["I woke up feeling sleepy. <yawn> I need coffee! <sniffle> But I'm ready to work."],
|
98 |
-
["Oh no, I forgot my keys! <groan> <uhm> Maybe I'll try again later. <sigh>"],
|
99 |
-
["This is amazing! <gasp> Really, it's fantastic. <giggles>"]
|
100 |
-
]
|
101 |
-
|
102 |
# Gradio Interface
|
103 |
with gr.Blocks() as demo:
|
104 |
-
#
|
105 |
-
|
106 |
-
gr.Markdown("# Choose Model")
|
107 |
-
model_choice = gr.Dropdown(choices=list(models.keys()), value="Luna", label="Model")
|
108 |
-
|
109 |
-
gr.Markdown("# Single Speaker Audio Generation")
|
110 |
-
gr.Markdown("Generate speech audio using one of the single-speaker models. Use the examples below to see how emotion tokens like `<laugh>`, `<chuckle>`, `<sigh>`, etc. can be incorporated.")
|
111 |
|
112 |
with gr.Row():
|
113 |
text_input = gr.Textbox(lines=4, label="Input Text")
|
114 |
|
115 |
-
# Examples with emotion tokens
|
116 |
-
gr.Examples(
|
117 |
-
examples=example_texts,
|
118 |
-
inputs=text_input,
|
119 |
-
label="Emotion Examples",
|
120 |
-
cache_examples=False
|
121 |
-
)
|
122 |
-
|
123 |
with gr.Row():
|
124 |
temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
|
125 |
top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
|
126 |
-
tokens_slider = gr.Slider(minimum=100, maximum=
|
127 |
|
128 |
output_audio = gr.Audio(type="numpy", label="Generated Audio")
|
129 |
generate_button = gr.Button("Generate Audio")
|
130 |
|
131 |
-
# Pass the selected model name along with other parameters
|
132 |
generate_button.click(
|
133 |
fn=generate_audio,
|
134 |
-
inputs=[text_input, temp_slider, top_p_slider, tokens_slider
|
135 |
outputs=output_audio
|
136 |
)
|
137 |
|
|
|
1 |
import torch
|
2 |
+
import spaces
|
3 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
4 |
import gradio as gr
|
5 |
from snac import SNAC
|
|
|
38 |
audio_hat = snac_model.decode(codes)
|
39 |
return audio_hat.cpu()[0, 0]
|
40 |
|
41 |
+
# Load the SNAC model for audio decoding
|
42 |
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
|
43 |
|
44 |
+
# Load the single-speaker language model
|
45 |
+
tokenizer = AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna')
|
46 |
+
model = AutoModelForCausalLM.from_pretrained(
|
47 |
+
'prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16
|
48 |
+
).cuda()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
@spaces.GPU
|
51 |
+
def generate_audio(text, temperature, top_p, max_new_tokens):
|
52 |
"""
|
53 |
+
Given input text, generate speech audio.
|
54 |
"""
|
55 |
+
speaker = "Luna"
|
56 |
+
prompt = f'<custom_token_3><|begin_of_text|>{speaker}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
|
|
|
|
|
|
|
|
|
57 |
input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
|
58 |
|
59 |
with torch.no_grad():
|
|
|
73 |
y_np = y_tensor.detach().cpu().numpy()
|
74 |
return (24000, y_np)
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
# Gradio Interface
|
77 |
with gr.Blocks() as demo:
|
78 |
+
gr.Markdown("# Llama-3B-Mono-Luna - Single Speaker Audio Generation")
|
79 |
+
gr.Markdown("Generate speech audio using the `prithivMLmods/Llama-3B-Mono-Luna` model.")
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
with gr.Row():
|
82 |
text_input = gr.Textbox(lines=4, label="Input Text")
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
with gr.Row():
|
85 |
temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
|
86 |
top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
|
87 |
+
tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")
|
88 |
|
89 |
output_audio = gr.Audio(type="numpy", label="Generated Audio")
|
90 |
generate_button = gr.Button("Generate Audio")
|
91 |
|
|
|
92 |
generate_button.click(
|
93 |
fn=generate_audio,
|
94 |
+
inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
|
95 |
outputs=output_audio
|
96 |
)
|
97 |
|