prithivMLmods committed on
Commit
185e14a
·
verified ·
1 Parent(s): a24564e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -55
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import torch
2
- import spaces
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import gradio as gr
5
  from snac import SNAC
@@ -38,40 +38,22 @@ def redistribute_codes(row):
38
  audio_hat = snac_model.decode(codes)
39
  return audio_hat.cpu()[0, 0]
40
 
41
- # Load the SNAC model (shared by all)
42
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
43
 
44
- # Load all the single-speaker language models
45
- models = {
46
- "Luna": {
47
- "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna'),
48
- "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16).cuda()
49
- },
50
- "Ceylia": {
51
- "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia'),
52
- "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16).cuda()
53
- },
54
- "Cooper": {
55
- "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper'),
56
- "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16).cuda()
57
- },
58
- "Jim": {
59
- "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim'),
60
- "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim', torch_dtype=torch.bfloat16).cuda()
61
- },
62
- }
63
 
64
  @spaces.GPU
65
- def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
66
  """
67
- Given input text and model parameters, generate speech audio using the chosen model.
68
  """
69
- # Retrieve the chosen tokenizer and model
70
- chosen = models[model_name]
71
- tokenizer = chosen["tokenizer"]
72
- model = chosen["model"]
73
-
74
- prompt = f'<custom_token_3><|begin_of_text|>{text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
75
  input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
76
 
77
  with torch.no_grad():
@@ -91,47 +73,25 @@ def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
91
  y_np = y_tensor.detach().cpu().numpy()
92
  return (24000, y_np)
93
 
94
- # Example texts with emotion tokens
95
- example_texts = [
96
- ["Hi, my name is Alex. <laugh> It's a wonderful day! <chuckle> I love coding."],
97
- ["I woke up feeling sleepy. <yawn> I need coffee! <sniffle> But I'm ready to work."],
98
- ["Oh no, I forgot my keys! <groan> <uhm> Maybe I'll try again later. <sigh>"],
99
- ["This is amazing! <gasp> Really, it's fantastic. <giggles>"]
100
- ]
101
-
102
  # Gradio Interface
103
  with gr.Blocks() as demo:
104
- # Sidebar for model selection
105
- with gr.Sidebar():
106
- gr.Markdown("# Choose Model")
107
- model_choice = gr.Dropdown(choices=list(models.keys()), value="Luna", label="Model")
108
-
109
- gr.Markdown("# Single Speaker Audio Generation")
110
- gr.Markdown("Generate speech audio using one of the single-speaker models. Use the examples below to see how emotion tokens like `<laugh>`, `<chuckle>`, `<sigh>`, etc. can be incorporated.")
111
 
112
  with gr.Row():
113
  text_input = gr.Textbox(lines=4, label="Input Text")
114
 
115
- # Examples with emotion tokens
116
- gr.Examples(
117
- examples=example_texts,
118
- inputs=text_input,
119
- label="Emotion Examples",
120
- cache_examples=False
121
- )
122
-
123
  with gr.Row():
124
  temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
125
  top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
126
- tokens_slider = gr.Slider(minimum=100, maximum=3500, step=50, value=1200, label="Max New Tokens")
127
 
128
  output_audio = gr.Audio(type="numpy", label="Generated Audio")
129
  generate_button = gr.Button("Generate Audio")
130
 
131
- # Pass the selected model name along with other parameters
132
  generate_button.click(
133
  fn=generate_audio,
134
- inputs=[text_input, temp_slider, top_p_slider, tokens_slider, model_choice],
135
  outputs=output_audio
136
  )
137
 
 
1
  import torch
2
+ import spaces
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import gradio as gr
5
  from snac import SNAC
 
38
  audio_hat = snac_model.decode(codes)
39
  return audio_hat.cpu()[0, 0]
40
 
41
+ # Load the SNAC model for audio decoding
42
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
43
 
44
+ # Load the single-speaker language model
45
+ tokenizer = AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna')
46
+ model = AutoModelForCausalLM.from_pretrained(
47
+ 'prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16
48
+ ).cuda()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  @spaces.GPU
51
+ def generate_audio(text, temperature, top_p, max_new_tokens):
52
  """
53
+ Given input text, generate speech audio.
54
  """
55
+ speaker = "Luna"
56
+ prompt = f'<custom_token_3><|begin_of_text|>{speaker}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
 
 
 
 
57
  input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
58
 
59
  with torch.no_grad():
 
73
  y_np = y_tensor.detach().cpu().numpy()
74
  return (24000, y_np)
75
 
 
 
 
 
 
 
 
 
76
  # Gradio Interface
77
  with gr.Blocks() as demo:
78
+ gr.Markdown("# Llama-3B-Mono-Luna - Single Speaker Audio Generation")
79
+ gr.Markdown("Generate speech audio using the `prithivMLmods/Llama-3B-Mono-Luna` model.")
 
 
 
 
 
80
 
81
  with gr.Row():
82
  text_input = gr.Textbox(lines=4, label="Input Text")
83
 
 
 
 
 
 
 
 
 
84
  with gr.Row():
85
  temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
86
  top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
87
+ tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")
88
 
89
  output_audio = gr.Audio(type="numpy", label="Generated Audio")
90
  generate_button = gr.Button("Generate Audio")
91
 
 
92
  generate_button.click(
93
  fn=generate_audio,
94
+ inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
95
  outputs=output_audio
96
  )
97