Spaces:

prithivMLmods
/

Callisto-OCR3-2B

Running on Zero

App Files Community

prithivMLmods committed 24 days ago

Commit

185e14a

verified ·

1 Parent(s): a24564e

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -55

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 from snac import SNAC
@@ -38,40 +38,22 @@ def redistribute_codes(row):
         audio_hat = snac_model.decode(codes)
         return audio_hat.cpu()[0, 0]
-# Load the SNAC model (shared by all)
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
-# Load all the single-speaker language models
-models = {
-    "Luna": {
-        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna'),
-        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16).cuda()
-    },
-    "Ceylia": {
-        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia'),
-        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16).cuda()
-    },
-    "Cooper": {
-        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper'),
-        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16).cuda()
-    },
-    "Jim": {
-        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim'),
-        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim', torch_dtype=torch.bfloat16).cuda()
-    },
-}
 @spaces.GPU
-def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
     """
-    Given input text and model parameters, generate speech audio using the chosen model.
     """
-    # Retrieve the chosen tokenizer and model
-    chosen = models[model_name]
-    tokenizer = chosen["tokenizer"]
-    model = chosen["model"]
-    prompt = f'<custom_token_3><|begin_of_text|>{text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
     input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
     with torch.no_grad():
@@ -91,47 +73,25 @@ def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
     y_np = y_tensor.detach().cpu().numpy()
     return (24000, y_np)
-# Example texts with emotion tokens
-example_texts = [
-    ["Hi, my name is Alex. <laugh> It's a wonderful day! <chuckle> I love coding."],
-    ["I woke up feeling sleepy. <yawn> I need coffee! <sniffle> But I'm ready to work."],
-    ["Oh no, I forgot my keys! <groan> <uhm> Maybe I'll try again later. <sigh>"],
-    ["This is amazing! <gasp> Really, it's fantastic. <giggles>"]
-]
 # Gradio Interface
 with gr.Blocks() as demo:
-    # Sidebar for model selection
-    with gr.Sidebar():
-        gr.Markdown("# Choose Model")
-        model_choice = gr.Dropdown(choices=list(models.keys()), value="Luna", label="Model")
-    gr.Markdown("# Single Speaker Audio Generation")
-    gr.Markdown("Generate speech audio using one of the single-speaker models. Use the examples below to see how emotion tokens like `<laugh>`, `<chuckle>`, `<sigh>`, etc. can be incorporated.")
     with gr.Row():
         text_input = gr.Textbox(lines=4, label="Input Text")
-    # Examples with emotion tokens
-    gr.Examples(
-        examples=example_texts,
-        inputs=text_input,
-        label="Emotion Examples",
-        cache_examples=False
-    )
     with gr.Row():
         temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
-        tokens_slider = gr.Slider(minimum=100, maximum=3500, step=50, value=1200, label="Max New Tokens")
     output_audio = gr.Audio(type="numpy", label="Generated Audio")
     generate_button = gr.Button("Generate Audio")
-    # Pass the selected model name along with other parameters
     generate_button.click(
         fn=generate_audio,
-        inputs=[text_input, temp_slider, top_p_slider, tokens_slider, model_choice],
         outputs=output_audio
     )

 import torch
+ import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 from snac import SNAC
         audio_hat = snac_model.decode(codes)
         return audio_hat.cpu()[0, 0]
+# Load the SNAC model for audio decoding
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
+# Load the single-speaker language model
+tokenizer = AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna')
+model = AutoModelForCausalLM.from_pretrained(
+    'prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16
+).cuda()
 @spaces.GPU
+def generate_audio(text, temperature, top_p, max_new_tokens):
     """
+    Given input text, generate speech audio.
     """
+    speaker = "Luna"
+    prompt = f'<custom_token_3><|begin_of_text|>{speaker}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
     input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
     with torch.no_grad():
     y_np = y_tensor.detach().cpu().numpy()
     return (24000, y_np)
 # Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Llama-3B-Mono-Luna - Single Speaker Audio Generation")
+    gr.Markdown("Generate speech audio using the `prithivMLmods/Llama-3B-Mono-Luna` model.")
     with gr.Row():
         text_input = gr.Textbox(lines=4, label="Input Text")
     with gr.Row():
         temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
+        tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")
     output_audio = gr.Audio(type="numpy", label="Generated Audio")
     generate_button = gr.Button("Generate Audio")
     generate_button.click(
         fn=generate_audio,
+        inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
         outputs=output_audio
     )