Update app.py
app.py CHANGED

@@ -49,14 +49,10 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-#
-# Use any of these tags at the start of your prompt to trigger TTS.
+# Edge TTS voices mapping for new tags.
 TTS_VOICE_MAP = {
     "@jennyneural": "en-US-JennyNeural",
     "@guyneural": "en-US-GuyNeural",
-    "@arianeural": "en-US-AriaNeural",
-    "@michaelneural": "en-US-MichaelNeural",
-    "@olivianeural": "en-US-OliviaNeural",
 }
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -129,7 +125,7 @@ def generate(input_dict: dict, chat_history: list[dict],
              repetition_penalty: float = 1.2):
     """
     Generates chatbot responses with support for multimodal input, video processing,
-    and Edge TTS when using the new tags
+    and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
     Special command:
     - "@video-infer": triggers video processing using Callisto OCR3.
     """
@@ -285,13 +281,10 @@ demo = gr.ChatInterface(
     examples=[
         ["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
         [{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
-        ["@
+        ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
        [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
        [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
-        ["@
-        ["@AriaNeural Provide an overview of the solar system."],
-        ["@MichaelNeural Summarize the benefits of a healthy lifestyle."],
-        ["@OliviaNeural Tell me a joke."]
+        ["@GuyNeural Explain how rainbows are formed."]
     ],
     cache_examples=False,
     description="# **Pocket Llama**",
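For readers following the change: the tags kept in TTS_VOICE_MAP ("@jennyneural", "@guyneural") are meant to be typed at the start of a prompt to trigger Edge TTS. The diff does not show the body of text_to_speech or the tag-detection logic, so the sketch below is only an assumption of how that routing could look using the edge-tts package; the extract_tts_voice helper and the demo() driver are illustrative names, not code from this Space.

# Minimal sketch (assumed, not this Space's actual code) of routing a
# "@jennyneural ..." prompt prefix to Edge TTS via TTS_VOICE_MAP.
import asyncio
import edge_tts  # pip install edge-tts

TTS_VOICE_MAP = {
    "@jennyneural": "en-US-JennyNeural",
    "@guyneural": "en-US-GuyNeural",
}

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    # edge_tts.Communicate synthesizes the text with the chosen neural voice
    # and save() writes the resulting audio stream to an mp3 file.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

def extract_tts_voice(prompt: str):
    # Hypothetical helper: if the prompt starts with a known tag, return the
    # mapped voice and the prompt with the tag stripped; otherwise no voice.
    stripped = prompt.strip()
    for tag, voice in TTS_VOICE_MAP.items():
        if stripped.lower().startswith(tag):
            return voice, stripped[len(tag):].strip()
    return None, stripped

async def demo():
    voice, text = extract_tts_voice("@JennyNeural Who was Nikola Tesla?")
    if voice:  # in the Space, the model's reply would be synthesized instead
        print("audio written to", await text_to_speech(text, voice))

if __name__ == "__main__":
    asyncio.run(demo())

In the Space itself, generate() would presumably strip the tag before building the chat prompt and feed the model's reply to text_to_speech, but that part of the flow is outside this diff.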