Update app.py
app.py CHANGED

@@ -49,14 +49,10 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-#
-# Use any of these tags at the start of your prompt to trigger TTS.
+# Edge TTS voices mapping for new tags.
 TTS_VOICE_MAP = {
     "@jennyneural": "en-US-JennyNeural",
     "@guyneural": "en-US-GuyNeural",
-    "@arianeural": "en-US-AriaNeural",
-    "@michaelneural": "en-US-MichaelNeural",
-    "@olivianeural": "en-US-OliviaNeural",
 }
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -129,7 +125,7 @@ def generate(input_dict: dict, chat_history: list[dict],
              repetition_penalty: float = 1.2):
     """
     Generates chatbot responses with support for multimodal input, video processing,
-    and Edge TTS when using the new tags
+    and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
     Special command:
     - "@video-infer": triggers video processing using Callisto OCR3.
     """
@@ -285,13 +281,10 @@ demo = gr.ChatInterface(
     examples=[
         ["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
         [{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
-        ["@
+        ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
        [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
        [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
-        ["@
-        ["@AriaNeural Provide an overview of the solar system."],
-        ["@MichaelNeural Summarize the benefits of a healthy lifestyle."],
-        ["@OliviaNeural Tell me a joke."]
+        ["@GuyNeural Explain how rainbows are formed."]
     ],
     cache_examples=False,
     description="# **Pocket Llama**",
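For readers following the change: the tags kept in TTS_VOICE_MAP ("@jennyneural", "@guyneural") are meant to be typed at the start of a prompt to trigger Edge TTS. The diff does not show the body of text_to_speech or the tag-detection logic, so the sketch below is only an assumption of how that routing could look using the edge-tts package; the extract_tts_voice helper and the demo() driver are illustrative names, not code from this Space.

# Minimal sketch (assumed, not this Space's actual code) of routing a
# "@jennyneural ..." prompt prefix to Edge TTS via TTS_VOICE_MAP.
import asyncio
import edge_tts  # pip install edge-tts

TTS_VOICE_MAP = {
    "@jennyneural": "en-US-JennyNeural",
    "@guyneural": "en-US-GuyNeural",
}

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    # edge_tts.Communicate synthesizes the text with the chosen neural voice
    # and save() writes the resulting audio stream to an mp3 file.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

def extract_tts_voice(prompt: str):
    # Hypothetical helper: if the prompt starts with a known tag, return the
    # mapped voice and the prompt with the tag stripped; otherwise no voice.
    stripped = prompt.strip()
    for tag, voice in TTS_VOICE_MAP.items():
        if stripped.lower().startswith(tag):
            return voice, stripped[len(tag):].strip()
    return None, stripped

async def demo():
    voice, text = extract_tts_voice("@JennyNeural Who was Nikola Tesla?")
    if voice:  # in the Space, the model's reply would be synthesized instead
        print("audio written to", await text_to_speech(text, voice))

if __name__ == "__main__":
    asyncio.run(demo())

In the Space itself, generate() would presumably strip the tag before building the chat prompt and feed the model's reply to text_to_speech, but that part of the flow is outside this diff.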