Spaces:
Running on A100
Update app.py
app.py CHANGED
@@ -22,12 +22,12 @@ model_think = PeftModel.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# ---------------------------------
-# MULTI-TURN MODEL SETUP
-# ---------------------------------
-MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
-model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
-generation_config_multi = model_multi.default_generation_config
+# # ---------------------------------
+# # MULTI-TURN MODEL SETUP
+# # ---------------------------------
+# MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+# model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
+# generation_config_multi = model_multi.default_generation_config
 
 
 # ---------------------------------
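A minimal sketch of how this setup could be kept behind an opt-in switch instead of being commented out. The LOAD_CHAT_MODEL environment variable is hypothetical; snapshot_download and llava.load are the same calls used above:

```python
import os

import llava  # same package app.py already imports
from huggingface_hub import snapshot_download

# Hypothetical opt-in flag, not part of this Space.
if os.environ.get("LOAD_CHAT_MODEL") == "1":
    MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
    model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
    generation_config_multi = model_multi.default_generation_config
else:
    # Chat features stay disabled, matching this commit.
    model_multi = None
```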
@@ -42,14 +42,14 @@ def single_turn_infer(audio_file, prompt_text):
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-def speech_prompt_infer(audio_prompt_file):
-    try:
-        sound = llava.Sound(audio_prompt_file)
-        full_prompt = "<sound>"
-        response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
-        return response
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
+# def speech_prompt_infer(audio_prompt_file):
+#     try:
+#         sound = llava.Sound(audio_prompt_file)
+#         full_prompt = "<sound>"
+#         response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
+#         return response
+#     except Exception as e:
+#         return f"❌ Error: {str(e)}"
 
 def think_infer(audio_file, prompt_text):
     try:
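Note that the disabled speech_prompt_infer generated with model_multi but passed generation_config_single, the single-turn model's config. A sketch of a restored version that keeps model and config paired and guards against the chat model being unloaded (assumes the module-level names from the code above):

```python
def speech_prompt_infer(audio_prompt_file):
    # Sketch only: model_multi and generation_config_multi are the
    # module-level names that this commit comments out.
    if model_multi is None:
        return "❌ Chat model is not loaded in this Space."
    try:
        sound = llava.Sound(audio_prompt_file)
        full_prompt = "<sound>"
        return model_multi.generate_content(
            [sound, full_prompt], generation_config=generation_config_multi
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"
```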
@@ -63,24 +63,24 @@ def think_infer(audio_file, prompt_text):
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-def multi_turn_chat(user_input, audio_file, history, current_audio):
-    try:
-        if audio_file is not None:
-            current_audio = audio_file  # Update state if a new file is uploaded
+# def multi_turn_chat(user_input, audio_file, history, current_audio):
+#     try:
+#         if audio_file is not None:
+#             current_audio = audio_file  # Update state if a new file is uploaded
 
-        if current_audio is None:
-            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
+#         if current_audio is None:
+#             return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-        sound = llava.Sound(current_audio)
-        prompt = f"<sound>\n{user_input}"
+#         sound = llava.Sound(current_audio)
+#         prompt = f"<sound>\n{user_input}"
 
-        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
-        history.append((user_input, response))
-        return history, history, current_audio
-    except Exception as e:
-        history.append((user_input, f"❌ Error: {str(e)}"))
-        return history, history, current_audio
+#         history.append((user_input, response))
+#         return history, history, current_audio
+#     except Exception as e:
+#         history.append((user_input, f"❌ Error: {str(e)}"))
+#         return history, history, current_audio
 # ---------------------------------
 # INTERFACE
 # ---------------------------------
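The disabled multi_turn_chat threads the chat history and the most recent audio path through gr.State. A self-contained sketch of the same wiring with a stub reply in place of the model call, using the tuple-style Chatbot history the original code relies on:

```python
import gradio as gr

def multi_turn_chat(user_input, audio_file, history, current_audio):
    # A new upload replaces the audio context; otherwise reuse the stored path.
    if audio_file is not None:
        current_audio = audio_file
    if current_audio is None:
        return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
    # Stub reply standing in for model_multi.generate_content(...).
    response = f"(stub) audio={current_audio!r}, question={user_input!r}"
    history.append((user_input, response))
    return history, history, current_audio

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Audio Chatbot")
    audio_in = gr.Audio(type="filepath", label="Audio Context")
    msg = gr.Textbox(label="Your message")
    send = gr.Button("Send")
    history_state = gr.State([])
    current_audio_state = gr.State(None)
    send.click(
        multi_turn_chat,
        inputs=[msg, audio_in, history_state, current_audio_state],
        outputs=[chatbot, history_state, current_audio_state],
    )

demo.launch()
```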
@@ -193,48 +193,51 @@ with gr.Blocks(css="""
         btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
     # ---------------- MULTI-TURN CHAT ----------------
     with gr.Tab("💬 Multi-Turn Chat"):
-        chatbot = gr.Chatbot(label="Audio Chatbot")
-        audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
-        user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
-        btn_multi = gr.Button("Send")
-        history_state = gr.State([])  # Chat history
-        current_audio_state = gr.State(None)  # Most recent audio file path
-
-        btn_multi.click(
-            fn=multi_turn_chat,
-            inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
-            outputs=[chatbot, history_state, current_audio_state]
-        )
-        gr.Examples(
-            examples=[
-                ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
-                ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
-            ],
-            inputs=[audio_input_multi, user_input_multi],
-            label="🧪 Try Examples"
-        )
+        # chatbot = gr.Chatbot(label="Audio Chatbot")
+        # audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+        # user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+        # btn_multi = gr.Button("Send")
+        # history_state = gr.State([])  # Chat history
+        # current_audio_state = gr.State(None)  # Most recent audio file path
+
+        # btn_multi.click(
+        #     fn=multi_turn_chat,
+        #     inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+        #     outputs=[chatbot, history_state, current_audio_state]
+        # )
+        # gr.Examples(
+        #     examples=[
+        #         ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+        #         ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+        #     ],
+        #     inputs=[audio_input_multi, user_input_multi],
+        #     label="🧪 Try Examples"
+        # )
+        # Add the link to another Gradio demo here
+        gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     with gr.Tab("🗣️ Speech Prompt"):
-        gr.Markdown("Use your **voice** to talk to the model.")
-
-        with gr.Row():
-            with gr.Column():
-                speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
-                btn_speech = gr.Button("Submit")
-                gr.Examples(
-                    examples=[
-                        ["static/voice/voice_0.mp3"],
-                        ["static/voice/voice_1.mp3"],
-                        ["static/voice/voice_2.mp3"],
-                    ],
-                    inputs=speech_input,
-                    label="🧪 Try Examples"
-                )
-            with gr.Column():
-                response_box = gr.Textbox(label="Model Response", lines=15)
-
-        btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
-
+        # gr.Markdown("Use your **voice** to talk to the model.")
+
+        # with gr.Row():
+        #     with gr.Column():
+        #         speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+        #         btn_speech = gr.Button("Submit")
+        #         gr.Examples(
+        #             examples=[
+        #                 ["static/voice/voice_0.mp3"],
+        #                 ["static/voice/voice_1.mp3"],
+        #                 ["static/voice/voice_2.mp3"],
+        #             ],
+        #             inputs=speech_input,
+        #             label="🧪 Try Examples"
+        #         )
+        #     with gr.Column():
+        #         response_box = gr.Textbox(label="Model Response", lines=15)
+
+        # btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
+        # Add the link to another Gradio demo here
+        gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     # ---------------- ABOUT ----------------
     with gr.Tab("📄 About"):