SreyanG-NVIDIA committed on
Commit 2da3b49 (verified) · 1 Parent(s): c8e5a16

Update app.py

This commit retires the in-app multi-turn chat and speech-prompt features: the chat model setup, its two inference functions, and the corresponding UI tabs are commented out, and each tab now links to the dedicated audio-flamingo-3-chat Space instead.

Files changed (1):
  1. app.py (+71, -68)
app.py CHANGED
```diff
@@ -22,12 +22,12 @@ model_think = PeftModel.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# ---------------------------------
-# MULTI-TURN MODEL SETUP
-# ---------------------------------
-MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
-model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
-generation_config_multi = model_multi.default_generation_config
+# # ---------------------------------
+# # MULTI-TURN MODEL SETUP
+# # ---------------------------------
+# MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+# model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
+# generation_config_multi = model_multi.default_generation_config
 
 
 # ---------------------------------
```
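For anyone who still wants the multi-turn chat model available locally, the setup this commit comments out can be reproduced directly from the removed lines. A minimal sketch, assuming NVIDIA's `llava` fork from the Audio Flamingo 3 codebase and `huggingface_hub` are installed and GPU 0 is available:

```python
# Sketch of the multi-turn model setup that this commit disables.
# Assumes NVIDIA's `llava` fork (from the Audio Flamingo 3 code) and
# huggingface_hub are installed, and that GPU 0 is free.
from huggingface_hub import snapshot_download
import llava

MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
generation_config_multi = model_multi.default_generation_config
```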
```diff
@@ -42,14 +42,14 @@ def single_turn_infer(audio_file, prompt_text):
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-def speech_prompt_infer(audio_prompt_file):
-    try:
-        sound = llava.Sound(audio_prompt_file)
-        full_prompt = "<sound>"
-        response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
-        return response
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
+# def speech_prompt_infer(audio_prompt_file):
+#     try:
+#         sound = llava.Sound(audio_prompt_file)
+#         full_prompt = "<sound>"
+#         response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
+#         return response
+#     except Exception as e:
+#         return f"❌ Error: {str(e)}"
 
 def think_infer(audio_file, prompt_text):
     try:
```
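Worth noting: the removed `speech_prompt_infer` calls `model_multi` but passes `generation_config_single`, which reads like a leftover from the single-turn path. A hedged sketch of the same function with the config aligned to the model it calls, reusing the names from the setup sketch above:

```python
# Sketch of the retired speech-prompt path. The original passed
# generation_config_single; using the chat model's own config is the
# consistent choice here (an assumption, not from the commit).
def speech_prompt_infer(audio_prompt_file):
    try:
        sound = llava.Sound(audio_prompt_file)  # wrap the uploaded/recorded file
        full_prompt = "<sound>"                 # audio-only prompt, no text
        return model_multi.generate_content(
            [sound, full_prompt], generation_config=generation_config_multi
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"
```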
```diff
@@ -63,24 +63,24 @@ def think_infer(audio_file, prompt_text):
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-def multi_turn_chat(user_input, audio_file, history, current_audio):
-    try:
-        if audio_file is not None:
-            current_audio = audio_file  # Update state if a new file is uploaded
+# def multi_turn_chat(user_input, audio_file, history, current_audio):
+#     try:
+#         if audio_file is not None:
+#             current_audio = audio_file  # Update state if a new file is uploaded
 
-        if current_audio is None:
-            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
+#         if current_audio is None:
+#             return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-        sound = llava.Sound(current_audio)
-        prompt = f"<sound>\n{user_input}"
+#         sound = llava.Sound(current_audio)
+#         prompt = f"<sound>\n{user_input}"
 
-        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
-        history.append((user_input, response))
-        return history, history, current_audio
-    except Exception as e:
-        history.append((user_input, f"❌ Error: {str(e)}"))
-        return history, history, current_audio
+#         history.append((user_input, response))
+#         return history, history, current_audio
+#     except Exception as e:
+#         history.append((user_input, f"❌ Error: {str(e)}"))
+#         return history, history, current_audio
 # ---------------------------------
 # INTERFACE
 # ---------------------------------
```
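The retired `multi_turn_chat` returns `history` twice because the `gr.Chatbot` display and the `gr.State` that round-trips the conversation both consume the same list of `(user, assistant)` tuples, with the audio path carried as a third state value. A small illustration of that contract, with hypothetical messages:

```python
# Hypothetical turn showing the (user, assistant) tuple format that
# gr.Chatbot renders and gr.State feeds back into the next call.
history = []
history.append(("What instrument opens the track?", "A solo acoustic guitar."))

# multi_turn_chat returned (history, history, current_audio):
#   1st value -> chatbot (display)
#   2nd value -> history_state (becomes `history` on the next turn)
#   3rd value -> current_audio_state (audio path persists across turns)
```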
```diff
@@ -193,48 +193,51 @@ with gr.Blocks(css="""
     btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
     # ---------------- MULTI-TURN CHAT ----------------
     with gr.Tab("💬 Multi-Turn Chat"):
-        chatbot = gr.Chatbot(label="Audio Chatbot")
-        audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
-        user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
-        btn_multi = gr.Button("Send")
-        history_state = gr.State([])  # Chat history
-        current_audio_state = gr.State(None)  # Most recent audio file path
-
-        btn_multi.click(
-            fn=multi_turn_chat,
-            inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
-            outputs=[chatbot, history_state, current_audio_state]
-        )
-        gr.Examples(
-            examples=[
-                ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
-                ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
-            ],
-            inputs=[audio_input_multi, user_input_multi],
-            label="🧪 Try Examples"
-        )
+        # chatbot = gr.Chatbot(label="Audio Chatbot")
+        # audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+        # user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+        # btn_multi = gr.Button("Send")
+        # history_state = gr.State([])  # Chat history
+        # current_audio_state = gr.State(None)  # Most recent audio file path
+
+        # btn_multi.click(
+        #     fn=multi_turn_chat,
+        #     inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+        #     outputs=[chatbot, history_state, current_audio_state]
+        # )
+        # gr.Examples(
+        #     examples=[
+        #         ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+        #         ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+        #     ],
+        #     inputs=[audio_input_multi, user_input_multi],
+        #     label="🧪 Try Examples"
+        # )
+        # Add the link to another Gradio demo here
+        gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     with gr.Tab("🗣️ Speech Prompt"):
-        gr.Markdown("Use your **voice** to talk to the model.")
-
-        with gr.Row():
-            with gr.Column():
-                speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
-                btn_speech = gr.Button("Submit")
-                gr.Examples(
-                    examples=[
-                        ["static/voice/voice_0.mp3"],
-                        ["static/voice/voice_1.mp3"],
-                        ["static/voice/voice_2.mp3"],
-                    ],
-                    inputs=speech_input,
-                    label="🧪 Try Examples"
-                )
-            with gr.Column():
-                response_box = gr.Textbox(label="Model Response", lines=15)
-
-        btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
-
+        # gr.Markdown("Use your **voice** to talk to the model.")
+
+        # with gr.Row():
+        #     with gr.Column():
+        #         speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+        #         btn_speech = gr.Button("Submit")
+        #         gr.Examples(
+        #             examples=[
+        #                 ["static/voice/voice_0.mp3"],
+        #                 ["static/voice/voice_1.mp3"],
+        #                 ["static/voice/voice_2.mp3"],
+        #             ],
+        #             inputs=speech_input,
+        #             label="🧪 Try Examples"
+        #         )
+        #     with gr.Column():
+        #         response_box = gr.Textbox(label="Model Response", lines=15)
+
+        # btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
+        # Add the link to another Gradio demo here
+        gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     # ---------------- ABOUT ----------------
     with gr.Tab("📄 About"):
```
 