not-lain committed
Commit 722ecec
Parent: de811b0

changed image API method
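The change replaces the old flow, which wrote the numpy image to a local file and passed the file name as the message content, with OpenAI's vision-style chat payload: the user message's "content" becomes a list holding a text part and an "image_url" part whose URL is a base64 data URL. Below is a minimal sketch of the resulting request; the diff only shows the "messages" and "max_tokens" fields, so the endpoint, headers, and model name here are illustrative assumptions:

import base64
import os

import requests

# Illustrative input: any JPEG file encodes to the expected data-URL payload.
with open("sample.jpg", "rb") as f:
    base64_image = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "gpt-4-vision-preview",  # assumption: the model name is not shown in the diff
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
            ],
        }
    ],
    "max_tokens": 300,
}

response = requests.post(
    "https://api.openai.com/v1/chat/completions",  # standard endpoint, assumed here
    headers={"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"},
    json=payload,
)
print(response.json())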

Files changed (1): app.py +106 -91
app.py CHANGED
@@ -1,4 +1,13 @@
 # Welcome to Team Tonic's MultiMed
+from lang_list import (
+    LANGUAGE_NAME_TO_CODE,
+    S2ST_TARGET_LANGUAGE_NAMES,
+    S2TT_TARGET_LANGUAGE_NAMES,
+    T2TT_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+    LANG_TO_SPKR_ID,
+)
+from gradio_client import Client
 import os
 import numpy as np
 import base64
@@ -11,7 +20,6 @@ import dotenv
 from transformers import AutoProcessor, SeamlessM4TModel
 import torchaudio
 dotenv.load_dotenv()
-from gradio_client import Client
 
 client = Client("https://facebook-seamless-m4t.hf.space/--replicas/frq8b/")
 
@@ -22,19 +30,11 @@ DEFAULT_TARGET_LANGUAGE = "English"
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-from lang_list import (
-    LANGUAGE_NAME_TO_CODE,
-    S2ST_TARGET_LANGUAGE_NAMES,
-    S2TT_TARGET_LANGUAGE_NAMES,
-    T2TT_TARGET_LANGUAGE_NAMES,
-    TEXT_SOURCE_LANGUAGE_NAMES,
-    LANG_TO_SPKR_ID,
-)
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-#processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
-#model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
+# processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
+# model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
 
 
 def process_speech(sound):
@@ -46,13 +46,13 @@ def process_speech(sound):
         audio_source="microphone",
         input_audio_mic=sound,
         input_audio_file=None,
-        input_text=None,
+        input_text=None,
         source_language=None,
         target_language="English")
     print(result)
     return result[1]
-
-
+
+
 def process_speech_using_model(sound):
     """
     processing sound using seamless_m4t
@@ -60,34 +60,33 @@ def process_speech_using_model(sound):
     # task_name = "T2TT"
     arr, org_sr = torchaudio.load(sound)
     target_language_code = LANGUAGE_NAME_TO_CODE[DEFAULT_TARGET_LANGUAGE]
-    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+    new_arr = torchaudio.functional.resample(
+        arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
     max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
     if new_arr.shape[1] > max_length:
         new_arr = new_arr[:, :max_length]
-        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-    input_data = processor(audios = new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
-    tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code, num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
+        gr.Warning(
+            f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+    input_data = processor(
+        audios=new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
+    tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code,
+                                num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
     text_out = processor.decode(tokens_ids, skip_special_tokens=True)
 
     return text_out
-
+
 
 def convert_image_to_required_format(image):
     """
     convert image from numpy to base64
     """
-    img = base64.b64encode(image).decode('utf-8')
-    image_name = np.random.randint(0, 10)
-    with open(f'{image_name}.png', 'wb') as f:
-        f.write(base64.b64decode(img))
-    return image_name
-
-
+    base64_image = base64.b64encode(image).decode('utf-8')
+    return base64_image
 
 
 def process_image_with_openai(image):
-    image_name = convert_image_to_required_format(image)
-    openai_api_key = os.getenv('OPENAI_API_KEY')
+    base64_image = convert_image_to_required_format(image)
+    openai_api_key = os.getenv('OPENAI_API_KEY')
     oai_org = os.getenv('OAI_ORG')
     if openai_api_key is None:
         raise Exception("OPENAI_API_KEY not found in environment variables")
@@ -97,7 +96,18 @@ def process_image_with_openai(image):
         "messages": [
             {
                 "role": "user",
-                "content": image_name
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url" : {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
             }
         ],
         "max_tokens": 300
@@ -186,65 +196,68 @@ def query_vectara(text):
         headers=api_key_header
     )
 
-    if response.status_code == 200:
-        query_data = response.json()
-        if query_data:
-            sources_info = []
-
-            # Extract the summary.
-            summary = query_data['responseSet'][0]['summary'][0]['text']
-
-            # Iterate over all response sets
-            for response_set in query_data.get('responseSet', []):
-                # Extract sources
-                for source in response_set.get('response', [])[:5]: # Limit to top 5 sources.
-                    source_metadata = source.get('metadata', [])
-                    source_info = {}
-
-                    for metadata in source_metadata:
-                        metadata_name = metadata.get('name', '')
-                        metadata_value = metadata.get('value', '')
-
-                        if metadata_name == 'title':
-                            source_info['title'] = metadata_value
-                        elif metadata_name == 'author':
-                            source_info['author'] = metadata_value
-                        elif metadata_name == 'pageNumber':
-                            source_info['page number'] = metadata_value
-
-                    if source_info:
-                        sources_info.append(source_info)
-
-            result = {"summary": summary, "sources": sources_info}
-            return f"{json.dumps(result, indent=2)}"
-        else:
-            return "No data found in the response."
-    else:
-        return f"Error: {response.status_code}"
-
-
-def convert_to_markdown(vectara_response_json):
-    vectara_response = json.loads(vectara_response_json)
-    if vectara_response:
-        summary = vectara_response.get('summary', 'No summary available')
-        sources_info = vectara_response.get('sources', [])
-
-        # Format the summary as Markdown
-        markdown_summary = f'**Summary:** {summary}\n\n'
-
-        # Format the sources as a numbered list
-        markdown_sources = ""
-        for i, source_info in enumerate(sources_info):
-            author = source_info.get('author', 'Unknown author')
-            title = source_info.get('title', 'Unknown title')
-            page_number = source_info.get('page number', 'Unknown page number')
-            markdown_sources += f"{i+1}. {title} by {author}, Page {page_number}\n"
-
-        return f"{markdown_summary}**Sources:**\n{markdown_sources}"
-    else:
-        return "No data found in the response."
+    if response.status_code == 200:
+        query_data = response.json()
+        if query_data:
+            sources_info = []
+
+            # Extract the summary.
+            summary = query_data['responseSet'][0]['summary'][0]['text']
+
+            # Iterate over all response sets
+            for response_set in query_data.get('responseSet', []):
+                # Extract sources
+                # Limit to top 5 sources.
+                for source in response_set.get('response', [])[:5]:
+                    source_metadata = source.get('metadata', [])
+                    source_info = {}
+
+                    for metadata in source_metadata:
+                        metadata_name = metadata.get('name', '')
+                        metadata_value = metadata.get('value', '')
+
+                        if metadata_name == 'title':
+                            source_info['title'] = metadata_value
+                        elif metadata_name == 'author':
+                            source_info['author'] = metadata_value
+                        elif metadata_name == 'pageNumber':
+                            source_info['page number'] = metadata_value
+
+                    if source_info:
+                        sources_info.append(source_info)
+
+            result = {"summary": summary, "sources": sources_info}
+            return f"{json.dumps(result, indent=2)}"
+        else:
+            return "No data found in the response."
+    else:
+        return f"Error: {response.status_code}"
+
+
+def convert_to_markdown(vectara_response_json):
+    vectara_response = json.loads(vectara_response_json)
+    if vectara_response:
+        summary = vectara_response.get('summary', 'No summary available')
+        sources_info = vectara_response.get('sources', [])
+
+        # Format the summary as Markdown
+        markdown_summary = f'**Summary:** {summary}\n\n'
+
+        # Format the sources as a numbered list
+        markdown_sources = ""
+        for i, source_info in enumerate(sources_info):
+            author = source_info.get('author', 'Unknown author')
+            title = source_info.get('title', 'Unknown title')
+            page_number = source_info.get('page number', 'Unknown page number')
+            markdown_sources += f"{i+1}. {title} by {author}, Page {page_number}\n"
+
+        return f"{markdown_summary}**Sources:**\n{markdown_sources}"
+    else:
+        return "No data found in the response."
 # Main function to handle the Gradio interface logic
-def process_and_query(text, image,audio):
+
+
+def process_and_query(text, image, audio):
     try:
         # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
         if image is not None:
@@ -260,7 +273,7 @@ def process_and_query(text, image,audio):
         # audio = base64.b64encode(audio).decode('utf-8')
         text = process_speech(audio)
         print(text)
-
+
     # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
     vectara_response_json = query_vectara(text)
     markdown_output = convert_to_markdown(vectara_response_json)
@@ -268,17 +281,19 @@ def process_and_query(text, image,audio):
     except Exception as e:
        return str(e)
 
+
 # Define the Gradio interface
 iface = gr.Interface(
     fn=process_and_query,
     inputs=[
         gr.Textbox(label="Input Text"),
         gr.Image(label="Upload Image"),
-        gr.Audio(label="talk", type="filepath", sources="microphone", visible=True),
+        gr.Audio(label="talk", type="filepath",
+                 sources="microphone", visible=True),
     ],
     outputs=[gr.Markdown(label="Output Text")],
     title="👋🏻Welcome to ⚕🗣️😷MultiMed - Access Chat ⚕🗣️😷",
-    description = '''
+    description='''
     ### How To Use ⚕🗣️😷MultiMed⚕:
     #### 🗣️📝Interact with ⚕🗣️😷MultiMed⚕ in any language using audio or text!
    #### 🗣️📝 This is an educational and accessible conversational tool to improve wellness and sanitation in support of public health.
@@ -298,4 +313,4 @@ iface = gr.Interface(
     ],
 )
 
-iface.launch()
+iface.launch()
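One caveat in the new helper: convert_image_to_required_format base64-encodes the raw numpy buffer that Gradio hands over, while the data:image/jpeg;base64,... URL declares an encoded JPEG, so the API may not be able to decode the bytes as an image. A sketch of the helper with real JPEG serialization first (assuming Pillow is available; this is an illustration, not what the commit ships):

import base64
import io

import numpy as np
from PIL import Image


def convert_image_to_required_format(image: np.ndarray) -> str:
    """Encode a numpy image as JPEG bytes, then base64 (sketch; assumes Pillow)."""
    buffer = io.BytesIO()
    # Serialize to an actual JPEG stream instead of base64-encoding raw array memory.
    Image.fromarray(image).save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")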