not-lain committed
Commit 722ecec
Parent: de811b0

changed image API method
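The change replaces the old flow, which wrote the numpy image to a local file and passed the file name as the message content, with OpenAI's vision-style chat payload: the user message's "content" becomes a list holding a text part and an "image_url" part whose URL is a base64 data URL. Below is a minimal sketch of the resulting request; the diff only shows the "messages" and "max_tokens" fields, so the endpoint, headers, and model name here are illustrative assumptions:

import base64
import os

import requests

# Illustrative input: any JPEG file encodes to the expected data-URL payload.
with open("sample.jpg", "rb") as f:
    base64_image = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "gpt-4-vision-preview",  # assumption: the model name is not shown in the diff
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
            ],
        }
    ],
    "max_tokens": 300,
}

response = requests.post(
    "https://api.openai.com/v1/chat/completions",  # standard endpoint, assumed here
    headers={"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"},
    json=payload,
)
print(response.json())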

Files changed (1): app.py +106 -91
app.py CHANGED
@@ -1,4 +1,13 @@
 # Welcome to Team Tonic's MultiMed
+from lang_list import (
+    LANGUAGE_NAME_TO_CODE,
+    S2ST_TARGET_LANGUAGE_NAMES,
+    S2TT_TARGET_LANGUAGE_NAMES,
+    T2TT_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+    LANG_TO_SPKR_ID,
+)
+from gradio_client import Client
 import os
 import numpy as np
 import base64
@@ -11,7 +20,6 @@ import dotenv
 from transformers import AutoProcessor, SeamlessM4TModel
 import torchaudio
 dotenv.load_dotenv()
-from gradio_client import Client
 
 client = Client("https://facebook-seamless-m4t.hf.space/--replicas/frq8b/")
 
@@ -22,19 +30,11 @@ DEFAULT_TARGET_LANGUAGE = "English"
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-from lang_list import (
-    LANGUAGE_NAME_TO_CODE,
-    S2ST_TARGET_LANGUAGE_NAMES,
-    S2TT_TARGET_LANGUAGE_NAMES,
-    T2TT_TARGET_LANGUAGE_NAMES,
-    TEXT_SOURCE_LANGUAGE_NAMES,
-    LANG_TO_SPKR_ID,
-)
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-#processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
-#model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
+# processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
+# model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
 
 
 def process_speech(sound):
@@ -46,13 +46,13 @@ def process_speech(sound):
         audio_source="microphone",
         input_audio_mic=sound,
         input_audio_file=None,
-        input_text=None,
+        input_text=None,
         source_language=None,
         target_language="English")
     print(result)
     return result[1]
-
-
+
+
 def process_speech_using_model(sound):
     """
     processing sound using seamless_m4t
@@ -60,34 +60,33 @@ def process_speech_using_model(sound):
     # task_name = "T2TT"
     arr, org_sr = torchaudio.load(sound)
     target_language_code = LANGUAGE_NAME_TO_CODE[DEFAULT_TARGET_LANGUAGE]
-    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+    new_arr = torchaudio.functional.resample(
+        arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
     max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
     if new_arr.shape[1] > max_length:
         new_arr = new_arr[:, :max_length]
-        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-    input_data = processor(audios = new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
-    tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code, num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
+        gr.Warning(
+            f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+    input_data = processor(
+        audios=new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
+    tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code,
+                                num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
     text_out = processor.decode(tokens_ids, skip_special_tokens=True)
 
     return text_out
-
+
 
 def convert_image_to_required_format(image):
     """
     convert image from numpy to base64
     """
-    img = base64.b64encode(image).decode('utf-8')
-    image_name = np.random.randint(0, 10)
-    with open(f'{image_name}.png', 'wb') as f:
-        f.write(base64.b64decode(img))
-    return image_name
-
-
+    base64_image = base64.b64encode(image).decode('utf-8')
+    return base64_image
 
 
 def process_image_with_openai(image):
-    image_name = convert_image_to_required_format(image)
-    openai_api_key = os.getenv('OPENAI_API_KEY')
+    base64_image = convert_image_to_required_format(image)
+    openai_api_key = os.getenv('OPENAI_API_KEY')
     oai_org = os.getenv('OAI_ORG')
     if openai_api_key is None:
         raise Exception("OPENAI_API_KEY not found in environment variables")
@@ -97,7 +96,18 @@ def process_image_with_openai(image):
         "messages": [
             {
                 "role": "user",
-                "content": image_name
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url" : {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
             }
         ],
         "max_tokens": 300
@@ -186,65 +196,68 @@ def query_vectara(text):
         headers=api_key_header
     )
 
-    if response.status_code == 200:
-        query_data = response.json()
-        if query_data:
-            sources_info = []
-
-            # Extract the summary.
-            summary = query_data['responseSet'][0]['summary'][0]['text']
-
-            # Iterate over all response sets
-            for response_set in query_data.get('responseSet', []):
-                # Extract sources
-                for source in response_set.get('response', [])[:5]: # Limit to top 5 sources.
-                    source_metadata = source.get('metadata', [])
-                    source_info = {}
-
-                    for metadata in source_metadata:
-                        metadata_name = metadata.get('name', '')
-                        metadata_value = metadata.get('value', '')
-
-                        if metadata_name == 'title':
-                            source_info['title'] = metadata_value
-                        elif metadata_name == 'author':
-                            source_info['author'] = metadata_value
-                        elif metadata_name == 'pageNumber':
-                            source_info['page number'] = metadata_value
-
-                    if source_info:
-                        sources_info.append(source_info)
-
-            result = {"summary": summary, "sources": sources_info}
-            return f"{json.dumps(result, indent=2)}"
-        else:
-            return "No data found in the response."
-    else:
-        return f"Error: {response.status_code}"
-
-
-def convert_to_markdown(vectara_response_json):
-    vectara_response = json.loads(vectara_response_json)
-    if vectara_response:
-        summary = vectara_response.get('summary', 'No summary available')
-        sources_info = vectara_response.get('sources', [])
-
-        # Format the summary as Markdown
-        markdown_summary = f'**Summary:** {summary}\n\n'
-
-        # Format the sources as a numbered list
-        markdown_sources = ""
-        for i, source_info in enumerate(sources_info):
-            author = source_info.get('author', 'Unknown author')
-            title = source_info.get('title', 'Unknown title')
-            page_number = source_info.get('page number', 'Unknown page number')
-            markdown_sources += f"{i+1}. {title} by {author}, Page {page_number}\n"
-
-        return f"{markdown_summary}**Sources:**\n{markdown_sources}"
-    else:
-        return "No data found in the response."
+    if response.status_code == 200:
+        query_data = response.json()
+        if query_data:
+            sources_info = []
+
+            # Extract the summary.
+            summary = query_data['responseSet'][0]['summary'][0]['text']
+
+            # Iterate over all response sets
+            for response_set in query_data.get('responseSet', []):
+                # Extract sources
+                # Limit to top 5 sources.
+                for source in response_set.get('response', [])[:5]:
+                    source_metadata = source.get('metadata', [])
+                    source_info = {}
+
+                    for metadata in source_metadata:
+                        metadata_name = metadata.get('name', '')
+                        metadata_value = metadata.get('value', '')
+
+                        if metadata_name == 'title':
+                            source_info['title'] = metadata_value
+                        elif metadata_name == 'author':
+                            source_info['author'] = metadata_value
+                        elif metadata_name == 'pageNumber':
+                            source_info['page number'] = metadata_value
+
+                    if source_info:
+                        sources_info.append(source_info)
+
+            result = {"summary": summary, "sources": sources_info}
+            return f"{json.dumps(result, indent=2)}"
+        else:
+            return "No data found in the response."
+    else:
+        return f"Error: {response.status_code}"
+
+
+def convert_to_markdown(vectara_response_json):
+    vectara_response = json.loads(vectara_response_json)
+    if vectara_response:
+        summary = vectara_response.get('summary', 'No summary available')
+        sources_info = vectara_response.get('sources', [])
+
+        # Format the summary as Markdown
+        markdown_summary = f'**Summary:** {summary}\n\n'
+
+        # Format the sources as a numbered list
+        markdown_sources = ""
+        for i, source_info in enumerate(sources_info):
+            author = source_info.get('author', 'Unknown author')
+            title = source_info.get('title', 'Unknown title')
+            page_number = source_info.get('page number', 'Unknown page number')
+            markdown_sources += f"{i+1}. {title} by {author}, Page {page_number}\n"
+
+        return f"{markdown_summary}**Sources:**\n{markdown_sources}"
+    else:
+        return "No data found in the response."
 # Main function to handle the Gradio interface logic
-def process_and_query(text, image,audio):
+
+
+def process_and_query(text, image, audio):
     try:
         # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
         if image is not None:
@@ -260,7 +273,7 @@ def process_and_query(text, image,audio):
         # audio = base64.b64encode(audio).decode('utf-8')
         text = process_speech(audio)
         print(text)
-
+
     # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
     vectara_response_json = query_vectara(text)
     markdown_output = convert_to_markdown(vectara_response_json)
@@ -268,17 +281,19 @@ def process_and_query(text, image,audio):
     except Exception as e:
        return str(e)
 
+
 # Define the Gradio interface
 iface = gr.Interface(
     fn=process_and_query,
     inputs=[
         gr.Textbox(label="Input Text"),
         gr.Image(label="Upload Image"),
-        gr.Audio(label="talk", type="filepath", sources="microphone", visible=True),
+        gr.Audio(label="talk", type="filepath",
+                 sources="microphone", visible=True),
     ],
     outputs=[gr.Markdown(label="Output Text")],
     title="👋🏻Welcome to ⚕🗣️😷MultiMed - Access Chat ⚕🗣️😷",
-    description = '''
+    description='''
     ### How To Use ⚕🗣️😷MultiMed⚕:
     #### 🗣️📝Interact with ⚕🗣️😷MultiMed⚕ in any language using audio or text!
    #### 🗣️📝 This is an educational and accessible conversational tool to improve wellness and sanitation in support of public health.
@@ -298,4 +313,4 @@ iface = gr.Interface(
     ],
 )
 
-iface.launch()
+iface.launch()
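One caveat in the new helper: convert_image_to_required_format base64-encodes the raw numpy buffer that Gradio hands over, while the data:image/jpeg;base64,... URL declares an encoded JPEG, so the API may not be able to decode the bytes as an image. A sketch of the helper with real JPEG serialization first (assuming Pillow is available; this is an illustration, not what the commit ships):

import base64
import io

import numpy as np
from PIL import Image


def convert_image_to_required_format(image: np.ndarray) -> str:
    """Encode a numpy image as JPEG bytes, then base64 (sketch; assumes Pillow)."""
    buffer = io.BytesIO()
    # Serialize to an actual JPEG stream instead of base64-encoding raw array memory.
    Image.fromarray(image).save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")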