Spaces:

whyu
/

MM-Vet_Evaluator

Running

App Files Files Community

whyu committed on Sep 21

Commit

8c7c7b7

1 Parent(s): 7c3c46a

user enters api key

Browse files

Files changed (1) hide show

app.py +100 -34

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio as gr
 import openai
 import json
 import os
 from tqdm import tqdm
 import pandas as pd
 import numpy as np
@@ -9,12 +11,13 @@ from collections import Counter
 import time
 from zipfile import ZipFile
-openai.api_key = os.environ.get("AZURE_OPENAI_KEY")
-openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
-openai.api_type = 'azure'
-openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
-deployment_id = os.environ.get("AZURE_OPENAI_DEP_ID")
-gpt_model = deployment_id
@@ -33,8 +36,30 @@ Can you explain this meme? | This meme is poking fun at the fact that the names
 """
-def grade(file_obj, progress=gr.Progress()):
     # load metadata
     # Download mm-vet.zip and `unzip mm-vet.zip` and change the path below
     mmvet_path = "mm-vet"
@@ -104,21 +129,16 @@ def grade(file_obj, progress=gr.Progress()):
     ###### change your model name ######
-    model = file_obj.name.split("/")[-1][:-5]
     # result_path = "results"
     num_run = 1 # we set 5 in the paper
     # model_results_file = os.path.join(result_path, f"{model}.json")
     model_results_file = file_obj.name
-    # grade results for each sample to svae
-    grade_file = f'{model}_{gpt_model}-grade-{num_run}runs.json'
-    # grade_file = os.path.join(result_path, grade_file)
-    # score results regarding capabilities/capability integration to save
-    cap_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-score-{num_run}runs.csv'
-    # cap_score_file = os.path.join(result_path, cap_score_file)
-    cap_int_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-int-score-{num_run}runs.csv'
-    # cap_int_score_file = os.path.join(result_path, cap_int_score_file)
@@ -170,8 +190,8 @@ def grade(file_obj, progress=gr.Progress()):
                 while not grade_sample_run_complete:
                     try:
                         response = openai.ChatCompletion.create(
-                            # model=gpt_model,
-                            engine=gpt_model,
                             max_tokens=3,
                             temperature=temperature,
                             messages=messages)
@@ -191,8 +211,8 @@ def grade(file_obj, progress=gr.Progress()):
                                 {"role": "user", "content": question},
                                 ]
                                 response = openai.ChatCompletion.create(
-                                    # model=gpt_model,
-                                    engine=gpt_model,
                                     max_tokens=3,
                                     temperature=temperature,
                                     messages=messages)
@@ -205,7 +225,8 @@ def grade(file_obj, progress=gr.Progress()):
                                     score = 0.0
                                     flag = False
                         grade_sample_run_complete = True
-                    except:
                         # gpt4 may have token rate limit
                         num_sleep += 1
                         if num_sleep > 12:
@@ -217,14 +238,18 @@ def grade(file_obj, progress=gr.Progress()):
                         time.sleep(5)
                 if len(sample_grade['model']) >= j + 1:
-                    sample_grade['model'][j] = response['model']
-                    sample_grade['content'][j] = content
                     sample_grade['score'][j] = score
                 else:
-                    sample_grade['model'].append(response['model'])
-                    sample_grade['content'].append(content)
                     sample_grade['score'].append(score)
                 grade_results[id] = sample_grade
                 with open(grade_file, 'w') as f:
@@ -280,12 +305,13 @@ def grade(file_obj, progress=gr.Progress()):
     df2.to_csv(cap_int_score_file)
     files = [cap_score_file, cap_int_score_file, grade_file]
-    zip_file = f"results.zip"
     with ZipFile(zip_file, "w") as zipObj:
-        for idx, file in enumerate(files):
-            zipObj.write(file, file)
-    for file in files:
-        os.remove(file)
     return zip_file
@@ -296,6 +322,39 @@ def grade(file_obj, progress=gr.Progress()):
 #     outputs="file")
 markdown = """
 <p align="center">
 <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258254299-29c00dae-8201-4128-b341-dad4663b544a.jpg" width="400"> <br>
@@ -304,7 +363,7 @@ markdown = """
 # [MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2308.02490)
-In this demo, we offer MM-Vet LLM-based (GPT-4) evaluator to grade open-ended outputs from your models.
 Please upload your json file of your model results containing `{v1_0: ..., v1_1: ..., }` like [this json file](https://raw.githubusercontent.com/yuweihao/MM-Vet/main/results/llava_llama2_13b_chat.json).
@@ -316,10 +375,17 @@ The grading results will be downloaded as a zip file.
 with gr.Blocks() as demo:
     gr.Markdown(markdown)
     with gr.Row():
         inp = gr.File(file_types=[".json"])
         out = gr.File(file_types=[".zip"])
-    inp.change(grade, inp, out)
 if __name__ == "__main__":
-    demo.queue().launch()

 import openai
 import json
 import os
+import uuid
+import tempfile
 from tqdm import tqdm
 import pandas as pd
 import numpy as np
 import time
 from zipfile import ZipFile
+# For Azure OpenAI
+# openai.api_key = os.environ.get("AZURE_OPENAI_KEY")
+# openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
+# openai.api_type = 'azure'
+# openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
+# deployment_id = os.environ.get("AZURE_OPENAI_DEP_ID")
+# gpt_model = deployment_id
 """
+import threading, shutil
def schedule_cleanup(paths, delay=600):
    """Delete temporary files or directories in the background after a delay.

    Args:
        paths: A single path, or a list/tuple of paths. Directories are removed
            recursively, regular files are unlinked, and paths that no longer
            exist are skipped.
        delay: Seconds to wait before deleting.

    Returns:
        The started daemon ``threading.Timer``. Callers may ignore it, or use
        it to ``join()`` (wait for the cleanup) or ``cancel()`` it.
    """
    # Snapshot the targets now so later mutation of the caller's list has no effect.
    targets = list(paths) if isinstance(paths, (list, tuple)) else [paths]

    def _clean():
        for target in targets:
            try:
                if os.path.isdir(target):
                    shutil.rmtree(target, ignore_errors=True)
                elif os.path.isfile(target):
                    os.remove(target)
            except Exception as exc:
                # Cleanup is best-effort: report the failure and keep going so
                # one bad path does not prevent the others from being removed.
                print(f"schedule_cleanup: could not remove {target!r}: {exc}")

    # Timer is a Thread that waits `delay` seconds before running `_clean`.
    # daemon=True so a pending cleanup never blocks interpreter shutdown.
    timer = threading.Timer(delay, _clean)
    timer.daemon = True
    timer.start()
    return timer
+def grade(file_obj, key, model, progress=gr.Progress()):
+    # set the API key
+    openai.api_key = key
+    gpt_model = model
+    workdir = tempfile.mkdtemp(prefix="mmvet_grade_")
+    uid = uuid.uuid4().hex
     # load metadata
     # Download mm-vet.zip and `unzip mm-vet.zip` and change the path below
     mmvet_path = "mm-vet"
     ###### change your model name ######
+    model_name = os.path.basename(file_obj.name)[:-5]
     # result_path = "results"
     num_run = 1 # we set 5 in the paper
     # model_results_file = os.path.join(result_path, f"{model}.json")
     model_results_file = file_obj.name
+    grade_file = os.path.join(workdir, f'{model_name}_{gpt_model}-grade-{num_run}runs_{uid}.json')
+    cap_score_file = os.path.join(workdir, f'{model_name}_{sub_set_name}{gpt_model}-cap-score-{num_run}runs_{uid}.csv')
+    cap_int_score_file = os.path.join(workdir, f'{model_name}_{sub_set_name}{gpt_model}-cap-int-score-{num_run}runs_{uid}.csv')
+    zip_file = os.path.join(workdir, f"results_{uid}.zip")
                 while not grade_sample_run_complete:
                     try:
                         response = openai.ChatCompletion.create(
+                            model=gpt_model,
+                            # engine=gpt_model, # For Azure OpenAI
                             max_tokens=3,
                             temperature=temperature,
                             messages=messages)
                                 {"role": "user", "content": question},
                                 ]
                                 response = openai.ChatCompletion.create(
+                                    model=gpt_model,
+                                    # engine=gpt_model, # For Azure OpenAI
                                     max_tokens=3,
                                     temperature=temperature,
                                     messages=messages)
                                     score = 0.0
                                     flag = False
                         grade_sample_run_complete = True
+                    except Exception as e:
+                        print(e)
                         # gpt4 may have token rate limit
                         num_sleep += 1
                         if num_sleep > 12:
                         time.sleep(5)
+                resp_model = str(response.get('model', gpt_model))
+                content_str = str(content)
                 if len(sample_grade['model']) >= j + 1:
+                    sample_grade['model'][j] = resp_model
+                    sample_grade['content'][j] = content_str
                     sample_grade['score'][j] = score
                 else:
+                    sample_grade['model'].append(resp_model)
+                    sample_grade['content'].append(content_str)
                     sample_grade['score'].append(score)
                 grade_results[id] = sample_grade
                 with open(grade_file, 'w') as f:
     df2.to_csv(cap_int_score_file)
     files = [cap_score_file, cap_int_score_file, grade_file]
     with ZipFile(zip_file, "w") as zipObj:
+        for fpath in files:
+            arcname = os.path.basename(fpath)
+            zipObj.write(fpath, arcname)
+    for fpath in files:
+        os.remove(fpath)
+    schedule_cleanup([zip_file, workdir], delay=600)
     return zip_file
 #     outputs="file")
+# --- Validate key and model before running grading ---
def validate_key_and_model(key: str, model: str):
    """Check that an OpenAI API key is usable and can access ``model``.

    Args:
        key: The API key as entered by the user; surrounding whitespace is
            ignored. ``None`` or a blank value is rejected without any request.
        model: The model identifier to look up. ``None`` or a blank value is
            rejected without any request.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is ``True`` with message ``"OK"`` when
        the lookup succeeds; otherwise ``ok`` is ``False`` and ``message`` is a
        user-facing explanation of the failure.

    Side effects:
        Sets the module-level ``openai.api_key`` to the stripped key before
        issuing the lookup.
    """
    # Reject missing input up front: avoids an AttributeError on None and a
    # pointless network round trip for an empty key or model.
    key = (key or "").strip()
    if not key:
        return False, "Please enter your OpenAI API key."
    if not model or not model.strip():
        return False, "Please select a model."

    openai.api_key = key
    try:
        # One Model.retrieve call exercises both authentication and model access.
        openai.Model.retrieve(model)
        return True, "OK"
    except openai.error.AuthenticationError:
        return False, "Invalid OpenAI API key. Please check and try again."
    except openai.error.InvalidRequestError as e:
        msg = str(e)
        if "does not exist" in msg or "You do not have access" in msg or "model_not_found" in msg:
            return False, f"API key is valid, but you do not have access to model `{model}`."
        return False, f"Invalid request: {msg}"
    except openai.error.RateLimitError:
        return False, "Rate limit or quota exceeded. Please try again later."
    except openai.error.APIConnectionError:
        return False, "Failed to connect to OpenAI service. Please check your network."
    except openai.error.OpenAIError as e:
        return False, f"OpenAI returned an error: {e}"
    except Exception as e:
        # Last-resort boundary: surface anything unexpected as a message
        # instead of crashing the UI handler.
        return False, f"Unexpected error: {e}"
+# --- Wrapper for the grading function ---
def run_grade(file_obj, key, model, progress=gr.Progress(track_tqdm=True)):
    """Validate the inputs, then grade the uploaded results file.

    Args:
        file_obj: The uploaded results file from the Gradio file component, or
            ``None`` when nothing has been uploaded.
        key: The OpenAI API key entered by the user.
        model: The model identifier selected by the user.
        progress: Gradio progress tracker, forwarded to ``grade``.

    Returns:
        Whatever ``grade`` returns for the given file, key and model.

    Raises:
        gr.Error: If no file was uploaded, or if the key/model validation
            fails; the message is shown to the user in the Gradio UI.
    """
    if file_obj is None:
        # Fail before any API call; grade() would otherwise crash on file_obj.name.
        raise gr.Error("Please upload your model results json file first.")

    # Normalize once so validation and grading use the exact same key; a key
    # with stray whitespace would otherwise pass validation and fail in grade().
    key = (key or "").strip()

    ok, msg = validate_key_and_model(key, model)
    if not ok:
        # This will be visible to the user in the Gradio UI
        raise gr.Error(msg)
    return grade(file_obj, key, model, progress=progress)
 markdown = """
 <p align="center">
 <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258254299-29c00dae-8201-4128-b341-dad4663b544a.jpg" width="400"> <br>
 # [MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2308.02490)
+This demo uses an LLM-based (GPT-4) evaluator to grade open-ended outputs from your models.
 Please upload your json file of your model results containing `{v1_0: ..., v1_1: ..., }` like [this json file](https://raw.githubusercontent.com/yuweihao/MM-Vet/main/results/llava_llama2_13b_chat.json).
 # Assemble the UI: API key and model selection, the results upload / zip
 # download pair, and a button that starts grading explicitly through run_grade.
 with gr.Blocks() as demo:
     gr.Markdown(markdown)
     # type="password" masks the key in the browser.
     key = gr.Textbox(label="Enter your OpenAI API Key (this space will not save your API Key)", type="password")
     model = gr.Dropdown(
         choices=["gpt-4-0613", "gpt-4-turbo"],
         value="gpt-4-0613",
         label="Select GPT-4 model version (gpt-4-0613 is the default and gpt-4-turbo is cheaper)"
     )
     with gr.Row():
         inp = gr.File(file_types=[".json"])
         out = gr.File(file_types=[".zip"])
     btn = gr.Button("Start grading", variant="primary")
     btn.click(fn=run_grade, inputs=[inp, key, model], outputs=out)
 if __name__ == "__main__":
     # Enable request queuing with max_size=8 before launching the app.
     demo.queue(max_size=8).launch()