Update app.py
Browse files
app.py
CHANGED
|
@@ -35,8 +35,8 @@ os.makedirs("scored", exist_ok=True)
|
|
| 35 |
|
| 36 |
# # Display the results
|
| 37 |
eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
|
| 38 |
-
def get_dataframe_from_results(eval_results, split):
|
| 39 |
-
local_df = eval_results[split]
|
| 40 |
local_df = local_df.remove_columns(["Mail"])
|
| 41 |
df = pd.DataFrame(local_df)
|
| 42 |
df = df.sort_values(by=["Final Pass Rate"], ascending=False)
|
|
@@ -45,9 +45,10 @@ def get_dataframe_from_results(eval_results, split):
|
|
| 45 |
return df
|
| 46 |
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
# def restart_space():
|
|
@@ -67,6 +68,7 @@ def add_new_eval(
|
|
| 67 |
val_or_test: str,
|
| 68 |
eval_mode: str,
|
| 69 |
model: str,
|
|
|
|
| 70 |
planning_strategy: str,
|
| 71 |
organization: str,
|
| 72 |
mail: str,
|
|
@@ -86,7 +88,7 @@ def add_new_eval(
|
|
| 86 |
api.upload_file(
|
| 87 |
repo_id=RESULTS_DATASET,
|
| 88 |
path_or_fileobj=path_to_file.name,
|
| 89 |
-
path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
|
| 90 |
repo_type="dataset",
|
| 91 |
token=TOKEN
|
| 92 |
)
|
|
@@ -94,14 +96,14 @@ def add_new_eval(
|
|
| 94 |
# Compute score
|
| 95 |
file_path = path_to_file.name
|
| 96 |
result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
|
| 97 |
-
with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl", "w") as scored_file:
|
| 98 |
scored_file.write(json.dumps(result) + "\n")
|
| 99 |
|
| 100 |
# Save scored file
|
| 101 |
api.upload_file(
|
| 102 |
repo_id=RESULTS_DATASET,
|
| 103 |
-
path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl",
|
| 104 |
-
path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
|
| 105 |
repo_type="dataset",
|
| 106 |
token=TOKEN
|
| 107 |
)
|
|
@@ -109,6 +111,7 @@ def add_new_eval(
|
|
| 109 |
# Actual submission
|
| 110 |
eval_entry = {
|
| 111 |
"Model": model,
|
|
|
|
| 112 |
"Planning Strategy": planning_strategy,
|
| 113 |
"Organization": organization,
|
| 114 |
"Mail": mail,
|
|
@@ -119,21 +122,23 @@ def add_new_eval(
|
|
| 119 |
"Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
|
| 120 |
"Final Pass Rate":result['Final Pass Rate']
|
| 121 |
}
|
| 122 |
-
|
| 123 |
-
eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
|
| 124 |
|
| 125 |
print(eval_results)
|
| 126 |
|
| 127 |
eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
|
| 128 |
|
| 129 |
-
return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
|
| 130 |
|
| 131 |
|
| 132 |
def refresh():
|
| 133 |
eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# def upload_file(files):
|
| 139 |
# file_paths = [file.name for file in files]
|
|
@@ -145,13 +150,22 @@ with demo:
|
|
| 145 |
gr.HTML(TITLE)
|
| 146 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 147 |
|
| 148 |
-
with gr.Tab("Results: Validation"):
|
| 149 |
-
|
| 150 |
-
value=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
)
|
| 152 |
-
with gr.Tab("Results: Test"):
|
| 153 |
-
|
| 154 |
-
value=
|
| 155 |
)
|
| 156 |
|
| 157 |
refresh_button = gr.Button("Refresh")
|
|
@@ -159,8 +173,10 @@ with demo:
|
|
| 159 |
refresh,
|
| 160 |
inputs=[],
|
| 161 |
outputs=[
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
| 164 |
],
|
| 165 |
)
|
| 166 |
with gr.Accordion("Submit a new file for evaluation"):
|
|
@@ -169,6 +185,7 @@ with demo:
|
|
| 169 |
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
| 170 |
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
| 171 |
model = gr.Textbox(label="Foundation Model")
|
|
|
|
| 172 |
planning_strategy = gr.Textbox(label="Planning Strategy")
|
| 173 |
with gr.Column():
|
| 174 |
organization = gr.Textbox(label="Organization")
|
|
@@ -184,6 +201,7 @@ with demo:
|
|
| 184 |
level_of_test,
|
| 185 |
eval_mode,
|
| 186 |
model,
|
|
|
|
| 187 |
planning_strategy,
|
| 188 |
organization,
|
| 189 |
mail,
|
|
@@ -192,8 +210,6 @@ with demo:
|
|
| 192 |
submission_result,
|
| 193 |
)
|
| 194 |
|
| 195 |
-
# scheduler = BackgroundScheduler()
|
| 196 |
-
# scheduler.add_job(restart_space, "interval", seconds=3600)
|
| 197 |
-
# scheduler.start()
|
| 198 |
demo.launch(debug=True)
|
| 199 |
|
|
|
|
|
|
| 35 |
|
| 36 |
# # Display the results
|
| 37 |
eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
|
| 38 |
+
def get_dataframe_from_results(eval_results, split, mode):
|
| 39 |
+
local_df = eval_results[f'{split}_{mode}']
|
| 40 |
local_df = local_df.remove_columns(["Mail"])
|
| 41 |
df = pd.DataFrame(local_df)
|
| 42 |
df = df.sort_values(by=["Final Pass Rate"], ascending=False)
|
|
|
|
| 45 |
return df
|
| 46 |
|
| 47 |
|
| 48 |
+
# Build the four leaderboard tables once at import time, one per
# (split, eval mode) pair, from the already-loaded `eval_results`.
# The generator is unpacked in the same order as the names on the left.
(
    eval_dataframe_val_twostage,
    eval_dataframe_val_soleplanning,
    eval_dataframe_test_twostage,
    eval_dataframe_test_soleplanning,
) = (
    get_dataframe_from_results(eval_results=eval_results, split=split_name, mode=mode_name)
    for split_name, mode_name in (
        ("validation", 'twostage'),
        ("validation", 'soleplanning'),
        ("test", 'twostage'),
        ("test", 'soleplanning'),
    )
)
|
| 52 |
|
| 53 |
|
| 54 |
# def restart_space():
|
|
|
|
| 68 |
val_or_test: str,
|
| 69 |
eval_mode: str,
|
| 70 |
model: str,
|
| 71 |
+
tooluse_strategy: str,
|
| 72 |
planning_strategy: str,
|
| 73 |
organization: str,
|
| 74 |
mail: str,
|
|
|
|
| 88 |
api.upload_file(
|
| 89 |
repo_id=RESULTS_DATASET,
|
| 90 |
path_or_fileobj=path_to_file.name,
|
| 91 |
+
path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
|
| 92 |
repo_type="dataset",
|
| 93 |
token=TOKEN
|
| 94 |
)
|
|
|
|
| 96 |
# Compute score
|
| 97 |
file_path = path_to_file.name
|
| 98 |
result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
|
| 99 |
+
with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl", "w") as scored_file:
|
| 100 |
scored_file.write(json.dumps(result) + "\n")
|
| 101 |
|
| 102 |
# Save scored file
|
| 103 |
api.upload_file(
|
| 104 |
repo_id=RESULTS_DATASET,
|
| 105 |
+
path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl",
|
| 106 |
+
path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
|
| 107 |
repo_type="dataset",
|
| 108 |
token=TOKEN
|
| 109 |
)
|
|
|
|
| 111 |
# Actual submission
|
| 112 |
eval_entry = {
|
| 113 |
"Model": model,
|
| 114 |
+
"Tool-use Strategy": tooluse_strategy,
|
| 115 |
"Planning Strategy": planning_strategy,
|
| 116 |
"Organization": organization,
|
| 117 |
"Mail": mail,
|
|
|
|
| 122 |
"Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
|
| 123 |
"Final Pass Rate":result['Final Pass Rate']
|
| 124 |
}
|
| 125 |
+
eval_mode = eval_mode.replace('-','')
|
| 126 |
+
eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
|
| 127 |
|
| 128 |
print(eval_results)
|
| 129 |
|
| 130 |
eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
|
| 131 |
|
| 132 |
+
return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
|
| 133 |
|
| 134 |
|
| 135 |
def refresh():
    """Re-download the scores dataset and rebuild every leaderboard table.

    Returns:
        A 4-tuple of dataframes in the order (validation two-stage,
        validation sole-planning, test two-stage, test sole-planning),
        which is the order of the output components this callback is
        wired to.
    """
    # Force a fresh download so newly pushed submissions are picked up.
    fresh_results = load_dataset(
        RESULTS_DATASET,
        'scores',
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
    )

    # (split, mode) pairs, listed in the order the tables must be returned.
    table_keys = (
        ("validation", 'twostage'),
        ("validation", 'soleplanning'),
        ("test", 'twostage'),
        ("test", 'soleplanning'),
    )
    return tuple(
        get_dataframe_from_results(eval_results=fresh_results, split=split_name, mode=mode_name)
        for split_name, mode_name in table_keys
    )
|
| 142 |
|
| 143 |
# def upload_file(files):
|
| 144 |
# file_paths = [file.name for file in files]
|
|
|
|
| 150 |
gr.HTML(TITLE)
|
| 151 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 152 |
|
| 153 |
+
with gr.Tab("Results: Validation | Two-Stage "):
|
| 154 |
+
leaderboard_table_val_twostage = gr.components.Dataframe(
|
| 155 |
+
value=eval_dataframe_val_twostage, interactive=False,
|
| 156 |
+
)
|
| 157 |
+
with gr.Tab("Results: Validation | Sole-Planning"):
|
| 158 |
+
leaderboard_table_val_soleplanning = gr.components.Dataframe(
|
| 159 |
+
value=eval_dataframe_val_soleplanning, interactive=False,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
with gr.Tab("Results: Test | Two-Stage "):
|
| 163 |
+
leaderboard_table_test_twostage = gr.components.Dataframe(
|
| 164 |
+
value=eval_dataframe_test_twostage, interactive=False,
|
| 165 |
)
|
| 166 |
+
with gr.Tab("Results: Test | Sole-Planning"):
|
| 167 |
+
leaderboard_table_test_soleplanning = gr.components.Dataframe(
|
| 168 |
+
value=eval_dataframe_test_soleplanning, interactive=False,
|
| 169 |
)
|
| 170 |
|
| 171 |
refresh_button = gr.Button("Refresh")
|
|
|
|
| 173 |
refresh,
|
| 174 |
inputs=[],
|
| 175 |
outputs=[
|
| 176 |
+
leaderboard_table_val_twostage,
|
| 177 |
+
leaderboard_table_val_soleplanning,
|
| 178 |
+
leaderboard_table_test_twostage,
|
| 179 |
+
leaderboard_table_test_soleplanning,
|
| 180 |
],
|
| 181 |
)
|
| 182 |
with gr.Accordion("Submit a new file for evaluation"):
|
|
|
|
| 185 |
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
| 186 |
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
| 187 |
model = gr.Textbox(label="Foundation Model")
|
| 188 |
+
tooluse_strategy = gr.Textbox(label="Tool-use Strategy")
|
| 189 |
planning_strategy = gr.Textbox(label="Planning Strategy")
|
| 190 |
with gr.Column():
|
| 191 |
organization = gr.Textbox(label="Organization")
|
|
|
|
| 201 |
level_of_test,
|
| 202 |
eval_mode,
|
| 203 |
model,
|
| 204 |
+
tooluse_strategy,
|
| 205 |
planning_strategy,
|
| 206 |
organization,
|
| 207 |
mail,
|
|
|
|
| 210 |
submission_result,
|
| 211 |
)
|
| 212 |
|
|
|
|
|
|
|
|
|
|
| 213 |
demo.launch(debug=True)
|
| 214 |
|
| 215 |
+
|