gmb/update-leaderboard #4 by gmancino-ball - opened
app.py (CHANGED)
@@ -6,37 +6,11 @@ import subprocess
 import os

 ## Save results path
+COMP_CACHE = Path("competition_cache/safe-challenge")
 results_path = Path("competition_cache/cached_results")
 TASKS = ["video-challenge-pilot-config", "video-challenge-task-1-config"]
 valid_splits = ["public", "private"]

-## Check for files initially
-if False:
-    print(f"Checking for task data")
-    print(os.listdir())
-    for task in TASKS:
-        if not os.path.exists(results_path):
-            print(f"{task} not found, running script")
-            try:
-                process = subprocess.Popen(
-                    ["python3", "utils.py"],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    text=True,
-                )
-                try:
-                    stdout, stderr = process.communicate(timeout=300)
-                except subprocess.TimeoutExpired:
-                    process.kill()
-                    stdout, stderr = process.communicate()
-                    print(f"{task} script timed out.")
-
-                print("OUTPUT:", stdout)
-                print("ERROR:", stderr)
-            except Exception as e:
-                print(f"Failed to run subprocess for {task}: {e}")
-    print("Task checking complete")
-

 #####################################################################
 ## Data loading ##
@@ -174,18 +148,15 @@ with st.sidebar:

     if password == hf_token:
         if st.button("Pull New Results"):
-            with st.spinner("
+            with st.spinner("Pulling new results", show_time=True):
                 try:
                     process = subprocess.Popen(
                         ["python3", "utils.py"],
-                        # stdout=subprocess.PIPE,
-                        # stderr=subprocess.PIPE,
                         text=True, # Decode stdout/stderr as text
                     )
                     st.info(f"Background task started with PID: {process.pid}")
                     process.wait()
                     process.kill()
-                    # stdout, stderr = process.communicate()
                     if process.returncode != 0:
                         st.error("The process did not finish successfully.")
                     else:
@@ -219,6 +190,12 @@ with st.sidebar:


 def show_leaderboard(results, task):
+    source_split_map = {}
+    if split == "private":
+        _sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
+        pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
+        source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
+
     cols = [
         "generated_accuracy",
         "real_accuracy",
@@ -240,7 +217,7 @@ def show_leaderboard(results, task):
             # width="small",
         ),
         "generated_accuracy": st.column_config.NumberColumn(
-            "
+            "🤖 True Postive Rate",
             format="compact",
             min_value=0,
             pinned=True,
@@ -321,22 +298,42 @@ def show_leaderboard(results, task):
         for c in results[f"{split}_score"].columns
         if "generated_" in c and "accuracy" not in c and "conditional" not in c
     ]
+    col_names = [
+        (
+            f"🟢 {c.replace('generated_', '')}"
+            if source_split_map.get(c.replace("generated_", ""), "public") == "public"
+            else f"🔒 {c.replace('generated_', '')}"
+        )
+        for c in results[f"{split}_score"].columns
+        if "generated_" in c and "accuracy" not in c and "conditional" not in c
+    ]
     gen_tmp = results[f"{split}_score"].loc[:, cols].copy()
+    gen_tmp.columns = col_names
     cols = [
         c for c in results[f"{split}_score"].columns if "real_" in c and "accuracy" not in c and "conditional" not in c
     ]
+    col_names = [
+        (
+            f"🟢 {c.replace('real_', '')}"
+            if source_split_map.get(c.replace("real_", ""), "public") == "public"
+            else f"🔒 {c.replace('real_', '')}"
+        )
+        for c in results[f"{split}_score"].columns
+        if "real_" in c and "accuracy" not in c and "conditional" not in c
+    ]
     real_tmp = results[f"{split}_score"].loc[:, cols].copy()
+    real_tmp.columns = col_names

     ## Check cases
     if accuracy_types[granularity] == 0:
-        "####
+        "#### 🤖 True Positive Rate | Generated Source"
         st.dataframe(gen_tmp, column_config=column_config)

         "#### 🧑‍🎤 True Negative Rate | Real Source"
         st.dataframe(real_tmp, column_config=column_config)

     elif accuracy_types[granularity] == 1:
-        "####
+        "#### 🤖 Balanced Accuracy | Generated Source"
         tnr = results[f"{split}_score"].loc[:, ["real_accuracy"]]
         gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
         st.dataframe(gen_tmp, column_config=column_config)
@@ -347,11 +344,31 @@ def show_leaderboard(results, task):
         st.dataframe(real_tmp, column_config=column_config)
     else:
         cols = [c for c in results[f"{split}_score"].columns if "generated_conditional_auc" in c]
+        col_names = [
+            (
+                f"🟢 {c.replace('generated_conditional_auc_', '')}"
+                if source_split_map.get(c.replace("generated_conditional_auc_", ""), "public") == "public"
+                else f"🔒 {c.replace('generated_conditional_auc_', '')}"
+            )
+            for c in results[f"{split}_score"].columns
+            if "generated_conditional_auc_" in c
+        ]
         gen_tmp = results[f"{split}_score"].loc[:, cols].dropna(axis=1).copy()
+        gen_tmp.columns = col_names
         cols = [c for c in results[f"{split}_score"].columns if "real_conditional_auc" in c]
+        col_names = [
+            (
+                f"🟢 {c.replace('real_conditional_auc_', '')}"
+                if source_split_map.get(c.replace("real_conditional_auc_", ""), "public") == "public"
+                else f"🔒 {c.replace('real_conditional_auc_', '')}"
+            )
+            for c in results[f"{split}_score"].columns
+            if "real_conditional_auc" in c
+        ]
         real_tmp = results[f"{split}_score"].loc[:, cols].dropna(axis=1).copy()
+        real_tmp.columns = col_names

-        "####
+        "#### 🤖 Conditional AUC | Generated Source"
         st.dataframe(gen_tmp, column_config=column_config)

         "#### 🧑‍🎤 Conditional AUC | Real Source"
@@ -385,10 +402,6 @@ def make_roc(results):


 def make_acc(results):
-    # results["FA"] = 1. - results["pristine_accuracy"]
-    # results = results[results["total_time"] >= 0]
-    # results["total_time"] = results["total_time"]
-
     results = results.loc[results["total_time"] >= 0]

     chart = (
@@ -427,11 +440,8 @@ def get_heatmaps(temp):

 def make_plots_for_task(task, split, best_only):
     results = load_results(task, best_only=best_only)
-    # results1[f"{split}_score"]
     temp = results[f"{split}_score"].reset_index()

-    # st.write(temp)
-
     t1, t2 = st.tabs(["Tables", "Charts"])
     with t1:
         show_leaderboard(results, task)
@@ -442,7 +452,6 @@ def make_plots_for_task(task, split, best_only):
     acc_vs_time = make_acc(temp)

     if split == "private" and hf_token is not None:
-        # with t2:
         full_curves = st.toggle("Full curve", value=True, key=f"all curves {task}")

         if full_curves:
@@ -450,19 +459,12 @@ def make_plots_for_task(task, split, best_only):

             st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
     else:
-        # with t2:
         st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)

-    # with t3:
-    # get_heatmaps(temp)
-

 updated = get_updated_time()
 st.markdown(updated)
-
-# st.markdown("[SAFE: Synthetic Audio Forensics Evaluation Challenge](https://stresearch.github.io/SAFE/)")
-best_only = True # st.toggle("Only Best per Team", value=True)
-# show_chart = st.toggle("Show Table", value=True)
+best_only = True


 tp, t1, volume_tab, all_submission_tab = st.tabs(
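In the refresh flow above, the "Pull New Results" button now launches utils.py under st.spinner("Pulling new results", show_time=True) and inspects the return code once the subprocess exits. A minimal, self-contained sketch of that pattern follows, reusing the 300-second timeout from the removed startup check; it is an illustration, not the app's exact code.

# Illustrative sketch of the spinner + subprocess pattern; timeout and messages are assumptions.
import subprocess

import streamlit as st

if st.button("Pull New Results"):
    with st.spinner("Pulling new results", show_time=True):
        process = subprocess.Popen(["python3", "utils.py"], text=True)
        try:
            process.wait(timeout=300)  # block until the refresh script finishes
        except subprocess.TimeoutExpired:
            process.kill()  # stop a hung refresh rather than waiting forever
            st.error("Refresh timed out.")
        else:
            if process.returncode != 0:
                st.error("The process did not finish successfully.")
            else:
                st.success("Results refreshed.")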
|
|
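The show_leaderboard changes above build source_split_map from solution.csv and prefix each per-source column with 🟢 for public sources and 🔒 for private ones. Below is a toy, self-contained illustration of that relabeling; the source names and scores are invented for the example.

# Toy illustration of the 🟢/🔒 column relabeling; source names and scores are invented.
import pandas as pd

# Normally built from solution.csv: {source_og: split}
source_split_map = {"source_a": "public", "source_b": "private"}

scores = pd.DataFrame(
    {"generated_source_a": [0.91], "generated_source_b": [0.84]},
    index=["team_x"],
)

cols = [c for c in scores.columns if "generated_" in c]
col_names = [
    (
        f"🟢 {c.replace('generated_', '')}"
        if source_split_map.get(c.replace("generated_", ""), "public") == "public"
        else f"🔒 {c.replace('generated_', '')}"
    )
    for c in cols
]

gen_tmp = scores.loc[:, cols].copy()
gen_tmp.columns = col_names  # columns become "🟢 source_a" and "🔒 source_b"
print(gen_tmp)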
metric.py (CHANGED)
@@ -185,8 +185,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
     all_reals = temp["pred"] == "real"
     all_generated = temp["pred"] == "generated"

-    # for s in solution_df["source"].unique():
-
     source_field = "source"
     source_pred = temp[[source_field,"pred"]].drop_duplicates().values

@@ -223,8 +221,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
     all_reals = temp["pred"] == "real"
     all_generated = temp["pred"] == "generated"

-    # for s in solution_df["source"].unique():
-
     source_field = "source_og"
     source_pred = temp[[source_field,"pred"]].drop_duplicates().values

@@ -246,26 +242,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):

         evaluation[f"{split}_score"][f"{pred}_conditional_auc_{s}"] = out

-    # Compute AUC by source
-    # real_x_generated = [[source_map[key] for key in k] for k in real_x_generated]
-    # generated_x_real = [[source_map[key] for key in k] for k in generated_x_real]
-    # for real_column_subset in real_x_generated:
-    #     try:
-    #         evaluation[f"{split}_score"][f"real_conditional_auc_{real_column_subset[0]}"] = compute_roc(
-    #             solution_df=temp[temp["source_og"].isin(real_column_subset)].copy()
-    #         )
-    #     except Exception as e:
-    #         print(f"FAILED CONDITIONAL AUC: {real_column_subset} | {e}")
-    #         evaluation[f"{split}_score"][f"real_conditional_auc_{real_column_subset[0]}"] = -1
-    # for generated_column_subset in generated_x_real:
-    #     try:
-    #         evaluation[f"{split}_score"][f"generated_conditional_auc_{generated_column_subset[0]}"] = compute_roc(
-    #             solution_df=temp[temp["source_og"].isin(generated_column_subset)].copy()
-    #         )
-    #     except Exception as e:
-    #         print(f"FAILED CONDITIONAL AUC: {generated_column_subset} | {e}")
-    #         evaluation[f"{split}_score"][f"generated_conditional_auc_{generated_column_subset[0]}"] = -1
-
     return evaluation

|
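The surviving metric code records one entry per (source, pred) pair under keys such as generated_conditional_auc_<source>. A rough sketch of how a per-source conditional AUC could be computed with scikit-learn follows; it is an assumption about the intent, not the repo's actual compute_roc, and the "score" column name is made up for the example.

# Rough sketch of a per-source conditional AUC; not the repo's compute_roc.
# The "pred", "source_og", and "score" column names are assumptions for illustration.
import pandas as pd
from sklearn.metrics import roc_auc_score


def conditional_auc(temp: pd.DataFrame, source: str, pred: str) -> float:
    """AUC over one source's rows of class `pred` plus every row of the opposite class."""
    same_source = (temp["pred"] == pred) & (temp["source_og"] == source)
    other_class = temp["pred"] != pred
    subset = temp.loc[same_source | other_class]
    y_true = (subset["pred"] == "generated").astype(int)  # 1 = generated, 0 = real
    return roc_auc_score(y_true, subset["score"])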