gmb/update-leaderboard

#4
by gmancino-ball - opened
Files changed (2)
  1. app.py +53 -51
  2. metric.py +0 -24
app.py CHANGED
@@ -6,37 +6,11 @@ import subprocess
 import os
 
 ## Save results path
+COMP_CACHE = Path("competition_cache/safe-challenge")
 results_path = Path("competition_cache/cached_results")
 TASKS = ["video-challenge-pilot-config", "video-challenge-task-1-config"]
 valid_splits = ["public", "private"]
 
-## Check for files initially
-if False:
-    print(f"Checking for task data")
-    print(os.listdir())
-    for task in TASKS:
-        if not os.path.exists(results_path):
-            print(f"{task} not found, running script")
-            try:
-                process = subprocess.Popen(
-                    ["python3", "utils.py"],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    text=True,
-                )
-                try:
-                    stdout, stderr = process.communicate(timeout=300)
-                except subprocess.TimeoutExpired:
-                    process.kill()
-                    stdout, stderr = process.communicate()
-                    print(f"{task} script timed out.")
-
-                print("OUTPUT:", stdout)
-                print("ERROR:", stderr)
-            except Exception as e:
-                print(f"Failed to run subprocess for {task}: {e}")
-    print("Task checking complete")
-
 
 #####################################################################
 ## Data loading ##
@@ -174,18 +148,15 @@ with st.sidebar:
 
     if password == hf_token:
         if st.button("Pull New Results"):
-            with st.spinner("Pull in new results", show_time=True):
+            with st.spinner("Pulling new results", show_time=True):
                 try:
                     process = subprocess.Popen(
                         ["python3", "utils.py"],
-                        # stdout=subprocess.PIPE,
-                        # stderr=subprocess.PIPE,
                         text=True,  # Decode stdout/stderr as text
                     )
                     st.info(f"Background task started with PID: {process.pid}")
                     process.wait()
                     process.kill()
-                    # stdout, stderr = process.communicate()
                     if process.returncode != 0:
                         st.error("The process did not finish successfully.")
                     else:
@@ -219,6 +190,12 @@ with st.sidebar:
 
 
 def show_leaderboard(results, task):
+    source_split_map = {}
+    if split == "private":
+        _sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
+        pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
+        source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
+
     cols = [
         "generated_accuracy",
         "real_accuracy",
@@ -240,7 +217,7 @@ def show_leaderboard(results, task):
             # width="small",
         ),
         "generated_accuracy": st.column_config.NumberColumn(
-            "🤖 True Positive Rate",
+            "👀 True Positive Rate",
             format="compact",
             min_value=0,
             pinned=True,
@@ -321,22 +298,42 @@ def show_leaderboard(results, task):
         for c in results[f"{split}_score"].columns
         if "generated_" in c and "accuracy" not in c and "conditional" not in c
     ]
+    col_names = [
+        (
+            f"📒 {c.replace('generated_', '')}"
+            if source_split_map.get(c.replace("generated_", ""), "public") == "public"
+            else f"🔐 {c.replace('generated_', '')}"
+        )
+        for c in results[f"{split}_score"].columns
+        if "generated_" in c and "accuracy" not in c and "conditional" not in c
+    ]
     gen_tmp = results[f"{split}_score"].loc[:, cols].copy()
+    gen_tmp.columns = col_names
     cols = [
         c for c in results[f"{split}_score"].columns if "real_" in c and "accuracy" not in c and "conditional" not in c
     ]
+    col_names = [
+        (
+            f"📒 {c.replace('real_', '')}"
+            if source_split_map.get(c.replace("real_", ""), "public") == "public"
+            else f"🔐 {c.replace('real_', '')}"
+        )
+        for c in results[f"{split}_score"].columns
+        if "real_" in c and "accuracy" not in c and "conditional" not in c
+    ]
     real_tmp = results[f"{split}_score"].loc[:, cols].copy()
+    real_tmp.columns = col_names
 
     ## Check cases
     if accuracy_types[granularity] == 0:
-        "#### 🤖 True Positive Rate | Generated Source"
+        "#### 👀 True Positive Rate | Generated Source"
         st.dataframe(gen_tmp, column_config=column_config)
 
         "#### 🧑‍🎤 True Negative Rate | Real Source"
         st.dataframe(real_tmp, column_config=column_config)
 
     elif accuracy_types[granularity] == 1:
-        "#### 🤖 Balanced Accuracy | Generated Source"
+        "#### 👀 Balanced Accuracy | Generated Source"
         tnr = results[f"{split}_score"].loc[:, ["real_accuracy"]]
         gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
         st.dataframe(gen_tmp, column_config=column_config)
@@ -347,11 +344,31 @@ def show_leaderboard(results, task):
         st.dataframe(real_tmp, column_config=column_config)
     else:
         cols = [c for c in results[f"{split}_score"].columns if "generated_conditional_auc" in c]
+        col_names = [
+            (
+                f"📒 {c.replace('generated_conditional_auc_', '')}"
+                if source_split_map.get(c.replace("generated_conditional_auc_", ""), "public") == "public"
+                else f"🔐 {c.replace('generated_conditional_auc_', '')}"
+            )
+            for c in results[f"{split}_score"].columns
+            if "generated_conditional_auc_" in c
+        ]
         gen_tmp = results[f"{split}_score"].loc[:, cols].dropna(axis=1).copy()
+        gen_tmp.columns = col_names
         cols = [c for c in results[f"{split}_score"].columns if "real_conditional_auc" in c]
+        col_names = [
+            (
+                f"📒 {c.replace('real_conditional_auc_', '')}"
+                if source_split_map.get(c.replace("real_conditional_auc_", ""), "public") == "public"
+                else f"🔐 {c.replace('real_conditional_auc_', '')}"
+            )
+            for c in results[f"{split}_score"].columns
+            if "real_conditional_auc" in c
+        ]
         real_tmp = results[f"{split}_score"].loc[:, cols].dropna(axis=1).copy()
+        real_tmp.columns = col_names
 
-        "#### 🤖 Conditional AUC | Generated Source"
+        "#### 👀 Conditional AUC | Generated Source"
         st.dataframe(gen_tmp, column_config=column_config)
 
         "#### 🧑‍🎤 Conditional AUC | Real Source"
@@ -385,10 +402,6 @@ def make_roc(results):
 
 
 def make_acc(results):
-    # results["FA"] = 1. - results["pristine_accuracy"]
-    # results = results[results["total_time"] >= 0]
-    # results["total_time"] = results["total_time"]
-
     results = results.loc[results["total_time"] >= 0]
 
     chart = (
@@ -427,11 +440,8 @@ def get_heatmaps(temp):
 
 def make_plots_for_task(task, split, best_only):
     results = load_results(task, best_only=best_only)
-    # results1[f"{split}_score"]
     temp = results[f"{split}_score"].reset_index()
 
-    # st.write(temp)
-
     t1, t2 = st.tabs(["Tables", "Charts"])
     with t1:
         show_leaderboard(results, task)
@@ -442,7 +452,6 @@ def make_plots_for_task(task, split, best_only):
     acc_vs_time = make_acc(temp)
 
     if split == "private" and hf_token is not None:
-        # with t2:
         full_curves = st.toggle("Full curve", value=True, key=f"all curves {task}")
 
         if full_curves:
@@ -450,19 +459,12 @@ def make_plots_for_task(task, split, best_only):
 
             st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
         else:
-            # with t2:
             st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
 
-    # with t3:
-    #     get_heatmaps(temp)
-
 
 updated = get_updated_time()
 st.markdown(updated)
-# st.markdown("#### Detailed Public Leaderboard")
-# st.markdown("[SAFE: Synthetic Audio Forensics Evaluation Challenge](https://stresearch.github.io/SAFE/)")
-best_only = True # st.toggle("Only Best per Team", value=True)
-# show_chart = st.toggle("Show Table", value=True)
+best_only = True
 
 
 tp, t1, volume_tab, all_submission_tab = st.tabs(
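
The main functional change above is the public/private relabeling of per-source columns in show_leaderboard(). Below is a minimal, self-contained sketch of that relabeling; the source names, split values, and score column names are invented for illustration, and the inline DataFrame stands in for reading solution.csv from COMP_CACHE.

import pandas as pd

# Toy stand-in for COMP_CACHE / task / "solution.csv" (hypothetical sources).
solution = pd.DataFrame(
    {
        "source_og": ["sora", "sora", "veo", "pika"],
        "split": ["public", "public", "private", "public"],
    }
)

# Same construction as the PR: unique (source_og, split) pairs -> lookup dict.
pairs_df = solution[["source_og", "split"]].drop_duplicates()
source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}

# Per-source score columns as they appear in the results table (assumed examples).
score_cols = ["generated_sora", "generated_veo", "generated_pika"]

# Prefix each displayed column with 📒 (public) or 🔐 (private),
# defaulting to "public" when a source is missing from the map.
col_names = [
    (
        f"📒 {c.replace('generated_', '')}"
        if source_split_map.get(c.replace("generated_", ""), "public") == "public"
        else f"🔐 {c.replace('generated_', '')}"
    )
    for c in score_cols
]
print(col_names)  # ['📒 sora', '🔐 veo', '📒 pika']
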
metric.py CHANGED
@@ -185,8 +185,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
     all_reals = temp["pred"] == "real"
     all_generated = temp["pred"] == "generated"
 
-    # for s in solution_df["source"].unique():
-
     source_field = "source"
     source_pred = temp[[source_field,"pred"]].drop_duplicates().values
 
@@ -223,8 +221,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
     all_reals = temp["pred"] == "real"
     all_generated = temp["pred"] == "generated"
 
-    # for s in solution_df["source"].unique():
-
     source_field = "source_og"
     source_pred = temp[[source_field,"pred"]].drop_duplicates().values
 
@@ -246,26 +242,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
 
         evaluation[f"{split}_score"][f"{pred}_conditional_auc_{s}"] = out
 
-    # Compute AUC by source
-    # real_x_generated = [[source_map[key] for key in k] for k in real_x_generated]
-    # generated_x_real = [[source_map[key] for key in k] for k in generated_x_real]
-    # for real_column_subset in real_x_generated:
-    #     try:
-    #         evaluation[f"{split}_score"][f"real_conditional_auc_{real_column_subset[0]}"] = compute_roc(
-    #             solution_df=temp[temp["source_og"].isin(real_column_subset)].copy()
-    #         )
-    #     except Exception as e:
-    #         print(f"FAILED CONDITIONAL AUC: {real_column_subset} | {e}")
-    #         evaluation[f"{split}_score"][f"real_conditional_auc_{real_column_subset[0]}"] = -1
-    # for generated_column_subset in generated_x_real:
-    #     try:
-    #         evaluation[f"{split}_score"][f"generated_conditional_auc_{generated_column_subset[0]}"] = compute_roc(
-    #             solution_df=temp[temp["source_og"].isin(generated_column_subset)].copy()
-    #         )
-    #     except Exception as e:
-    #         print(f"FAILED CONDITIONAL AUC: {generated_column_subset} | {e}")
-    #         evaluation[f"{split}_score"][f"generated_conditional_auc_{generated_column_subset[0]}"] = -1
-
     return evaluation
 
 
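
The commented-out block removed above computed the same per-source keys that the live loop already writes as f"{pred}_conditional_auc_{s}"; those keys are what app.py strips apart when it builds display names. A small sketch of that naming handshake, with invented source names and scores:

# Keys as metric.py writes them into evaluation[f"{split}_score"] (values invented).
split_score = {
    "generated_conditional_auc_dream_machine": 0.91,
    "real_conditional_auc_youtube": 0.88,
}

# app.py recovers the prediction class and the bare source name by splitting on
# the shared infix, then looks the source up in source_split_map for its icon.
for key, auc in split_score.items():
    pred, source = key.split("_conditional_auc_")
    print(f"{pred:>9}  source={source:<14} auc={auc:.2f}")
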