gmb/update-leaderboard #4 by gmancino-ball - opened
app.py (CHANGED)
@@ -6,37 +6,11 @@ import subprocess
 import os

 ## Save results path
+COMP_CACHE = Path("competition_cache/safe-challenge")
 results_path = Path("competition_cache/cached_results")
 TASKS = ["video-challenge-pilot-config", "video-challenge-task-1-config"]
 valid_splits = ["public", "private"]

-## Check for files initially
-if False:
-    print(f"Checking for task data")
-    print(os.listdir())
-    for task in TASKS:
-        if not os.path.exists(results_path):
-            print(f"{task} not found, running script")
-            try:
-                process = subprocess.Popen(
-                    ["python3", "utils.py"],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    text=True,
-                )
-                try:
-                    stdout, stderr = process.communicate(timeout=300)
-                except subprocess.TimeoutExpired:
-                    process.kill()
-                    stdout, stderr = process.communicate()
-                    print(f"{task} script timed out.")
-
-                print("OUTPUT:", stdout)
-                print("ERROR:", stderr)
-            except Exception as e:
-                print(f"Failed to run subprocess for {task}: {e}")
-    print("Task checking complete")
-

 #####################################################################
 ## Data loading ##
@@ -174,18 +148,15 @@ with st.sidebar:

     if password == hf_token:
         if st.button("Pull New Results"):
-            with st.spinner("
+            with st.spinner("Pulling new results", show_time=True):
                 try:
                     process = subprocess.Popen(
                         ["python3", "utils.py"],
-                        # stdout=subprocess.PIPE,
-                        # stderr=subprocess.PIPE,
                         text=True, # Decode stdout/stderr as text
                     )
                     st.info(f"Background task started with PID: {process.pid}")
                     process.wait()
                     process.kill()
-                    # stdout, stderr = process.communicate()
                     if process.returncode != 0:
                         st.error("The process did not finish successfully.")
                     else:
@@ -219,6 +190,12 @@ with st.sidebar:


 def show_leaderboard(results, task):
+    source_split_map = {}
+    if split == "private":
+        _sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
+        pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
+        source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
+
     cols = [
         "generated_accuracy",
         "real_accuracy",
@@ -240,7 +217,7 @@ def show_leaderboard(results, task):
             # width="small",
         ),
         "generated_accuracy": st.column_config.NumberColumn(
-            "
+            "🤖 True Postive Rate",
             format="compact",
             min_value=0,
             pinned=True,
@@ -321,22 +298,42 @@ def show_leaderboard(results, task):
         for c in results[f"{split}_score"].columns
         if "generated_" in c and "accuracy" not in c and "conditional" not in c
     ]
+    col_names = [
+        (
+            f"🟢 {c.replace('generated_', '')}"
+            if source_split_map.get(c.replace("generated_", ""), "public") == "public"
+            else f"🔒 {c.replace('generated_', '')}"
+        )
+        for c in results[f"{split}_score"].columns
+        if "generated_" in c and "accuracy" not in c and "conditional" not in c
+    ]
     gen_tmp = results[f"{split}_score"].loc[:, cols].copy()
+    gen_tmp.columns = col_names
     cols = [
         c for c in results[f"{split}_score"].columns if "real_" in c and "accuracy" not in c and "conditional" not in c
     ]
+    col_names = [
+        (
+            f"🟢 {c.replace('real_', '')}"
+            if source_split_map.get(c.replace("real_", ""), "public") == "public"
+            else f"🔒 {c.replace('real_', '')}"
+        )
+        for c in results[f"{split}_score"].columns
+        if "real_" in c and "accuracy" not in c and "conditional" not in c
+    ]
     real_tmp = results[f"{split}_score"].loc[:, cols].copy()
+    real_tmp.columns = col_names

     ## Check cases
     if accuracy_types[granularity] == 0:
-        "####
+        "#### 🤖 True Positive Rate | Generated Source"
         st.dataframe(gen_tmp, column_config=column_config)

         "#### 🧑‍🎤 True Negative Rate | Real Source"
         st.dataframe(real_tmp, column_config=column_config)

     elif accuracy_types[granularity] == 1:
-        "####
+        "#### 🤖 Balanced Accuracy | Generated Source"
         tnr = results[f"{split}_score"].loc[:, ["real_accuracy"]]
         gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
         st.dataframe(gen_tmp, column_config=column_config)
@@ -347,11 +344,31 @@ def show_leaderboard(results, task):
         st.dataframe(real_tmp, column_config=column_config)
     else:
         cols = [c for c in results[f"{split}_score"].columns if "generated_conditional_auc" in c]
+        col_names = [
+            (
+                f"🟢 {c.replace('generated_conditional_auc_', '')}"
+                if source_split_map.get(c.replace("generated_conditional_auc_", ""), "public") == "public"
+                else f"🔒 {c.replace('generated_conditional_auc_', '')}"
+            )
+            for c in results[f"{split}_score"].columns
+            if "generated_conditional_auc_" in c
+        ]
         gen_tmp = results[f"{split}_score"].loc[:, cols].dropna(axis=1).copy()
+        gen_tmp.columns = col_names
         cols = [c for c in results[f"{split}_score"].columns if "real_conditional_auc" in c]
+        col_names = [
+            (
+                f"🟢 {c.replace('real_conditional_auc_', '')}"
+                if source_split_map.get(c.replace("real_conditional_auc_", ""), "public") == "public"
+                else f"🔒 {c.replace('real_conditional_auc_', '')}"
+            )
+            for c in results[f"{split}_score"].columns
+            if "real_conditional_auc" in c
+        ]
         real_tmp = results[f"{split}_score"].loc[:, cols].dropna(axis=1).copy()
+        real_tmp.columns = col_names

-        "####
+        "#### 🤖 Conditional AUC | Generated Source"
         st.dataframe(gen_tmp, column_config=column_config)

         "#### 🧑‍🎤 Conditional AUC | Real Source"
@@ -385,10 +402,6 @@ def make_roc(results):


 def make_acc(results):
-    # results["FA"] = 1. - results["pristine_accuracy"]
-    # results = results[results["total_time"] >= 0]
-    # results["total_time"] = results["total_time"]
-
     results = results.loc[results["total_time"] >= 0]

     chart = (
@@ -427,11 +440,8 @@ def get_heatmaps(temp):

 def make_plots_for_task(task, split, best_only):
     results = load_results(task, best_only=best_only)
-    # results1[f"{split}_score"]
     temp = results[f"{split}_score"].reset_index()

-    # st.write(temp)
-
     t1, t2 = st.tabs(["Tables", "Charts"])
     with t1:
         show_leaderboard(results, task)
@@ -442,7 +452,6 @@ def make_plots_for_task(task, split, best_only):
     acc_vs_time = make_acc(temp)

     if split == "private" and hf_token is not None:
-        # with t2:
         full_curves = st.toggle("Full curve", value=True, key=f"all curves {task}")

         if full_curves:
@@ -450,19 +459,12 @@ def make_plots_for_task(task, split, best_only):

             st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
     else:
-        # with t2:
         st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)

-    # with t3:
-    # get_heatmaps(temp)
-

 updated = get_updated_time()
 st.markdown(updated)
-
-# st.markdown("[SAFE: Synthetic Audio Forensics Evaluation Challenge](https://stresearch.github.io/SAFE/)")
-best_only = True # st.toggle("Only Best per Team", value=True)
-# show_chart = st.toggle("Show Table", value=True)
+best_only = True


 tp, t1, volume_tab, all_submission_tab = st.tabs(
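In the refresh flow above, the "Pull New Results" button now launches utils.py under st.spinner("Pulling new results", show_time=True) and inspects the return code once the subprocess exits. A minimal, self-contained sketch of that pattern follows, reusing the 300-second timeout from the removed startup check; it is an illustration, not the app's exact code.

# Illustrative sketch of the spinner + subprocess pattern; timeout and messages are assumptions.
import subprocess

import streamlit as st

if st.button("Pull New Results"):
    with st.spinner("Pulling new results", show_time=True):
        process = subprocess.Popen(["python3", "utils.py"], text=True)
        try:
            process.wait(timeout=300)  # block until the refresh script finishes
        except subprocess.TimeoutExpired:
            process.kill()  # stop a hung refresh rather than waiting forever
            st.error("Refresh timed out.")
        else:
            if process.returncode != 0:
                st.error("The process did not finish successfully.")
            else:
                st.success("Results refreshed.")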
|
|
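The show_leaderboard changes above build source_split_map from solution.csv and prefix each per-source column with 🟢 for public sources and 🔒 for private ones. Below is a toy, self-contained illustration of that relabeling; the source names and scores are invented for the example.

# Toy illustration of the 🟢/🔒 column relabeling; source names and scores are invented.
import pandas as pd

# Normally built from solution.csv: {source_og: split}
source_split_map = {"source_a": "public", "source_b": "private"}

scores = pd.DataFrame(
    {"generated_source_a": [0.91], "generated_source_b": [0.84]},
    index=["team_x"],
)

cols = [c for c in scores.columns if "generated_" in c]
col_names = [
    (
        f"🟢 {c.replace('generated_', '')}"
        if source_split_map.get(c.replace("generated_", ""), "public") == "public"
        else f"🔒 {c.replace('generated_', '')}"
    )
    for c in cols
]

gen_tmp = scores.loc[:, cols].copy()
gen_tmp.columns = col_names  # columns become "🟢 source_a" and "🔒 source_b"
print(gen_tmp)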
metric.py (CHANGED)
@@ -185,8 +185,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
     all_reals = temp["pred"] == "real"
     all_generated = temp["pred"] == "generated"

-    # for s in solution_df["source"].unique():
-
     source_field = "source"
     source_pred = temp[[source_field,"pred"]].drop_duplicates().values

@@ -223,8 +221,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
     all_reals = temp["pred"] == "real"
     all_generated = temp["pred"] == "generated"

-    # for s in solution_df["source"].unique():
-
     source_field = "source_og"
     source_pred = temp[[source_field,"pred"]].drop_duplicates().values

@@ -246,26 +242,6 @@ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):

         evaluation[f"{split}_score"][f"{pred}_conditional_auc_{s}"] = out

-    # Compute AUC by source
-    # real_x_generated = [[source_map[key] for key in k] for k in real_x_generated]
-    # generated_x_real = [[source_map[key] for key in k] for k in generated_x_real]
-    # for real_column_subset in real_x_generated:
-    #     try:
-    #         evaluation[f"{split}_score"][f"real_conditional_auc_{real_column_subset[0]}"] = compute_roc(
-    #             solution_df=temp[temp["source_og"].isin(real_column_subset)].copy()
-    #         )
-    #     except Exception as e:
-    #         print(f"FAILED CONDITIONAL AUC: {real_column_subset} | {e}")
-    #         evaluation[f"{split}_score"][f"real_conditional_auc_{real_column_subset[0]}"] = -1
-    # for generated_column_subset in generated_x_real:
-    #     try:
-    #         evaluation[f"{split}_score"][f"generated_conditional_auc_{generated_column_subset[0]}"] = compute_roc(
-    #             solution_df=temp[temp["source_og"].isin(generated_column_subset)].copy()
-    #         )
-    #     except Exception as e:
-    #         print(f"FAILED CONDITIONAL AUC: {generated_column_subset} | {e}")
-    #         evaluation[f"{split}_score"][f"generated_conditional_auc_{generated_column_subset[0]}"] = -1
-
     return evaluation

|
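The surviving metric code records one entry per (source, pred) pair under keys such as generated_conditional_auc_<source>. A rough sketch of how a per-source conditional AUC could be computed with scikit-learn follows; it is an assumption about the intent, not the repo's actual compute_roc, and the "score" column name is made up for the example.

# Rough sketch of a per-source conditional AUC; not the repo's compute_roc.
# The "pred", "source_og", and "score" column names are assumptions for illustration.
import pandas as pd
from sklearn.metrics import roc_auc_score


def conditional_auc(temp: pd.DataFrame, source: str, pred: str) -> float:
    """AUC over one source's rows of class `pred` plus every row of the opposite class."""
    same_source = (temp["pred"] == pred) & (temp["source_og"] == source)
    other_class = temp["pred"] != pred
    subset = temp.loc[same_source | other_class]
    y_true = (subset["pred"] == "generated").astype(int)  # 1 = generated, 0 = real
    return roc_auc_score(y_true, subset["score"])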