import streamlit as st
from pathlib import Path
import pandas as pd
import altair as alt
import subprocess
import os
## Cached results paths and task definitions
COMP_CACHE = Path("competition_cache/safe-challenge")
results_path = Path("competition_cache/cached_results")
TASKS = ["video-challenge-pilot-config", "video-challenge-task-1-config"]
valid_splits = ["public", "private"]
#####################################################################
## Data loading ##
#####################################################################
@st.cache_data
def load_results(task, best_only):
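    """Load the cached per-split score tables for a task.

    With ``best_only``, keep only each team's highest balanced-accuracy
    submission per split, sorted by balanced accuracy.
    """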
if best_only:
return {
f"{s}_score": pd.read_csv(f"{results_path}/{task}_{s}_score.csv")
.sort_values(["team", "balanced_accuracy"], ascending=False)
.drop_duplicates(subset=["team"])
.sort_values("balanced_accuracy", ascending=False)
.set_index("team")
for s in valid_splits
}
else:
return {
f"{s}_score": pd.read_csv(f"{results_path}/{task}_{s}_score.csv").set_index("team") for s in valid_splits
}
@st.cache_data
def load_submission():
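    """Concatenate the cached submission tables for all tasks, tagging each row with its task."""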
out = []
for task in TASKS:
data = pd.read_csv(f"{results_path}/{task}_submissions.csv")
data["task"] = task
out.append(data)
return pd.concat(out, ignore_index=True)
def get_updated_time(file="competition_cache/updated.txt"):
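    """Return the contents of the cached last-updated timestamp file, if present."""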
    if os.path.exists(file):
        with open(file) as f:  # close the handle instead of leaking it
            return f.read()
    return "no time file found"
@st.cache_data
def get_volume():
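    """Count submissions per day and status across all tasks for the volume chart."""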
subs = pd.concat(
[pd.read_csv(f"{results_path}/{task}_submissions.csv") for task in TASKS],
ignore_index=True,
)
    subs["datetime"] = pd.to_datetime(subs["datetime"])
subs["date"] = subs["datetime"].dt.date
subs = subs.groupby(["date", "status_reason"]).size().unstack().fillna(0).reset_index()
return subs
@st.cache_data
def make_heatmap(results, label="generated", symbol="πŸ‘€"):
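    """Build an Altair heatmap of per-source accuracy (with text labels) for one label type."""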
    # ``results`` is wide-format: teams as rows, per-source scores as columns
    df_long = results.set_index("team")
    team_order = df_long.index.tolist()  # leaderboard order, used to sort the y-axis
df_long = df_long.loc[:, [c for c in df_long.columns if c.startswith(label) and "accuracy" not in c]]
df_long.columns = [c.replace(f"{label}_", "") for c in df_long.columns]
if "none" in df_long.columns:
df_long = df_long.drop(columns=["none"])
df_long = df_long.reset_index().melt(id_vars="team", var_name="source", value_name="acc")
# Base chart for rectangles
base = alt.Chart(df_long).encode(
x=alt.X("source:O", title="Source", axis=alt.Axis(orient="top", labelAngle=-60)),
y=alt.Y("team:O", title="Team", sort=team_order),
)
# Heatmap rectangles
heatmap = base.mark_rect().encode(
color=alt.Color("acc:Q", scale=alt.Scale(scheme="greens"), title=f"{label} Accuracy")
)
# Text labels
text = base.mark_text(baseline="middle", fontSize=16).encode(
text=alt.Text("acc:Q", format=".2f"),
color=alt.condition(
alt.datum.acc < 0.5, # you can tune this for readability
alt.value("black"),
alt.value("white"),
),
)
# Combine heatmap and text
    chart = (heatmap + text).properties(width=600, height=500, title=f"Accuracy on {symbol} {label} sources")
return chart
@st.cache_data
def make_roc_curves(task, submission_ids, best_only=True):
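    """Plot the cached ROC curves for a task, one line per submission, colored by team."""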
rocs = pd.read_csv(f"{results_path}/{task}_rocs.csv")
if best_only:
        rocs = rocs[rocs["submission_id"].isin(submission_ids)]
roc_chart = alt.Chart(rocs).mark_line().encode(x="fpr", y="tpr", color="team:N", detail="submission_id:N")
return roc_chart
#####################################################################
## Page definition ##
#####################################################################
## Set title
st.set_page_config(
page_title="Leaderboard",
initial_sidebar_state="collapsed",
layout="wide", # This makes the app use the full width of the screen
)
## Pull new results, or toggle between private and public scores, if you are an owner
with st.sidebar:
hf_token = os.getenv("HF_TOKEN")
password = st.text_input("Admin login:", type="password")
if password == hf_token:
        if st.button("Pull New Results"):
            with st.spinner("Pulling new results", show_time=True):
                try:
                    process = subprocess.Popen(
                        ["python3", "utils.py"],
                        text=True,  # Decode stdout/stderr as text
                    )
                    st.info(f"Refresh started with PID: {process.pid}")
                    process.wait()  # Block until the refresh script exits
                    if process.returncode != 0:
                        st.error("The process did not finish successfully.")
                    else:
                        st.success(f"PID {process.pid} finished!")
                        # Clear the cached data so the next rerun picks up the new results
                        load_results.clear()
                        get_volume.clear()
                        load_submission.clear()
                        st.rerun()
                except Exception as e:
                    st.error(f"Error running refresh: {e}")
        ## Initialize the toggle state in session_state if it doesn't exist
        if "private_view" not in st.session_state:
            st.session_state.private_view = False
        # The 'key' parameter links the widget to session_state across reruns;
        # since the key is pre-seeded above, passing 'value' as well is
        # redundant and triggers a Streamlit double-assignment warning
        toggle_value = st.toggle("Private Scores", key="private_view")
        if toggle_value:
            st.write("Showing **PRIVATE** scores.")
        else:
            st.write("Showing **PUBLIC** scores.")
        split = "private" if toggle_value else "public"
else:
split = "public"
def show_leaderboard(results, task):
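    """Render the summary table and per-source accuracy breakdowns for one task in the current split."""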
source_split_map = {}
if split == "private":
_sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
        source_split_map = dict(zip(pairs_df["source_og"], pairs_df["split"]))
cols = [
"generated_accuracy",
"real_accuracy",
# "pristine_accuracy",
"balanced_accuracy",
"auc",
"fail_rate",
"total_time",
"datetime",
]
    column_config = {
        "balanced_accuracy": st.column_config.NumberColumn(
            "βš–οΈ Balanced Accuracy",
            format="compact",
            min_value=0,
            pinned=True,
            max_value=1.0,
            # width="small",
        ),
        "generated_accuracy": st.column_config.NumberColumn(
            "πŸ‘€ True Positive Rate",
            format="compact",
            min_value=0,
            pinned=True,
            max_value=1.0,
            # width="small",
        ),
        "real_accuracy": st.column_config.NumberColumn(
            "πŸ§‘β€πŸŽ€ True Negative Rate",
            format="compact",
            min_value=0,
            pinned=True,
            max_value=1.0,
            # width="small",
        ),
        "auc": st.column_config.NumberColumn(
            "πŸ“ AUC",
            format="compact",
            min_value=0,
            pinned=True,
            max_value=1.0,
            # width="small",
        ),
        "fail_rate": st.column_config.NumberColumn(
            "❌ Fail Rate",
            format="compact",
            # width="small",
        ),
        "total_time": st.column_config.NumberColumn(
            "πŸ•’ Inference Time",
            format="compact",
            # width="small",
        ),
        "datetime": st.column_config.DatetimeColumn(
            "πŸ—“οΈ Submission Date",
            format="YYYY-MM-DD",
            # width="small",
        ),
    }
    labels = {"real": "πŸ§‘β€πŸŽ€", "generated": "πŸ‘€"}
for c in results[f"{split}_score"].columns:
if "accuracy" in c:
continue
if any(p in c for p in ["generated", "real"]):
s = c.split("_")
pred = s[0]
source = " ".join(s[1:])
column_config[c] = st.column_config.NumberColumn(
labels[pred] + " " + source,
help=c,
format="compact",
min_value=0,
max_value=1.0,
)
"#### Summary"
st.dataframe(results[f"{split}_score"].loc[:, cols], column_config=column_config)
"##### Accuracy Breakdown by Source"
accuracy_types = {
"True positive/negative rate": 0,
"Conditional balanced accuracy": 1,
"AUC": 2,
}
granularity = st.radio(
"accuracy type",
list(accuracy_types.keys()),
key=f"granularity-{task}",
horizontal=True,
label_visibility="collapsed",
index=0,
)
## Subset the dataset
cols = [
c
for c in results[f"{split}_score"].columns
if "generated_" in c and "accuracy" not in c and "conditional" not in c
]
    col_names = [
        (
            f"πŸ“’ {c.replace('generated_', '')}"
            if source_split_map.get(c.replace("generated_", ""), "public") == "public"
            else f"πŸ” {c.replace('generated_', '')}"
        )
        for c in results[f"{split}_score"].columns
        if "generated_" in c and "accuracy" not in c and "conditional" not in c
    ]
gen_tmp = results[f"{split}_score"].loc[:, cols].copy()
gen_tmp.columns = col_names
cols = [
c for c in results[f"{split}_score"].columns if "real_" in c and "accuracy" not in c and "conditional" not in c
]
    col_names = [
        (
            f"πŸ“’ {c.replace('real_', '')}"
            if source_split_map.get(c.replace("real_", ""), "public") == "public"
            else f"πŸ” {c.replace('real_', '')}"
        )
        for c in results[f"{split}_score"].columns
        if "real_" in c and "accuracy" not in c and "conditional" not in c
    ]
real_tmp = results[f"{split}_score"].loc[:, cols].copy()
real_tmp.columns = col_names
## Check cases
    if accuracy_types[granularity] == 0:
        "#### πŸ‘€ True Positive Rate | Generated Source"
        st.dataframe(gen_tmp, column_config=column_config)
        "#### πŸ§‘β€πŸŽ€ True Negative Rate | Real Source"
st.dataframe(real_tmp, column_config=column_config)
    elif accuracy_types[granularity] == 1:
        "#### πŸ‘€ Balanced Accuracy | Generated Source"
tnr = results[f"{split}_score"].loc[:, ["real_accuracy"]]
gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
        st.dataframe(gen_tmp, column_config=column_config)
        "#### πŸ§‘β€πŸŽ€ Balanced Accuracy | Real Source"
tpr = results[f"{split}_score"].loc[:, ["generated_accuracy"]]
real_tmp[:] = (real_tmp.values + tpr.values) / 2.0
st.dataframe(real_tmp, column_config=column_config)
else:
cols = [c for c in results[f"{split}_score"].columns if "generated_conditional_auc" in c]
        col_names = [
            (
                f"πŸ“’ {c.replace('generated_conditional_auc_', '')}"
                if source_split_map.get(c.replace("generated_conditional_auc_", ""), "public") == "public"
                else f"πŸ” {c.replace('generated_conditional_auc_', '')}"
            )
            for c in results[f"{split}_score"].columns
            if "generated_conditional_auc_" in c
        ]
        gen_tmp = results[f"{split}_score"].loc[:, cols].copy()
        gen_tmp.columns = col_names
        gen_tmp = gen_tmp.dropna(axis=1)  # rename before dropping NaN columns so names stay aligned
cols = [c for c in results[f"{split}_score"].columns if "real_conditional_auc" in c]
        col_names = [
            (
                f"πŸ“’ {c.replace('real_conditional_auc_', '')}"
                if source_split_map.get(c.replace("real_conditional_auc_", ""), "public") == "public"
                else f"πŸ” {c.replace('real_conditional_auc_', '')}"
            )
            for c in results[f"{split}_score"].columns
            if "real_conditional_auc" in c
        ]
        real_tmp = results[f"{split}_score"].loc[:, cols].copy()
        real_tmp.columns = col_names
        real_tmp = real_tmp.dropna(axis=1)  # rename before dropping NaN columns so names stay aligned
        "#### πŸ‘€ Conditional AUC | Generated Source"
        st.dataframe(gen_tmp, column_config=column_config)
        "#### πŸ§‘β€πŸŽ€ Conditional AUC | Real Source"
        st.dataframe(real_tmp, column_config=column_config)
def make_roc(results):
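    """Scatter each submission's false-positive vs true-positive rate, sized by inference time."""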
    results = results.copy()  # work on a copy so the caller's frame is not mutated
    results["FA"] = 1.0 - results["real_accuracy"]
chart = (
alt.Chart(results)
.mark_circle()
.encode(
            x=alt.X("FA:Q", title="πŸ§‘β€πŸŽ€ False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
            y=alt.Y("generated_accuracy:Q", title="πŸ‘€ True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
color="team:N", # Color by categorical field
            size=alt.Size(
                "total_time:Q", title="πŸ•’ Inference Time", scale=alt.Scale(rangeMin=100)
            ),  # Size by quantitative field
)
.properties(width=400, height=400, title="Detection vs False Alarm vs Inference Time")
)
diag_line = (
alt.Chart(pd.DataFrame(dict(tpr=[0, 1], fpr=[0, 1])))
.mark_line(color="lightgray", strokeDash=[8, 4])
.encode(x="fpr", y="tpr")
)
return chart + diag_line
def make_acc(results):
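    """Scatter balanced accuracy against inference time, with a chance line at 0.5."""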
results = results.loc[results["total_time"] >= 0]
chart = (
alt.Chart(results)
.mark_circle(size=200)
.encode(
            x=alt.X("total_time:Q", title="πŸ•’ Inference Time", scale=alt.Scale(domain=[0.0, 10000])),
y=alt.Y(
"balanced_accuracy:Q",
title="Balanced Accuracy",
scale=alt.Scale(domain=[0.4, 1]),
),
            color="team:N",  # Color by categorical field
)
.properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
)
diag_line = (
alt.Chart(pd.DataFrame(dict(t=[0, results["total_time"].max()], y=[0.5, 0.5])))
.mark_line(color="lightgray", strokeDash=[8, 4])
.encode(x="t", y="y")
)
return chart + diag_line
def get_heatmaps(temp):
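    """Render heatmaps for generated and real sources, plus any 'aug' sources when present."""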
    h1 = make_heatmap(temp, "generated", symbol="πŸ‘€")
    h2 = make_heatmap(temp, "real", symbol="πŸ§‘β€πŸŽ€")
st.altair_chart(h1, use_container_width=True)
st.altair_chart(h2, use_container_width=True)
if temp.columns.str.contains("aug", case=False).any():
        h3 = make_heatmap(temp, "aug", symbol="πŸ› οΈ")
st.altair_chart(h3, use_container_width=True)
def make_plots_for_task(task, split, best_only):
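    """Render the Tables and Charts tabs for a single task."""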
results = load_results(task, best_only=best_only)
temp = results[f"{split}_score"].reset_index()
t1, t2 = st.tabs(["Tables", "Charts"])
with t1:
show_leaderboard(results, task)
with t2:
roc_scatter = make_roc(temp)
acc_vs_time = make_acc(temp)
        if split == "private" and hf_token is not None:
            full_curves = st.toggle("Full curve", value=True, key=f"all curves {task}")
            if full_curves:
                roc_scatter = make_roc_curves(task, temp["submission_id"].values.tolist(), best_only) + roc_scatter
        st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
updated = get_updated_time()
st.markdown(updated)
best_only = True
tp, t1, volume_tab, all_submission_tab = st.tabs(
["**Pilot Task**", "**Task 1**", "**Submission Volume**", "**All Submissions**"]
)
with tp:
"*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
make_plots_for_task(TASKS[0], split, best_only)
with t1:
"*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
make_plots_for_task(TASKS[1], split, best_only)
with volume_tab:
subs = get_volume()
    status_lookup = ["QUEUED", "PROCESSING", "SUCCESS", "FAILED"]
    found_columns = subs.columns.values.tolist()
    # Keep only statuses present in the data, preserving order (set intersection would scramble it)
    status_lookup = [s for s in status_lookup if s in found_columns]
st.bar_chart(subs, x="date", y=status_lookup, stack=True)
total_submissions = int(subs.loc[:, status_lookup].fillna(0).values.sum())
st.metric("Total Submissions", value=total_submissions)
st.metric("Duration", f'{(subs["date"].max() - subs["date"].min()).days} days')
if split == "private":
with all_submission_tab:
data = load_submission()
st.dataframe(data)