# Builds ``df_orig``: the ICCV2025 paper table joined with paper-page data and
# extended with pre-formatted string/Markdown columns.
#
# NOTE: everything below runs at import time, including the two
# ``datasets.load_dataset`` calls.

import datasets
import polars as pl

BASE_REPO_ID = "ai-conferences/ICCV2025"
PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"


def format_author_claim_ratio(row: dict) -> str:
    """Render the linked/total author counts of one row as text.

    Args:
        row: Mapping with the keys ``"n_linked_authors"`` and ``"n_authors"``.

    Returns:
        ``""`` when either count is ``None``; otherwise ``"<linked>/<total>"``,
        followed by ``" ✅"`` when at least one author is linked.
    """
    linked = row["n_linked_authors"]
    total = row["n_authors"]
    if linked is None or total is None:
        return ""
    parts = [f"{linked}/{total}"]
    if linked > 0:
        parts.append("✅")
    return " ".join(parts).strip()


def _count_linked_authors(usernames):
    """Count the non-``None`` entries of one ``author_usernames`` cell.

    ``None`` input is passed through as ``None``.
    """
    if usernames is None:
        return None
    return sum(username is not None for username in usernames)


def _make_repo_link_formatter(url_prefix):
    """Return a callable turning a list of repo ids into Markdown links.

    The returned callable joins one ``[id](url_prefix + id)`` link per id with
    newlines, and passes ``None`` through unchanged. Binding ``url_prefix``
    here keeps the callable independent of the loop variable at the call site.
    """

    def to_markdown(repo_ids):
        if repo_ids is None:
            return None
        return "\n".join(f"[{repo_id}]({url_prefix}{repo_id})" for repo_id in repo_ids)

    return to_markdown


df_orig = (
    datasets.load_dataset(BASE_REPO_ID, split="train")
    .to_polars()
    .rename({"cvf_url": "cvf"})
)
df_paper_page = (
    datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
    .to_polars()
    .drop(["summary", "author_names", "ai_keywords"])
)

# Left-join the paper-page data; clashing paper-page columns get a "_2" suffix.
# For "github", the paper-page value wins whenever it is non-null, otherwise
# the base dataset's value is kept.
_merged_github = (
    pl.when(pl.col("github_2").is_null())
    .then(pl.col("github"))
    .otherwise(pl.col("github_2"))
    .alias("github")
)
df_orig = (
    df_orig.join(df_paper_page, on="arxiv_id", how="left", suffix="_2")
    .with_columns(_merged_github)
    .drop("github_2")
)

# format authors
df_orig = df_orig.with_columns(authors_str=pl.col("authors").list.join(", "))

# format links
_link_exprs = [
    pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md")
    for col in ("cvf", "project_page", "github")
]
df_orig = df_orig.with_columns(_link_exprs)

# format paper page link
# Two separate steps: "paper_page_md" reads the "paper_page" column created by
# the first one.
_paper_page_url = pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")
df_orig = df_orig.with_columns(_paper_page_url.alias("paper_page"))
df_orig = df_orig.with_columns(
    pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page"))
    .fill_null("")
    .alias("paper_page_md")
)

# count authors
df_orig = df_orig.with_columns(n_authors=pl.col("authors").list.len())
df_orig = df_orig.with_columns(
    pl.col("author_usernames")
    .map_elements(_count_linked_authors, return_dtype=pl.Int64)
    .alias("n_linked_authors")
)
df_orig = df_orig.with_columns(
    pl.struct("n_linked_authors", "n_authors")
    .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
    .alias("claimed")
)

# TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed # noqa: FIX002, TD002
# format numbers as strings
# Each selected column is replaced in place under its own name.
df_orig = df_orig.with_columns(
    pl.col("upvotes", "num_comments").cast(pl.Utf8).fill_null("")
)

# format spaces, models, datasets
for repo_id_col, markdown_col, base_url in (
    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
    ("model_ids", "Models", "https://huggingface.co/"),
    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
):
    df_orig = df_orig.with_columns(
        pl.col(repo_id_col)
        .map_elements(_make_repo_link_formatter(base_url), return_dtype=pl.Utf8)
        .fill_null("")
        .alias(markdown_col)
    )