"""Gradio leaderboard app.

Builds one DataFrame from the easy / hard / average score tables supplied by
``process_data`` and serves it as a table that can be searched by model name
and narrowed to a user-chosen set of columns.
"""
import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd

from texts import TITLE, DESCRIPTION
from process_data import load_average_data, load_hard_data, load_easy_data
from display import custom_css

BENCHMARKS_TO_SKIP = []

# Colour associated with each model type label.
color_map = {
    "Pretrained": "#7497db",
    "RL": "#E8ECF2",
    "Finetuned": "#ffcd75",
    # "DPO": "#75809c",
}

# Maps the ids used as the score-table index to "<org>/<model>" display names.
model_name_map = {
    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
    "gemma-2-27b-it": "google/gemma-2-27b-it",
    "gemma-2-9b-it": "google/gemma-2-9b-it",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    "qwq-32b": "Qwen/QwQ-32B",
    "yi-lightning": "Yi/Yi-Lightning",
    "gpt-3.5-turbo": "openai/gpt-3.5-turbo",
    "gpt-4o": "openai/gpt-4o",
    "gpt-4o-mini": "openai/gpt-4o-mini",
    "o1-mini": "openai/o1-mini",
    "claude-3.5-haiku": "anthropic/claude-3.5-haiku",
    "claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
}


def map_model_name(model_id):
    """Return the display name for ``model_id``, or ``model_id`` itself if unmapped."""
    return model_name_map.get(model_id, model_id)


# Turn a model name into an HTML link.
def model_hyperlink(link, model_name):
    """Return an HTML anchor that displays ``model_name`` and points at ``link``."""
    return f'<a target="_blank" href="{link}">{model_name}</a>'


def make_clickable_model(model_name):
    """Return ``model_name`` wrapped in a link to its huggingface.co page."""
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


# Model ids that are labelled "RL"; every other id is labelled "Pretrained".
rl_models = ['deepseek-r1', 'o1-mini']


def map_model_type(model_name):
    """Return ``"RL"`` when ``model_name`` is listed in ``rl_models``, else ``"Pretrained"``."""
    return "RL" if model_name in rl_models else "Pretrained"


def prep_leaderboard_df():
    """Assemble the leaderboard table.

    Concatenates the easy, hard and average tables column-wise, then prepends
    a "Model" column (display name) and a "Model Type" column, both derived
    from the index values, and rounds numeric values to two decimals.

    Returns:
        The combined ``pandas.DataFrame``.
    """
    # NOTE(review): the three loaders are presumably indexed by the same model
    # ids so that the column-wise concat aligns rows - confirm in process_data.
    average_df = load_average_data()
    hard_df = load_hard_data()
    easy_df = load_easy_data()
    df = pd.concat([easy_df, hard_df, average_df], axis=1)
    # Insert a column named "Model" at the first position.
    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
    # Apply make_clickable_model to the "Model" column to render names as links:
    # df['Model'] = df['Model'].apply(make_clickable_model)
    df = df.round(2)
    return df


leaderboard_df = prep_leaderboard_df()


# Function to update the table based on search query
def filter_and_search(cols: list[str], search_query: str, agg: str = ""):
    """Return the leaderboard filtered by model name and restricted to ``cols``.

    Args:
        cols: Column names to show next to "Model". An empty selection keeps
            every column.
        search_query: ``;``-separated search terms. A row is kept when its
            "Model" value contains any term, compared case-insensitively and
            literally (regex metacharacters are not interpreted).
        agg: Unused. It has a default because the UI listeners pass only
            ``cols`` and ``search_query``; without one every callback failed
            with a missing-argument ``TypeError``.

    Returns:
        The filtered ``pandas.DataFrame``. When columns are selected, rows are
        sorted in descending order by the selected numeric columns and the
        selected columns are returned as strings.
    """
    print("filter")
    df = leaderboard_df

    # Blank terms (e.g. from a trailing ";") are dropped: an empty regex
    # alternative would otherwise match every row.
    terms = [term.strip().lower() for term in (search_query or "").split(";")]
    terms = [term for term in terms if term]
    if terms:
        # Escape each term so input such as "(" or "+" cannot break the regex.
        pattern = "|".join(re.escape(term) for term in terms)
        matches = df["Model"].str.lower().str.contains(pattern, regex=True, na=False)
        df = df[matches]

    # Drop any columns which are all NaN
    df = df.dropna(how="all", axis=1)

    # A requested column may just have been dropped as all-NaN; selecting it
    # anyway would raise KeyError.
    selected = [c for c in (cols or []) if c in df.columns]
    if selected:
        index_cols = [c for c in leaderboard_df.columns[:1] if c in df.columns]
        df = df[index_cols + selected].copy()
        df = df.dropna(how="all", axis=0, subset=selected)

        # Coerce only columns that really hold numbers. A text column such as
        # "Model Type" would become all-NaN under coercion and be displayed as
        # "nan", so it is left untouched and kept out of the sort keys.
        sort_cols = []
        for col in selected:
            coerced = pd.to_numeric(df[col], errors="coerce")
            if coerced.notna().any() or df[col].isna().all():
                df[col] = coerced
                sort_cols.append(col)
        if sort_cols:
            df = df.sort_values(by=sort_cols, ascending=False, na_position="last")
        df[selected] = df[selected].astype(str)
    return df


demo = gr.Blocks(css=custom_css)

# NOTE(review): the nesting of the layout containers below was reconstructed
# from a whitespace-collapsed source - confirm it against the intended layout.
with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            cols_bar = gr.CheckboxGroup(
                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                show_label=False,
                # info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                # column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
            )
    # Both listeners pass (cols, search_query) positionally to filter_and_search.
    cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
    search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])

    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            citation_button = gr.Textbox(
                value=r"""@article{lin2025generative, title={Generative Evaluation of Complex Reasoning in Large Language Models}, author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao}, journal={arXiv preprint arXiv:2504.02810}, year={2025} }""",
                lines=7,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch()