import gradio as gr
from gradio_rangeslider import RangeSlider

import core
from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE

demo = gr.Blocks(css=CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of multilingual evaluation results obtained using our fork of the "
        "LM-evaluation-harness (https://github.com/OpenGPTX/lm-evaluation-harness), based on V1 of "
        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. Note that benchmarks are "
        "currently available in 21 European languages (Irish, Maltese, and Croatian are missing).",
        elem_classes="markdown-text",
    )
    selected_tab = gr.State(value=0)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: few-shot accuracy leaderboard.
        with gr.TabItem(
            "🏅 LLM accuracy benchmark",
            elem_id="llm-benchmark-tab-table-acc",
            id=0,
        ) as acc:
            with gr.Column():
                with gr.Row():
                    search_bar = gr.Textbox(
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    with gr.Column():
                        model_types = gr.CheckboxGroup(
                            label="Select model type",
                            choices=[
                                (
                                    f"Pretrained {T_SYMBOLS['pretrained']}",
                                    T_SYMBOLS["pretrained"],
                                ),
                                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
                            ],
                            value=list(T_SYMBOLS.values()),
                        )
                    with gr.Column():
                        model_sizes = RangeSlider(
                            minimum=0,
                            maximum=150,
                            value=(7, 10),
                            label="Select the number of parameters (B)",
                        )
                with gr.Row():
                    langs_bar = gr.CheckboxGroup(
                        choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        # Distinct names for the language and task buttons; the
                        # original reused `clear`/`select` and shadowed them below.
                        clear_langs = gr.ClearButton(
                            langs_bar,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select_langs = gr.Button(
                            value="Select all languages",
                            size="sm",
                            scale=1,
                        )
                        select_langs.click(
                            lambda: gr.CheckboxGroup(value=core.languages_list),
                            inputs=[],
                            outputs=langs_bar,
                        )
                with gr.Row():
                    shown_tasks = gr.CheckboxGroup(
                        choices=core.get_available_task_groups(core.get_selected_task_type(0), True),
                        value=core.get_available_task_groups(core.get_selected_task_type(0), True),
                        label="Select tasks to show",
                        elem_id="column-select",
                        interactive=True,
                        scale=50,
                    )
                    clear_tasks = gr.ClearButton(
                        shown_tasks,
                        value="Deselect all tasks",
                        size="sm",
                        scale=1,
                    )
                    select_tasks = gr.Button(
                        value="Select all tasks",
                        size="sm",
                        scale=1,
                    )
                    select_tasks.click(
                        lambda: gr.CheckboxGroup(
                            value=core.get_available_task_groups(core.get_selected_task_type(0), True)
                        ),
                        inputs=[],
                        outputs=shown_tasks,
                    )
            leaderboard_table = gr.Dataframe(datatype=["str", "markdown", "number"])

        # Tab 3: zero-shot accuracy leaderboard.
        with gr.TabItem(
            "🏅 LLM accuracy benchmark (Zero-Shot)",
            elem_id="llm-benchmark-tab-table-acc-zeroshot",
            id=3,
        ) as acc_zero_shot:
            with gr.Column():
                with gr.Row():
                    search_bar_zero_shot = gr.Textbox(
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    with gr.Column():
                        model_types_zero_shot = gr.CheckboxGroup(
                            label="Select model type",
                            choices=[
                                (
                                    f"Pretrained {T_SYMBOLS['pretrained']}",
                                    T_SYMBOLS["pretrained"],
                                ),
                                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
                            ],
                            value=list(T_SYMBOLS.values()),
                        )
                    with gr.Column():
                        model_sizes_zero_shot = RangeSlider(
                            minimum=0,
                            maximum=150,
                            value=(7, 10),
                            label="Select the number of parameters (B)",
                        )
                with gr.Row():
                    langs_bar_zero_shot = gr.CheckboxGroup(
                        choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        clear_langs_zero_shot = gr.ClearButton(
                            langs_bar_zero_shot,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select_langs_zero_shot = gr.Button(
                            value="Select all languages",
                            size="sm",
                            scale=1,
                        )
                        select_langs_zero_shot.click(
                            lambda: gr.CheckboxGroup(value=core.languages_list),
                            inputs=[],
                            outputs=langs_bar_zero_shot,
                        )
                with gr.Row():
                    shown_tasks_zero_shot = gr.CheckboxGroup(
                        choices=core.get_available_task_groups(core.get_selected_task_type(3), False),
                        value=core.get_available_task_groups(core.get_selected_task_type(3), False),
                        label="Select tasks to show",
                        elem_id="column-select",
                        interactive=True,
                        scale=50,
                    )
                    clear_tasks_zero_shot = gr.ClearButton(
                        shown_tasks_zero_shot,
                        value="Deselect all tasks",
                        size="sm",
                        scale=1,
                    )
                    select_tasks_zero_shot = gr.Button(
                        value="Select all tasks",
                        size="sm",
                        scale=1,
                    )
                    select_tasks_zero_shot.click(
                        lambda: gr.CheckboxGroup(
                            value=core.get_available_task_groups(core.get_selected_task_type(3), False)
                        ),
                        inputs=[],
                        outputs=shown_tasks_zero_shot,
                    )
            leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown", "number"])

        # Tab 1: translation leaderboard.
        with gr.TabItem(
            "🌐 LLM translation benchmark",
            elem_id="llm-benchmark-tab-table-misc",
            id=1,
        ) as misc:
            with gr.Column():
                with gr.Row():
                    search_bar_misc = gr.Textbox(
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    with gr.Column():
                        model_types_misc = gr.CheckboxGroup(
                            label="Select model type",
                            choices=[
                                (
                                    f"Pretrained {T_SYMBOLS['pretrained']}",
                                    T_SYMBOLS["pretrained"],
                                ),
                                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
                            ],
                            value=list(T_SYMBOLS.values()),
                        )
                    with gr.Column():
                        model_sizes_misc = RangeSlider(
                            minimum=0,
                            maximum=150,
                            value=(7, 10),
                            label="Select the number of parameters (B)",
                        )
                with gr.Row():
                    langs_bar_misc = gr.CheckboxGroup(
                        choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        clear_misc = gr.ClearButton(
                            langs_bar_misc,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select_misc = gr.Button(
                            value="Select all languages",
                            size="sm",
                            scale=1,
                        )
                        select_misc.click(
                            lambda: gr.CheckboxGroup(value=core.languages_list),
                            inputs=[],
                            outputs=langs_bar_misc,
                        )
                with gr.Row():
                    shown_tasks_misc = gr.CheckboxGroup(
                        choices=core.get_available_task_groups(core.get_selected_task_type(1), False),
                        value=core.get_available_task_groups(core.get_selected_task_type(1), False),
                        label="Select tasks to show",
                        elem_id="column-select",
                        interactive=True,
                        scale=50,
                    )
                    clear_tasks_misc = gr.ClearButton(
                        shown_tasks_misc,
                        value="Deselect all tasks",
                        size="sm",
                        scale=1,
                    )
                    select_all_tasks_misc = gr.Button(
                        value="Select all tasks",
                        size="sm",
                        scale=1,
                    )
                    select_all_tasks_misc.click(
                        lambda: gr.CheckboxGroup(
                            value=core.get_available_task_groups(core.get_selected_task_type(1), False)
                        ),
                        inputs=[],
                        outputs=shown_tasks_misc,
                    )
            leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown", "number"])

        # Tab 2: MT-Bench leaderboard (no model-type, size, or task filters).
        with gr.TabItem(
            "🌐 LLM MT-Bench benchmark",
            elem_id="llm-benchmark-tab-table-mtbench",
            id=2,
        ) as mtbench:
            with gr.Column():
                with gr.Row():
                    search_bar_mtbench = gr.Textbox(
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    langs_bar_mtbench = gr.CheckboxGroup(
                        choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.mt_bench_language_list],
                        value=core.mt_bench_language_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        clear_mtbench = gr.ClearButton(
                            langs_bar_mtbench,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select_mtbench = gr.Button(
                            value="Select all languages",
                            size="sm",
                            scale=1,
                        )
                        select_mtbench.click(
                            lambda: gr.CheckboxGroup(value=core.mt_bench_language_list),
                            inputs=[],
                            outputs=langs_bar_mtbench,
                        )
            leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown", "number"])

    # Re-compute each leaderboard whenever one of its filters changes.
    for comp, fn in [
        (search_bar, "submit"),
        (langs_bar, "change"),
        (shown_tasks, "change"),
        (model_types, "change"),
        (model_sizes, "change"),
    ]:
        getattr(comp, fn)(
            core.update_df,
            [
                gr.State(value=0),
                shown_tasks,
                search_bar,
                langs_bar,
                model_sizes,
                gr.State(value=True),
                model_types,
            ],
            leaderboard_table,
        )

    for comp, fn in [
        (search_bar_zero_shot, "submit"),
        (model_types_zero_shot, "change"),
        (langs_bar_zero_shot, "change"),
        (shown_tasks_zero_shot, "change"),
        (model_sizes_zero_shot, "change"),
    ]:
        getattr(comp, fn)(
            core.update_df,
            [
                gr.State(value=1),
                shown_tasks_zero_shot,
                search_bar_zero_shot,
                langs_bar_zero_shot,
                model_sizes_zero_shot,
                gr.State(value=False),
                model_types_zero_shot,
            ],
            leaderboard_table_zero_shot,
        )

    for comp, fn in [
        (search_bar_misc, "submit"),
        (langs_bar_misc, "change"),
        (shown_tasks_misc, "change"),
        (model_types_misc, "change"),
        (model_sizes_misc, "change"),
    ]:
        getattr(comp, fn)(
            core.update_df,
            [
                gr.State(value=2),
                shown_tasks_misc,
                search_bar_misc,
                langs_bar_misc,
                model_sizes_misc,
                gr.State(value=False),
                model_types_misc,
            ],
            leaderboard_table_misc,
        )

    for comp, fn in [
        (search_bar_mtbench, "submit"),
        (langs_bar_mtbench, "change"),
    ]:
        getattr(comp, fn)(
            core.update_df,
            [
                gr.State(value=3),
                gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
                search_bar_mtbench,
                langs_bar_mtbench,
                gr.State(value=False),
            ],
            leaderboard_table_mtbench,
        )

    # Populate all four tables once when the page loads.
    demo.load(
        fn=core.update_df,
        inputs=[
            gr.State(value=0),
            shown_tasks,
            search_bar,
            langs_bar,
            model_sizes,
            gr.State(value=True),
            model_types,
        ],
        outputs=leaderboard_table,
    )
    demo.load(
        fn=core.update_df,
        inputs=[
            gr.State(value=1),
            shown_tasks_zero_shot,
            search_bar_zero_shot,
            langs_bar_zero_shot,
            model_sizes_zero_shot,
            gr.State(value=False),
            model_types_zero_shot,
        ],
        outputs=leaderboard_table_zero_shot,
    )
    demo.load(
        fn=core.update_df,
        inputs=[
            gr.State(value=2),
            shown_tasks_misc,
            search_bar_misc,
            langs_bar_misc,
            model_sizes_misc,
            gr.State(value=False),
            model_types_misc,
        ],
        outputs=leaderboard_table_misc,
    )
    # The MT-Bench tab has no model-type checkbox, hence no model_types input here.
    demo.load(
        fn=core.update_df,
        inputs=[
            gr.State(value=3),
            gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
            search_bar_mtbench,
            langs_bar_mtbench,
            gr.State(value=False),
        ],
        outputs=leaderboard_table_mtbench,
    )

demo.launch()