Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import glob | |
| from collections import defaultdict | |
| import pandas as pd | |
| import gradio as gr | |
| from content import * | |
| from css import * | |
| import glob | |
| ARC = "arc" | |
| HELLASWAG = "hellaswag" | |
| MMLU = "mmlu" | |
| TRUTHFULQA = "truthfulqa" | |
| BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA] | |
| METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] | |
| LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',') | |
| LANG_NAME = { | |
| 'ar': 'Arabic', | |
| 'bn': 'Bengali', | |
| 'ca': 'Catalan', | |
| 'da': 'Danish', | |
| 'de': 'German', | |
| 'es': 'Spanish', | |
| 'eu': 'Basque', | |
| 'fr': 'French', | |
| 'gu': 'Gujarati', | |
| 'hi': 'Hindi', | |
| 'hr': 'Croatian', | |
| 'hu': 'Hungarian', | |
| 'hy': 'Armenian', | |
| 'id': 'Indonesian', | |
| 'it': 'Italian', | |
| 'kn': 'Kannada', | |
| 'ml': 'Malayalam', | |
| 'mr': 'Marathi', | |
| 'ne': 'Nepali', | |
| 'nl': 'Dutch', | |
| 'pt': 'Portuguese', | |
| 'ro': 'Romanian', | |
| 'ru': 'Russian', | |
| 'sk': 'Slovak', | |
| 'sr': 'Serbian', | |
| 'sv': 'Swedish', | |
| 'ta': 'Tamil', | |
| 'te': 'Telugu', | |
| 'uk': 'Ukrainian', | |
| 'vi': 'Vietnamese', | |
| 'zh': 'Chinese' | |
| } | |
| def collect_results(): | |
| performance_dict = defaultdict(dict) | |
| pretrained_models = set() | |
| for file in glob.glob('evals/*/*.json'): | |
| with open(file, 'r') as f: | |
| data = json.load(f) | |
| if 'results' not in data: | |
| continue | |
| if 'config' not in data: | |
| continue | |
| results = data['results'] | |
| config = data['config'] | |
| if 'model_args' not in config: | |
| continue | |
| model_args = config['model_args'].split(',') | |
| pretrained = [x for x in model_args if x.startswith('pretrained=')] | |
| if len(pretrained) != 1: | |
| continue | |
| pretrained = pretrained[0].split('=')[1] | |
| pretrained = pretrained.split('/')[-1] | |
| pretrained_models.add(pretrained) | |
| for lang_task, perfs in results.items(): | |
| task, lang = lang_task.split('_') | |
| assert task in BENCHMARKS | |
| if lang and task: | |
| metric = METRICS[BENCHMARKS.index(task)] | |
| p = round(perfs[metric] * 100, 1) | |
| performance_dict[(pretrained, lang)][task] = p | |
| return performance_dict, pretrained_models | |
| def get_leaderboard_df(performance_dict, pretrained_models): | |
| df = list() | |
| for (pretrained, lang), perfs in performance_dict.items(): | |
| lang_name = LANG_NAME[lang] | |
| arc_perf = perfs.get(ARC, 0.0) | |
| hellaswag_perf = perfs.get(HELLASWAG, 0.0) | |
| mmlu_perf = perfs.get(MMLU, 0.0) | |
| truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0) | |
| if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0: | |
| continue | |
| avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1) | |
| notes = ' '.join([pretrained, lang_name]) | |
| row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes] | |
| df.append(row) | |
| df = pd.DataFrame.from_records(df, columns=COLS) | |
| df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False) | |
| df = df[COLS] | |
| return df | |
| def search_table(df, query): | |
| filtered_df = df[df[NOTES_COL].str.contains(query, case=False)] | |
| return filtered_df | |
| MODEL_COL = "Model" | |
| LANG_COL = "Language" | |
| CODE_COL = "Code" | |
| AVERAGE_COL = "Average" | |
| ARC_COL = "ARC (25-shot)" | |
| HELLASWAG_COL = "HellaSwag (0-shot)️" | |
| MMLU_COL = "MMLU (25-shot)" | |
| TRUTHFULQA_COL = "TruthfulQA (0-shot)" | |
| NOTES_COL = "Notes" # For search only | |
| COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL] | |
| TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"] | |
| args = collect_results() | |
| original_df = get_leaderboard_df(*args) | |
| demo = gr.Blocks(css=CUSTOM_CSS) | |
| with demo: | |
| gr.HTML(TITLE) | |
| gr.Markdown(INTRO_TEXT, elem_classes="markdown-text") | |
| gr.Markdown(HOW_TO, elem_classes="markdown-text") | |
| with gr.Group(): | |
| search_bar = gr.Textbox( | |
| placeholder="Search models and languages...", show_label=False, elem_id="search-bar" | |
| ) | |
| leaderboard_table = gr.components.Dataframe( | |
| value=original_df, | |
| headers=COLS, | |
| datatype=TYPES, | |
| # max_rows=5, | |
| elem_id="leaderboard-table", | |
| ) | |
| # # Dummy leaderboard for handling the case when the user uses backspace key | |
| hidden_leaderboard_table_for_search = gr.components.Dataframe( | |
| value=original_df, | |
| headers=COLS, | |
| datatype=TYPES, | |
| # max_rows=5, | |
| visible=False | |
| ) | |
| search_bar.change( | |
| search_table, | |
| [hidden_leaderboard_table_for_search, search_bar], | |
| leaderboard_table, | |
| ) | |
| gr.Markdown(CREDIT, elem_classes="markdown-text") | |
| gr.Markdown(CITATION, elem_classes="markdown-text") | |
| demo.launch() | |