Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Add search capability and language names
Browse files- app.py +70 -6
- content.py +1 -1
- css.py +13 -0
    	
        app.py
    CHANGED
    
    | @@ -2,8 +2,10 @@ import os | |
| 2 | 
             
            import json
         | 
| 3 | 
             
            import glob
         | 
| 4 | 
             
            from collections import defaultdict
         | 
|  | |
| 5 | 
             
            import gradio as gr
         | 
| 6 | 
             
            from content import *
         | 
|  | |
| 7 | 
             
            import glob
         | 
| 8 |  | 
| 9 | 
             
            ARC = "arc"
         | 
| @@ -14,6 +16,42 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA] | |
| 14 |  | 
| 15 | 
             
            METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
         | 
| 16 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 17 |  | 
| 18 | 
             
            def collect_results():
         | 
| 19 | 
             
                performance_dict = defaultdict(dict)
         | 
| @@ -52,6 +90,7 @@ def collect_results(): | |
| 52 | 
             
            def get_leaderboard_df(performance_dict, pretrained_models):
         | 
| 53 | 
             
                df = list()
         | 
| 54 | 
             
                for (pretrained, lang), perfs in performance_dict.items():
         | 
|  | |
| 55 | 
             
                    arc_perf = perfs.get(ARC, 0.0)
         | 
| 56 | 
             
                    hellaswag_perf = perfs.get(HELLASWAG, 0.0)
         | 
| 57 | 
             
                    mmlu_perf = perfs.get(MMLU, 0.0)
         | 
| @@ -60,26 +99,40 @@ def get_leaderboard_df(performance_dict, pretrained_models): | |
| 60 | 
             
                    if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
         | 
| 61 | 
             
                        continue
         | 
| 62 | 
             
                    avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
         | 
| 63 | 
            -
                     | 
|  | |
| 64 | 
             
                    df.append(row)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 65 | 
             
                return df
         | 
| 66 |  | 
| 67 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 68 | 
             
            MODEL_COL = "Model"
         | 
| 69 | 
             
            LANG_COL = "Language"
         | 
|  | |
| 70 | 
             
            AVERAGE_COL = "Average"
         | 
| 71 | 
             
            ARC_COL = "ARC (25-shot)"
         | 
| 72 | 
             
            HELLASWAG_COL = "HellaSwag (10-shot)️"
         | 
| 73 | 
             
            MMLU_COL = "MMLU (5-shot)"
         | 
| 74 | 
             
            TRUTHFULQA_COL = "TruthfulQA (0-shot)"
         | 
|  | |
| 75 |  | 
| 76 | 
            -
            COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
         | 
| 77 | 
            -
            TYPES = ["str", "str", "number", "number", "number", "number", "number"]
         | 
| 78 |  | 
| 79 | 
             
            args = collect_results()
         | 
| 80 | 
            -
             | 
| 81 |  | 
| 82 | 
            -
            demo = gr.Blocks()
         | 
| 83 | 
             
            with demo:
         | 
| 84 | 
             
                gr.HTML(TITLE)
         | 
| 85 | 
             
                gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
         | 
| @@ -91,13 +144,24 @@ with demo: | |
| 91 | 
             
                    )
         | 
| 92 |  | 
| 93 | 
             
                    leaderboard_table = gr.components.Dataframe(
         | 
| 94 | 
            -
                        value= | 
| 95 | 
             
                        headers=COLS,
         | 
| 96 | 
             
                        datatype=TYPES,
         | 
| 97 | 
             
                        max_rows=5,
         | 
| 98 | 
             
                        elem_id="leaderboard-table",
         | 
| 99 | 
             
                    )
         | 
| 100 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 101 | 
             
                gr.Markdown(CREDIT, elem_classes="markdown-text")
         | 
| 102 | 
             
                gr.Markdown(CITATION, elem_classes="markdown-text")
         | 
| 103 |  | 
|  | |
| 2 | 
             
            import json
         | 
| 3 | 
             
            import glob
         | 
| 4 | 
             
            from collections import defaultdict
         | 
| 5 | 
            +
            import pandas as pd
         | 
| 6 | 
             
            import gradio as gr
         | 
| 7 | 
             
            from content import *
         | 
| 8 | 
            +
            from css import *
         | 
| 9 | 
             
            import glob
         | 
| 10 |  | 
| 11 | 
             
            ARC = "arc"
         | 
|  | |
| 16 |  | 
| 17 | 
             
            METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
         | 
| 18 |  | 
| 19 | 
            +
            LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            LANG_NAME = {
         | 
| 22 | 
            +
                'ar': 'Arabic',
         | 
| 23 | 
            +
                'bn': 'Bengali',
         | 
| 24 | 
            +
                'ca': 'Catalan',
         | 
| 25 | 
            +
                'da': 'Danish',
         | 
| 26 | 
            +
                'de': 'German',
         | 
| 27 | 
            +
                'es': 'Spanish',
         | 
| 28 | 
            +
                'eu': 'Basque',
         | 
| 29 | 
            +
                'fr': 'French',
         | 
| 30 | 
            +
                'gu': 'Gujarati',
         | 
| 31 | 
            +
                'hi': 'Hindi',
         | 
| 32 | 
            +
                'hr': 'Croatian',
         | 
| 33 | 
            +
                'hu': 'Hungarian',
         | 
| 34 | 
            +
                'hy': 'Armenian',
         | 
| 35 | 
            +
                'id': 'Indonesian',
         | 
| 36 | 
            +
                'it': 'Italian',
         | 
| 37 | 
            +
                'kn': 'Kannada',
         | 
| 38 | 
            +
                'ml': 'Malayalam',
         | 
| 39 | 
            +
                'mr': 'Marathi',
         | 
| 40 | 
            +
                'ne': 'Nepali',
         | 
| 41 | 
            +
                'nl': 'Dutch',
         | 
| 42 | 
            +
                'pt': 'Portuguese',
         | 
| 43 | 
            +
                'ro': 'Romanian',
         | 
| 44 | 
            +
                'ru': 'Russian',
         | 
| 45 | 
            +
                'sk': 'Slovak',
         | 
| 46 | 
            +
                'sr': 'Serbian',
         | 
| 47 | 
            +
                'sv': 'Swedish',
         | 
| 48 | 
            +
                'ta': 'Tamil',
         | 
| 49 | 
            +
                'te': 'Telugu',
         | 
| 50 | 
            +
                'uk': 'Ukrainian',
         | 
| 51 | 
            +
                'vi': 'Vietnamese',
         | 
| 52 | 
            +
                'zh': 'Chinese'
         | 
| 53 | 
            +
            }
         | 
| 54 | 
            +
             | 
| 55 |  | 
| 56 | 
             
            def collect_results():
         | 
| 57 | 
             
                performance_dict = defaultdict(dict)
         | 
|  | |
| 90 | 
             
            def get_leaderboard_df(performance_dict, pretrained_models):
         | 
| 91 | 
             
                df = list()
         | 
| 92 | 
             
                for (pretrained, lang), perfs in performance_dict.items():
         | 
| 93 | 
            +
                    lang_name = LANG_NAME[lang]
         | 
| 94 | 
             
                    arc_perf = perfs.get(ARC, 0.0)
         | 
| 95 | 
             
                    hellaswag_perf = perfs.get(HELLASWAG, 0.0)
         | 
| 96 | 
             
                    mmlu_perf = perfs.get(MMLU, 0.0)
         | 
|  | |
| 99 | 
             
                    if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
         | 
| 100 | 
             
                        continue
         | 
| 101 | 
             
                    avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
         | 
| 102 | 
            +
                    notes = ' '.join([pretrained, lang_name, lang])
         | 
| 103 | 
            +
                    row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
         | 
| 104 | 
             
                    df.append(row)
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                df = pd.DataFrame.from_records(df, columns=COLS)
         | 
| 107 | 
            +
                df = df.sort_values(by=[AVERAGE_COL], ascending=False)
         | 
| 108 | 
            +
                df = df[COLS]
         | 
| 109 | 
            +
             | 
| 110 | 
             
                return df
         | 
| 111 |  | 
| 112 |  | 
| 113 | 
            +
            def search_table(df, query):
                """Return the rows of *df* whose notes column contains *query*.

                The match is a case-insensitive, literal substring test against the
                column named by ``NOTES_COL``.

                Args:
                    df: DataFrame that has a ``NOTES_COL`` column of strings.
                    query: Text typed by the user; may be empty or ``None``.

                Returns:
                    The matching rows of ``df``; ``df`` itself when the query is empty.
                """
                # An empty query matches everything, so skip the scan entirely.
                if not query:
                    return df
                # regex=False: the query is raw user input, so characters such as
                # "(" or "+" must be matched literally instead of being compiled as
                # a pattern (an unbalanced "(" would otherwise raise re.error).
                # na=False: keep the mask strictly boolean if a cell is missing.
                mask = df[NOTES_COL].str.contains(query, case=False, regex=False, na=False)
                return df[mask]
         | 
| 116 | 
            +
             | 
| 117 | 
            +
             | 
| 118 | 
            +
             | 
| 119 | 
             
            MODEL_COL = "Model"
         | 
| 120 | 
             
            LANG_COL = "Language"
         | 
| 121 | 
            +
            CODE_COL = "Code"
         | 
| 122 | 
             
            AVERAGE_COL = "Average"
         | 
| 123 | 
             
            ARC_COL = "ARC (25-shot)"
         | 
| 124 | 
             
            HELLASWAG_COL = "HellaSwag (10-shot)️"
         | 
| 125 | 
             
            MMLU_COL = "MMLU (5-shot)"
         | 
| 126 | 
             
            TRUTHFULQA_COL = "TruthfulQA (0-shot)"
         | 
| 127 | 
            +
            NOTES_COL = "Notes"  # For search only
         | 
| 128 |  | 
| 129 | 
            +
            COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
         | 
| 130 | 
            +
            TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]
         | 
| 131 |  | 
| 132 | 
             
            args = collect_results()
         | 
| 133 | 
            +
            original_df = get_leaderboard_df(*args)
         | 
| 134 |  | 
| 135 | 
            +
            demo = gr.Blocks(css=CUSTOM_CSS)
         | 
| 136 | 
             
            with demo:
         | 
| 137 | 
             
                gr.HTML(TITLE)
         | 
| 138 | 
             
                gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
         | 
|  | |
| 144 | 
             
                    )
         | 
| 145 |  | 
| 146 | 
             
                    leaderboard_table = gr.components.Dataframe(
         | 
| 147 | 
            +
                        value=original_df,
         | 
| 148 | 
             
                        headers=COLS,
         | 
| 149 | 
             
                        datatype=TYPES,
         | 
| 150 | 
             
                        max_rows=5,
         | 
| 151 | 
             
                        elem_id="leaderboard-table",
         | 
| 152 | 
             
                    )
         | 
| 153 |  | 
| 154 | 
            +
                    # Hidden copy of the full leaderboard: every search filters this original data, so deleting characters (backspace) widens the results again
         | 
| 155 | 
            +
                    hidden_leaderboard_table_for_search = gr.components.Dataframe(
         | 
| 156 | 
            +
                        value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
         | 
| 157 | 
            +
                    )
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                    search_bar.change(
         | 
| 160 | 
            +
                        search_table,
         | 
| 161 | 
            +
                        [hidden_leaderboard_table_for_search, search_bar],
         | 
| 162 | 
            +
                        leaderboard_table,
         | 
| 163 | 
            +
                    )
         | 
| 164 | 
            +
             | 
| 165 | 
             
                gr.Markdown(CREDIT, elem_classes="markdown-text")
         | 
| 166 | 
             
                gr.Markdown(CITATION, elem_classes="markdown-text")
         | 
| 167 |  | 
    	
        content.py
    CHANGED
    
    | @@ -3,7 +3,7 @@ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Le | |
| 3 | 
             
            INTRO_TEXT = f"""
         | 
| 4 | 
             
            ## About
         | 
| 5 |  | 
| 6 | 
            -
            This leaderboard shows the performance of pretrained models in 29 languages  | 
| 7 |  | 
| 8 | 
             
            - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) 
         | 
| 9 | 
             
            - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) 
         | 
|  | |
| 3 | 
             
            INTRO_TEXT = f"""
         | 
| 4 | 
             
            ## About
         | 
| 5 |  | 
| 6 | 
            +
            This leaderboard shows the performance of pretrained models in 31 languages including Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch, French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam, Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish, Tamil, Telugu, Ukrainian, and Vietnamese on four benchmarks:
         | 
| 7 |  | 
| 8 | 
             
            - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) 
         | 
| 9 | 
             
            - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) 
         | 
    	
        css.py
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            CUSTOM_CSS= """
         | 
| 2 | 
            +
            /* Hides the final column */
         | 
| 3 | 
            +
            table td:last-child,
         | 
| 4 | 
            +
            table th:last-child {
         | 
| 5 | 
            +
                display: none;
         | 
| 6 | 
            +
            }
         | 
| 7 | 
            +
            # table td:first-child,
         | 
| 8 | 
            +
            # table th:first-child {
         | 
| 9 | 
            +
            #     max-width: 400px;
         | 
| 10 | 
            +
            #     overflow: auto;
         | 
| 11 | 
            +
            #     white-space: nowrap;
         | 
| 12 | 
            +
            # }
         | 
| 13 | 
            +
            """
         | 
