Ahmed Ahmed committed
Commit 77c0f20 · 1 Parent(s): 70ea05e

consolidate

Files changed (6)
  1. app.py +32 -182
  2. src/about.py +24 -41
  3. src/display/utils.py +3 -1
  4. src/envs.py +2 -11
  5. src/leaderboard/read_evals.py +31 -79
  6. src/populate.py +4 -45
app.py CHANGED
@@ -1,13 +1,9 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from gradio_leaderboard import Leaderboard
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download

 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -16,80 +12,24 @@ from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.populate import get_leaderboard_df
 from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval

-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
+        dataframe,
+        headers=COLS,
+        column_config={
+            AutoEvalColumn.model.name: "markdown",
+        },
     )

-
 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""
     if not model_name:
@@ -102,140 +42,50 @@ def run_perplexity_test(model_name, revision, precision):
     else:
         return f"❌ Evaluation failed: {result}"

+# Initialize results directory
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception as e:
+    print(f"Error initializing results: {e}")

+# Get initial leaderboard data
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+
+# Create the Gradio interface
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 Leaderboard", elem_id="leaderboard-tab", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)

-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="about-tab", id=1):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
+        with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
             with gr.Row():
                 with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
+                    model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
+                    revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                     precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-        with gr.TabItem("🧪 Dynamic Testing", elem_id="dynamic-testing-tab", id=4):
-            gr.Markdown("## Run Perplexity Evaluation")
-
-            with gr.Row():
-                with gr.Column():
-                    dynamic_model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
-                    dynamic_revision = gr.Textbox(label="Revision", placeholder="main", value="main")
-                    dynamic_precision = gr.Dropdown(
                         choices=["float16", "bfloat16"],
                         label="Precision",
                         value="float16"
                     )

                 with gr.Column():
-                    dynamic_test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
-                    dynamic_result = gr.Markdown()
+                    test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
+                    result = gr.Markdown()

-            dynamic_test_button.click(
+            test_button.click(
                 run_perplexity_test,
-                [dynamic_model_name, dynamic_revision, dynamic_precision],
-                dynamic_result
-            )
-
-        with gr.Row():
-            with gr.Accordion("📙 Citation", open=False):
-                citation_button = gr.Textbox(
-                    value=CITATION_BUTTON_TEXT,
-                    label=CITATION_BUTTON_LABEL,
-                    lines=20,
-                    elem_id="citation-button",
-                    show_copy_button=True,
+                [model_name, revision, precision],
+                result
             )

-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=5).launch()
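Note: the consolidated app now only mirrors the results dataset and builds `LEADERBOARD_DF` from whatever JSON files it finds there. For orientation, below is a hypothetical result file this pipeline could ingest; only `config.model_sha` and `results.perplexity.perplexity` are read explicitly in this commit, so the `model_name`/`model_dtype` keys are assumptions carried over from the upstream demo-leaderboard schema.

```python
# Hypothetical example of a result file under EVAL_RESULTS_PATH; a sketch, not the
# Space's actual output format. Keys marked "assumed" are not confirmed by this commit.
import json
import os

EVAL_RESULTS_PATH = "eval-results"  # mirrors src.envs.EVAL_RESULTS_PATH

result = {
    "config": {
        "model_name": "org/model-name",   # assumed key (upstream template)
        "model_dtype": "torch.float16",   # assumed key (upstream template)
        "model_sha": "main",              # read via config.get("model_sha", "")
    },
    "results": {
        "perplexity": {"perplexity": 12.34}  # read by EvalResult.init_from_json_file
    },
}

os.makedirs(os.path.join(EVAL_RESULTS_PATH, "org"), exist_ok=True)
with open(os.path.join(EVAL_RESULTS_PATH, "org", "results_model-name.json"), "w") as f:
    json.dump(result, f, indent=2)
```

Since `get_raw_eval_results` simply walks `results_path` for `.json` files, a file in this shape should surface as a leaderboard row on the next refresh.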
 
 
 
src/about.py CHANGED
@@ -7,67 +7,50 @@ class Task:
     metric: str
     col_name: str

-
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task2 = Task("perplexity", "perplexity", "Perplexity")
+    task0 = Task("perplexity", "perplexity", "Perplexity")

-NUM_FEWSHOT = 0 # Change with your few shot
+NUM_FEWSHOT = 0 # Not used for perplexity
 # ---------------------------------------------------

-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Model Perplexity Leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+This leaderboard evaluates language models based on their perplexity scores on a fixed test passage.
+Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
 """

-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
+# Which evaluations are you running?
+LLM_BENCHMARKS_TEXT = """
 ## How it works

-## Reproducibility
-To reproduce our results, here is the commands you can run:
+The evaluation runs perplexity tests on language models using a fixed test passage about artificial intelligence.
+Perplexity measures how well a model predicts text - lower scores mean better predictions.

-"""
+## Test Text

-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+The evaluation uses the following passage:
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
+From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
+sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
+with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
+with these important social considerations.
+```
+"""

-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+EVALUATION_QUEUE_TEXT = """
+## Before submitting a model

-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+1. Make sure your model is public on the Hugging Face Hub
+2. The model should be loadable with AutoModelForCausalLM
+3. The model should support text generation tasks
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
+CITATION_BUTTON_TEXT = ""
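For readers unfamiliar with the metric the new "How it works" text describes: perplexity is the exponential of the average negative log-likelihood a model assigns to the passage's tokens. Below is a minimal sketch of such a measurement, assuming a standard `transformers` causal LM; it is illustrative only and is not the Space's actual `run_dynamic_perplexity_eval` (which lives in `src/evaluation/dynamic_eval.py` and is untouched by this commit).

```python
# Illustrative perplexity computation: exp(mean cross-entropy) of a causal LM on a text.
# Not the Space's implementation; model_name and text are placeholders.
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def compute_perplexity(model_name: str, text: str, revision: str = "main") -> float:
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
    model = AutoModelForCausalLM.from_pretrained(model_name, revision=revision)
    model.eval()

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # Passing labels makes the model return the mean next-token cross-entropy loss
        outputs = model(**inputs, labels=inputs["input_ids"])
    return math.exp(outputs.loss.item())
```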
 
src/display/utils.py CHANGED
@@ -28,7 +28,9 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # Add ⬆️ for metrics where higher is better, ⬇️ for metrics where lower is better
+    arrow = "⬇️" if task.value.benchmark == "perplexity" else "⬆️"
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"{task.value.col_name} {arrow}", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/envs.py CHANGED
@@ -1,25 +1,16 @@
 import os
-
 from huggingface_hub import HfApi

 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "model-trace" # Change to your org
 # ----------------------------------

-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

-# If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

 API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -4,18 +4,13 @@ import math
 import os
 from dataclasses import dataclass

-import dateutil
-import numpy as np
-
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub

-
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
+    """Represents one perplexity evaluation result."""
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
     org: str
@@ -23,13 +18,9 @@ class EvalResult:
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
+    model_type: ModelType = ModelType.PT # Default to pretrained
+    weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown"
     still_on_hub: bool = False

     @classmethod
@@ -66,18 +57,10 @@ class EvalResult:
         if architectures:
             architecture = ";".join(architectures)

-        # Extract results available in this file (some results are split in several files)
+        # Extract perplexity result
         results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+        if "perplexity" in data["results"]:
+            results["perplexity"] = data["results"]["perplexity"]["perplexity"]

         return self(
             eval_name=result_key,
@@ -85,31 +68,27 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
         )

-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # Calculate average, handling perplexity (lower is better)
+        scores = []
+        for task in Tasks:
+            if task.value.benchmark in self.results:
+                score = self.results[task.value.benchmark]
+                # Convert perplexity to a 0-100 scale where lower perplexity = higher score
+                # Using a log scale since perplexity can vary widely
+                # Cap at 100 for very low perplexity and 0 for very high perplexity
+                score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
+                scores.append(score)
+
+        average = sum(scores) / len(scores) if scores else 0
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -120,42 +99,22 @@
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }

         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            benchmark = task.value.benchmark
+            if benchmark in self.results:
+                score = self.results[benchmark]
+                # Store original perplexity score (lower is better)
+                data_dict[task.value.col_name] = score
+            else:
+                data_dict[task.value.col_name] = None

         return data_dict

-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all perplexity results"""
     model_result_filepaths = []

     for root, _, files in os.walk(results_path):
@@ -163,12 +122,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue

-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

@@ -176,7 +129,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
         eval_name = eval_result.eval_name
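The `to_dict` change above keeps the raw perplexity in the task column (lower is better) but converts it to a 0-100 "higher is better" score for the Average column, using `100 * (1 - ln(ppl) / 10)` clamped to [0, 100]. A few reference points for that mapping:

```python
# Reference values for the Average-score mapping used in EvalResult.to_dict above.
import math

def ppl_to_score(ppl: float) -> float:
    return max(0, min(100, 100 * (1 - math.log(ppl) / 10)))

for ppl in (1.0, 10.0, 100.0, 1000.0, math.e ** 10):
    print(f"perplexity {ppl:10.2f} -> average score {ppl_to_score(ppl):6.2f}")
# perplexity 1 -> 100.00, 10 -> 76.97, 100 -> 53.95, 1000 -> 30.92, ~22026.47 -> 0.00
```

So perplexities below 1 are capped at 100, and anything at or above e^10 (about 22026) floors at 0.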
src/populate.py CHANGED
@@ -1,58 +1,17 @@
-import json
-import os
-
 import pandas as pd
-
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_eval_results

-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)

-    # filter out if any of the benchmarks have not been produced
+    # filter out if perplexity hasn't been evaluated
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
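Because the Average column already holds the converted 0-100 score, sorting descending still ranks the lowest-perplexity model first, and the `has_no_nan_values` filter drops any row whose perplexity column is missing. A stand-in illustration with simplified column names (the real names come from `AutoEvalColumn` and `COLS`):

```python
# Stand-in for the sort-then-filter behaviour of get_leaderboard_df; column names simplified.
import pandas as pd

df = pd.DataFrame(
    [
        {"Model": "org/a", "Average ⬆️": 76.97, "Perplexity ⬇️": 10.0},
        {"Model": "org/b", "Average ⬆️": 53.95, "Perplexity ⬇️": 100.0},
        {"Model": "org/c", "Average ⬆️": 0.0, "Perplexity ⬇️": None},  # never evaluated
    ]
)

df = df.sort_values(by="Average ⬆️", ascending=False)
df = df[df[["Perplexity ⬇️"]].notna().all(axis=1)]  # mirrors has_no_nan_values
print(df)  # org/a ranked first; org/c dropped
```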