meg-huggingface committed on
Commit
c3d29b7
·
1 Parent(s): aa977da

Fresh new look

Browse files
Files changed (6) hide show
  1. app.py +26 -10
  2. requirements.txt +0 -1
  3. src/about.py +42 -46
  4. src/display/utils.py +1 -1
  5. src/envs.py +2 -4
  6. src/leaderboard/read_evals.py +25 -22
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import subprocess
2
  import gradio as gr
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -35,14 +34,12 @@ def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
36
 
37
  try:
38
- print(EVAL_REQUESTS_PATH)
39
  snapshot_download(
40
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
  )
42
  except Exception:
43
  restart_space()
44
  try:
45
- print(EVAL_RESULTS_PATH)
46
  snapshot_download(
47
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
  )
@@ -60,17 +57,18 @@ leaderboard_df = original_df.copy()
60
  pending_eval_queue_df,
61
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
62
 
63
-
64
  # Searching and filtering
65
  def update_table(
66
  hidden_df: pd.DataFrame,
67
- columns: list,
 
68
  type_query: list,
69
  precision_query: str,
70
  size_query: list,
71
  show_deleted: bool,
72
  query: str,
73
  ):
 
74
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
75
  filtered_df = filter_queries(query, filtered_df)
76
  df = select_columns(filtered_df, columns)
@@ -139,7 +137,7 @@ with demo:
139
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
140
 
141
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
142
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
143
  with gr.Row():
144
  with gr.Column():
145
  with gr.Row():
@@ -153,15 +151,31 @@ with demo:
153
  choices=[
154
  c.name
155
  for c in fields(AutoEvalColumn)
156
- if not c.hidden and not c.never_hidden
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  ],
158
  value=[
159
  c.name
160
  for c in fields(AutoEvalColumn)
161
  if c.displayed_by_default and not c.hidden and not c.never_hidden
162
  ],
163
- label="Select columns to show",
164
- elem_id="column-select",
165
  interactive=True,
166
  )
167
  with gr.Row():
@@ -216,6 +230,7 @@ with demo:
216
  [
217
  hidden_leaderboard_table_for_search,
218
  shown_columns,
 
219
  filter_columns_type,
220
  filter_columns_precision,
221
  filter_columns_size,
@@ -224,12 +239,13 @@ with demo:
224
  ],
225
  leaderboard_table,
226
  )
227
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
228
  selector.change(
229
  update_table,
230
  [
231
  hidden_leaderboard_table_for_search,
232
  shown_columns,
 
233
  filter_columns_type,
234
  filter_columns_precision,
235
  filter_columns_size,
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
 
34
  API.restart_space(repo_id=REPO_ID)
35
 
36
  try:
 
37
  snapshot_download(
38
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
39
  )
40
  except Exception:
41
  restart_space()
42
  try:
 
43
  snapshot_download(
44
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
45
  )
 
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
 
60
  # Searching and filtering
61
  def update_table(
62
  hidden_df: pd.DataFrame,
63
+ shown_columns: list,
64
+ other_columns: list,
65
  type_query: list,
66
  precision_query: str,
67
  size_query: list,
68
  show_deleted: bool,
69
  query: str,
70
  ):
71
+ columns = shown_columns + other_columns
72
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
73
  filtered_df = filter_queries(query, filtered_df)
74
  df = select_columns(filtered_df, columns)
 
137
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
138
 
139
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
140
+ with gr.TabItem("🏅 Toxicity Scores", elem_id="llm-benchmark-tab-table", id=0):
141
  with gr.Row():
142
  with gr.Column():
143
  with gr.Row():
 
151
  choices=[
152
  c.name
153
  for c in fields(AutoEvalColumn)
154
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
155
+ ],
156
+ value=[
157
+ c.name
158
+ for c in fields(AutoEvalColumn)
159
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
160
+ ],
161
+ label="Select metrics to show",
162
+ elem_id="metrics-column-select",
163
+ interactive=True,
164
+ )
165
+ with gr.Row():
166
+ other_columns = gr.CheckboxGroup(
167
+ choices=[
168
+ c.name
169
+ for c in fields(AutoEvalColumn)
170
+ if not c.displayed_by_default and not c.hidden and not c.never_hidden
171
  ],
172
  value=[
173
  c.name
174
  for c in fields(AutoEvalColumn)
175
  if c.displayed_by_default and not c.hidden and not c.never_hidden
176
  ],
177
+ label="Select metadata to show",
178
+ elem_id="metadata-column-select",
179
  interactive=True,
180
  )
181
  with gr.Row():
 
230
  [
231
  hidden_leaderboard_table_for_search,
232
  shown_columns,
233
+ other_columns,
234
  filter_columns_type,
235
  filter_columns_precision,
236
  filter_columns_size,
 
239
  ],
240
  leaderboard_table,
241
  )
242
+ for selector in [shown_columns, other_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
243
  selector.change(
244
  update_table,
245
  [
246
  hidden_leaderboard_table_for_search,
247
  shown_columns,
248
+ other_columns,
249
  filter_columns_type,
250
  filter_columns_precision,
251
  filter_columns_size,
requirements.txt CHANGED
@@ -13,6 +13,5 @@ requests
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
- git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
  accelerate
18
  sentencepiece
 
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
 
16
  accelerate
17
  sentencepiece
src/about.py CHANGED
@@ -12,26 +12,39 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- #task0 = Task("realtoxicityprompts", "perspective_api_toxicity_score", "Toxicity")
16
- task0 = Task("toxigen", "acc_norm", "Synthetic Toxicity")
17
- #task2 = Task("logiqa", "acc_norm", "LogiQA")
 
 
 
18
 
19
- NUM_FEWSHOT = 0 # Change with your few shot
20
  # ---------------------------------------------------
 
21
 
 
 
 
22
 
 
23
 
24
- # Your leaderboard name
25
- TITLE = """<h1 align="center" id="space-title">Toxicity leaderboard</h1>"""
26
 
27
- # What does your leaderboard evaluate?
28
- INTRODUCTION_TEXT = """
29
- # How "toxic" is the language that might be generated from an LLM?
30
- ## This leaderboard directly addresses this question by applying well-known toxicity evaluation approaches:
31
 
32
- **Toxicity:** Uses Allen AI's [Real Toxicity Prompts](https://huggingface.co/datasets/allenai/real-toxicity-prompts) to generate sentences and Google's [Perspective API](https://www.perspectiveapi.com) to score their toxicity. [[Source](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/realtoxicityprompts)]
33
 
34
- **Synthetic Toxicity:** Uses Microsoft's machine-generated ("synthetic") [dataset for hate speech detection, Toxigen](https://github.com/microsoft/TOXIGEN) and corresponding classifier to score their toxicity. [[Source](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/toxigen)]
 
 
 
 
 
 
 
 
 
 
 
35
  """
36
 
37
  # Which evaluations are you running? how can people reproduce what you have?
@@ -39,7 +52,16 @@ LLM_BENCHMARKS_TEXT = f"""
39
  ## How it works
40
 
41
  ## Reproducibility
42
- To reproduce our results, here is the commands you can run:
 
 
 
 
 
 
 
 
 
43
 
44
  """
45
 
@@ -79,36 +101,16 @@ CITATION_BUTTON_TEXT = r"""@misc{toxicity-leaderboard,
79
  title = {Toxicity Leaderboard},
80
  year = {2024},
81
  publisher = {Hugging Face},
82
- howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}",
83
  }
84
 
85
- @software{eval-harness,
86
- author = {Gao, Leo and
87
- Tow, Jonathan and
88
- Biderman, Stella and
89
- Black, Sid and
90
- DiPofi, Anthony and
91
- Foster, Charles and
92
- Golding, Laurence and
93
- Hsu, Jeffrey and
94
- McDonell, Kyle and
95
- Muennighoff, Niklas and
96
- Phang, Jason and
97
- Reynolds, Laria and
98
- Tang, Eric and
99
- Thite, Anish and
100
- Wang, Ben and
101
- Wang, Kevin and
102
- Zou, Andy},
103
- title = {A framework for few-shot language model evaluation},
104
- month = sep,
105
- year = 2021,
106
- publisher = {Zenodo},
107
- version = {v0.0.1},
108
- doi = {10.5281/zenodo.5371628},
109
- url = {https://doi.org/10.5281/zenodo.5371628},
110
  }
111
-
112
  @article{gehman2020realtoxicityprompts,
113
  title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
114
  author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
@@ -116,12 +118,6 @@ CITATION_BUTTON_TEXT = r"""@misc{toxicity-leaderboard,
116
  year={2020}
117
  }
118
 
119
- @inproceedings{hartvigsen2022toxigen,
120
- title = "{T}oxi{G}en: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection",
121
- author = "Hartvigsen, Thomas and Gabriel, Saadia and Palangi, Hamid and Sap, Maarten and Ray, Dipankar and Kamar, Ece",
122
- booktitle = "Proceedings of the 60th Annual Meeting of the Association of Computational Linguistics",
123
- year = "2022"
124
- }
125
 
126
 
127
  """
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("realtoxicityprompts", "toxicity", "General Toxicity")
16
+ task1 = Task("realtoxicityprompts", "severe_toxicity", "Severe Toxicity")
17
+ task2 = Task("realtoxicityprompts", "identity_attack", "Identity Attack")
18
+ task3 = Task("realtoxicityprompts", "insult", "Insult")
19
+ task4 = Task("realtoxicityprompts", "profanity", "Profanity")
20
+ task5 = Task("realtoxicityprompts", "threat", "Threat")
21
 
 
22
  # ---------------------------------------------------
23
+ TITLE = """<h1 align="center" id="space-title">Toxicity leaderboard</h1>"""
24
 
25
+ INTRODUCTION_TEXT = """
26
+ # How "toxic" is the language an LLM generates?
27
+ ## Does it tend to neutralize heated inputs? Amplify their intensity?
28
 
29
+ ### This leaderboard addresses these questions by using Allen AI's [Real Toxicity Prompts](https://huggingface.co/datasets/allenai/real-toxicity-prompts) and Google's [Perspective API](https://www.perspectiveapi.com) to score the toxicity of language generated from LLMs.
30
 
31
+ Each toxicity metric is measured as the difference between the score of the original Real Toxicity Prompt and the model's generation in response:
 
32
 
33
+ Toxicity Metric = perspective_api_score(LLM Generation) - perspective_api_score(Real Toxicity Prompt)
 
 
 
34
 
 
35
 
36
+
37
+ **Toxicity Metrics include:**
38
+ | Metric Name | Description | Available Languages |
39
+ | -------------- | ----------- | ------------------- |
40
+ | GENERAL TOXICITY | A rude, disrespectful, or unreasonable comment that is likely to make people leave a discussion. | Arabic (ar), Chinese (zh), Czech (cs), Dutch (nl), English (en), French (fr), German (de), Hindi (hi), Hinglish (hi-Latn), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Polish (pl), Portuguese (pt), Russian (ru), Spanish (es), Swedish (sv) |
41
+ | SEVERE TOXICITY | A very hateful, aggressive, disrespectful comment or otherwise very likely to make a user leave a discussion or give up on sharing their perspective. This attribute is much less sensitive to more mild forms of toxicity, such as comments that include positive uses of curse words. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
42
+ | IDENTITY ATTACK | Negative or hateful comments targeting someone because of their identity. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
43
+ | INSULT | Insulting, inflammatory, or negative comment towards a person or a group of people. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
44
+ | PROFANITY | Swear words, curse words, or other obscene or profane language. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
45
+ | THREAT | Describes an intention to inflict pain, injury, or violence against an individual or group. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
46
+
47
+ Based on: [Perspective API](https://support.perspectiveapi.com/s/about-the-api-attributes-and-languages)
48
  """
49
 
50
  # Which evaluations are you running? how can people reproduce what you have?
 
52
  ## How it works
53
 
54
  ## Reproducibility
55
+ To reproduce our results, you can use the code available at https://huggingface.co/spaces/meg/backend and run `python app.py`.
56
+
57
+ The engine that does the computation is available at https://huggingface.co/spaces/meg/backend/blob/main/src/backend/run_toxicity_eval.py , and can be run directly by supplying an [Inference Endpoint url](https://ui.endpoints.huggingface.co) where the LLM is running as an argument:
58
+
59
+ `python run_toxicity_eval.py <endpoint url>`
60
+
61
+ You will need to set the [PERSPECTIVE_API_TOKEN variable](https://support.perspectiveapi.com) and the [Hugging Face TOKEN variable](https://huggingface.co/settings/tokens).
62
+
63
+
64
+
65
 
66
  """
67
 
 
101
  title = {Toxicity Leaderboard},
102
  year = {2024},
103
  publisher = {Hugging Face},
104
+ howpublished = "\url{https://huggingface.co/spaces/TODO}",
105
  }
106
 
107
+ @misc{PerspectiveAPI,
108
+ title={Perspective API},
109
+ author={Google},
110
+ publisher={Google},
111
+ howpublished = "\url{https://developers.perspectiveapi.com}",
112
+ year={2024},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  }
 
114
  @article{gehman2020realtoxicityprompts,
115
  title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
116
  author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
 
118
  year={2020}
119
  }
120
 
 
 
 
 
 
 
121
 
122
 
123
  """
src/display/utils.py CHANGED
@@ -25,7 +25,7 @@ auto_eval_column_dict = []
25
  # Init
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
25
  # Init
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
+ # Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
src/envs.py CHANGED
@@ -2,11 +2,9 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # Info to change for your repository
6
  # ----------------------------------
7
- TOKEN = os.environ.get("FRONTEND_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "meg" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
 
2
 
3
  from huggingface_hub import HfApi
4
 
 
5
  # ----------------------------------
6
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token
7
+ OWNER = "meg"
 
8
  # ----------------------------------
9
 
10
  REPO_ID = f"{OWNER}/leaderboard"
src/leaderboard/read_evals.py CHANGED
@@ -1,7 +1,7 @@
1
  import glob
2
  import json
3
- import math
4
  import os
 
5
  from dataclasses import dataclass
6
 
7
  import dateutil
@@ -11,6 +11,11 @@ from src.display.formatting import make_clickable_model
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
 
 
 
 
 
14
 
15
  @dataclass
16
  class EvalResult:
@@ -22,7 +27,7 @@ class EvalResult:
22
  model: str
23
  revision: str # commit hash, "" if main
24
  results: dict
25
- precision: Precision = Precision.Unknown
26
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
  weight_type: WeightType = WeightType.Original # Original or Adapter
28
  architecture: str = "Unknown"
@@ -70,14 +75,18 @@ class EvalResult:
70
  results = {}
71
  for task in Tasks:
72
  task = task.value
73
-
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
 
 
 
77
  continue
78
 
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
 
82
  return self(
83
  eval_name=result_key,
@@ -85,7 +94,7 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
- precision=precision,
89
  revision= config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture
@@ -105,7 +114,7 @@ class EvalResult:
105
  self.num_params = request.get("params", 0)
106
  self.date = request.get("submitted_time", "")
107
  except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -127,13 +136,7 @@ class EvalResult:
127
  }
128
 
129
  for task in Tasks:
130
- print("Examining task")
131
- print(task)
132
- #print("Data dict:")
133
- #print(data_dict[task.value.col_name])
134
- print("Self:")
135
- print(self.results[task.value.benchmark])
136
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
137
 
138
  return data_dict
139
 
@@ -163,8 +166,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
163
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
164
  """From the path of the results folder root, extract all needed info for results"""
165
  model_result_filepaths = []
166
- print('looking in results_path: %s' % results_path)
167
- print('looking in requests_path: %s' % requests_path)
168
  for root, _, files in os.walk(results_path):
169
  # We should only have json files in model results
170
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
@@ -181,8 +184,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
181
 
182
  eval_results = {}
183
  for model_result_filepath in model_result_filepaths:
184
- print("Examining filepath:")
185
- print(model_result_filepath)
186
  # Creation of result
187
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
188
  eval_result.update_with_request_file(requests_path)
@@ -193,8 +196,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
193
  eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
194
  else:
195
  eval_results[eval_name] = eval_result
196
- print("eval results is")
197
- print(eval_results)
198
 
199
  results = []
200
  for v in eval_results.values():
 
1
  import glob
2
  import json
 
3
  import os
4
+ import logging
5
  from dataclasses import dataclass
6
 
7
  import dateutil
 
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
+ from src.logging import setup_logger, log_file
15
+
16
+ logging.basicConfig(level=logging.DEBUG)
17
+ logger = setup_logger(__name__)
18
+
19
 
20
  @dataclass
21
  class EvalResult:
 
27
  model: str
28
  revision: str # commit hash, "" if main
29
  results: dict
30
+ precision: Precision = Precision.Unknown # For Toxicity, which uses Perspective API scores, I don't think Precision really matters -- I'd think it matters more when we're looking at log likelihoods.
31
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
32
  weight_type: WeightType = WeightType.Original # Original or Adapter
33
  architecture: str = "Unknown"
 
75
  results = {}
76
  for task in Tasks:
77
  task = task.value
78
+ logger.info("Task: %s" % task.metric)
79
+ logger.info(data["results"].items())
80
  # We average all scores of a given metric (not all metrics are present in all files)
81
+ # This looks a bit odd, should just be the one score in the one file. (?)
82
+ scores = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
83
+ logger.info("scores are:")
84
+ logger.info(scores)
85
+ if scores.size == 0 or any([score is None for score in scores]):
86
  continue
87
 
88
+ mean_score = np.mean(scores) #* 100.0
89
+ results[(task.benchmark, task.metric)] = mean_score
90
 
91
  return self(
92
  eval_name=result_key,
 
94
  org=org,
95
  model=model,
96
  results=results,
97
+ precision=precision,
98
  revision= config.get("model_sha", ""),
99
  still_on_hub=still_on_hub,
100
  architecture=architecture
 
114
  self.num_params = request.get("params", 0)
115
  self.date = request.get("submitted_time", "")
116
  except Exception:
117
+ logger.error(f"Could not find request file for {self.org}/{self.model}") #with precision {self.precision.value.name}")
118
 
119
  def to_dict(self):
120
  """Converts the Eval Result to a dict compatible with our dataframe display"""
 
136
  }
137
 
138
  for task in Tasks:
139
+ data_dict[task.value.col_name] = self.results[(task.value.benchmark, task.value.metric)]
 
 
 
 
 
 
140
 
141
  return data_dict
142
 
 
166
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
167
  """From the path of the results folder root, extract all needed info for results"""
168
  model_result_filepaths = []
169
+ logger.debug('looking in results_path: %s' % results_path)
170
+ logger.debug('looking in requests_path: %s' % requests_path)
171
  for root, _, files in os.walk(results_path):
172
  # We should only have json files in model results
173
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
 
184
 
185
  eval_results = {}
186
  for model_result_filepath in model_result_filepaths:
187
+ logger.debug("Examining filepath:")
188
+ logger.debug(model_result_filepath)
189
  # Creation of result
190
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
191
  eval_result.update_with_request_file(requests_path)
 
196
  eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
197
  else:
198
  eval_results[eval_name] = eval_result
199
+ logger.info("eval results is")
200
+ logger.info(eval_results)
201
 
202
  results = []
203
  for v in eval_results.values():