Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py
CHANGED
@@ -31,6 +31,7 @@ class EvalResult:
     num_params: int = 0
     date: str = "" # submission date of request file
     still_on_hub: bool = False
+    display_model: str = ""  # newly added: model name used for display

     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -57,6 +58,12 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

+        # Added: mapping for specific model names
+        display_model = full_model
+        if full_model == "demo-leaderboard/gpt2-demo":
+            display_model = "deepseek-ai/DeepSeek-R1"
+            print(f"Model name mapping applied: {full_model} -> {display_model}")
+
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
@@ -88,7 +95,8 @@ class EvalResult:
             precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            display_model=display_model  # set the newly added field
         )

     def update_with_request_file(self, requests_path):
@@ -110,6 +118,10 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
+        # Use the display model name
+        model_to_display = self.display_model if self.display_model else self.full_model
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -117,7 +129,7 @@ class EvalResult:
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.model.name: make_clickable_model(model_to_display),  # modified part
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
@@ -193,4 +205,4 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         except KeyError: # not all eval values present
             continue

-    return results
+    return results
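For reference, a minimal standalone sketch of the same display-name override, written as a lookup table instead of a hard-coded if, so additional overrides stay in one place. DISPLAY_NAME_OVERRIDES and resolve_display_model are hypothetical names for illustration only; they are not part of this commit.

# Hypothetical sketch (not part of the commit): the mapping above as a lookup table.
DISPLAY_NAME_OVERRIDES = {
    "demo-leaderboard/gpt2-demo": "deepseek-ai/DeepSeek-R1",
}

def resolve_display_model(full_model: str) -> str:
    """Return the name to show on the leaderboard for a given Hub repo id."""
    display_model = DISPLAY_NAME_OVERRIDES.get(full_model, full_model)
    if display_model != full_model:
        print(f"Model name mapping applied: {full_model} -> {display_model}")
    return display_model

# Usage:
#   resolve_display_model("demo-leaderboard/gpt2-demo")  # -> "deepseek-ai/DeepSeek-R1"
#   resolve_display_model("org/unmapped-model")          # -> "org/unmapped-model"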