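"""Parse per-model evaluation result files and their matching request files into `EvalResult` objects for the leaderboard."""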
import glob
import json
import os
from dataclasses import dataclass

import dateutil.parser
import numpy as np

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
from src.submission.check_validity import is_model_on_hub

@dataclass
class EvalResult:
    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""

    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: str
    model: str
    revision: str  # commit hash, "" if main
    results: dict
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False
    energy_score: str = "NA"  # energy consumption in kWh, "NA" if not available

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Inits the result from the specific model result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)
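
        # Layout this parser expects in a result file (illustrative; every key is optional):
        #   "config_general": {"model_dtype": ..., "model_sha": ..., "model_name" or "model_args": "org/model"}
        #   "results": {"<benchmark>": {"<metric>": <score>, ...}, "all": {...}}
        #   "energy_metrics": {"enabled": true, "total_energy": <kWh>}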

        config = data.get("config_general")
        if config is None:
            # config_general is missing: fall back to default values
            precision = Precision.Unknown
            org_and_model = data.get("model_name", "Unknown/Unknown")
            if not isinstance(org_and_model, str):
                org_and_model = "Unknown/Unknown"
            org_and_model = org_and_model.split("/", 1)
            revision = "main"
        else:
            # Precision
            precision = Precision.from_str(config.get("model_dtype"))

            # Get revision
            revision = config.get("model_sha", "")

            # Get model and org
            org_and_model = config.get("model_name", config.get("model_args", None))
            if isinstance(org_and_model, str):
                org_and_model = org_and_model.split("/", 1)
            else:
                org_and_model = ["Unknown", "Unknown"]

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        # Use a safe default for model_sha if config is None
        model_sha = "main"
        if config is not None:
            model_sha = config.get("model_sha", "main")

        still_on_hub, _, model_config = is_model_on_hub(
            full_model, model_sha, trust_remote_code=True, test_tokenizer=False
        )
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Extract results available in this file (some results are split in several files)
        results = {}
        # Check if results key exists in the data
        if "results" not in data:
            # If no results, set all benchmarks to None
            for task in Tasks:
                task = task.value
                results[task.benchmark] = None
        else:
            # Process results normally
            for task in Tasks:
                task = task.value

                # We average all scores of a given metric (not all metrics are present in all files)
                # Handle metrics that could be None in the JSON
                metric_values = []

                # Define the expected metric name and alternative names for each benchmark
                expected_metric = task.metric
                alternative_metrics = []
                print(f"Processing benchmark: {task.benchmark}, expected metric: {expected_metric}")

                # Set up alternative metric names based on the benchmark
                if task.benchmark == "custom|folio:logical_reasoning|0":
                    if expected_metric != "folio_em":
                        alternative_metrics = ["folio_em"]
                elif task.benchmark == "custom|telecom:qna|0":
                    if expected_metric != "telecom_qna_em":
                        alternative_metrics = ["telecom_qna_em"]
                elif task.benchmark == "custom|3gpp:tsg|0":
                    if expected_metric != "em":
                        alternative_metrics = ["em"]
                elif task.benchmark == "custom|math:problem_solving|0":
                    if expected_metric != "math_metric":
                        alternative_metrics = ["math_metric"]
                elif task.benchmark == "custom|spider:text2sql|0":
                    if expected_metric != "sql_metric":
                        alternative_metrics = ["sql_metric"]
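
                # Note: the hard-coded fallbacks above are assumed to cover result files
                # whose metric key does not match the name declared in Tasks; only these
                # pairs are tried.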

                # Check for results with the benchmark name
                for k, v in data["results"].items():
                    if task.benchmark == k:
                        # Try the expected metric name first
                        metric_value = v.get(expected_metric)
                        # If not found, try alternative metric names
                        if metric_value is None:
                            for alt_metric in alternative_metrics:
                                if alt_metric in v:
                                    metric_value = v.get(alt_metric)
                                    break
                        if metric_value is not None:
                            metric_values.append(metric_value)
                            print(f"Found metric value for {task.benchmark}: {metric_value}")

                accs = np.array([v for v in metric_values if v is not None])
                if len(accs) == 0:
                    # Also check the "all" section for metrics
                    if "all" in data["results"]:
                        all_results = data["results"]["all"]
                        print(f"Checking 'all' section for {task.benchmark}, available keys: {list(all_results.keys())}")
                        # Try the expected metric name first
                        metric_value = all_results.get(expected_metric)
                        # If not found, try alternative metric names
                        if metric_value is None:
                            for alt_metric in alternative_metrics:
                                if alt_metric in all_results:
                                    metric_value = all_results.get(alt_metric)
                                    print(f"Found alternative metric {alt_metric} in 'all' section")
                                    break
                        if metric_value is not None:
                            accs = np.array([metric_value])
                            print(f"Found metric value in 'all' section for {task.benchmark}: {metric_value}")
                        else:
                            results[task.benchmark] = None
                            continue
                    else:
                        results[task.benchmark] = None
                        continue

                mean_acc = np.mean(accs) * 100.0
                results[task.benchmark] = mean_acc
                print(f"Final result for {task.benchmark}: {mean_acc}")

        # Extract energy score if available
        energy_score = "NA"
        if (
            "energy_metrics" in data
            and data["energy_metrics"] is not None
            and data["energy_metrics"].get("enabled", False)
        ):
            total_energy = data["energy_metrics"].get("total_energy", 0)
            if total_energy > 0:
                energy_score = f"{total_energy:.5f}"

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=revision,
            still_on_hub=still_on_hub,
            architecture=architecture,
            energy_score=energy_score,
        )

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
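
            # Fields read from the request file: model_type, weight_type, license,
            # likes, params, submitted_time, architectures, status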
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
            self.architecture = request.get("architectures", "Unknown")  # delete later
            self.status = request.get("status", "FAILED")
        except Exception:
            self.status = "FAILED"
            print(
                f"Could not find request file for {self.org}/{self.model} with "
                f"precision: {self.precision.value.name}, model_type: {self.model_type}, "
                f"license: {self.license}, status: {self.status}"
            )

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        available_metrics = [v for v in self.results.values() if v is not None]
        average = sum(available_metrics) / len(available_metrics) if available_metrics else None
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            AutoEvalColumn.energy_score.name: self.energy_score,
        }

        print(f"\nConverting to dict for model: {self.full_model}")
        for task in Tasks:
            result = self.results.get(task.value.benchmark)
            print(f"  Task: {task.value.col_name}, Benchmark: {task.value.benchmark}, Result: {result}")
            data_dict[task.value.col_name] = "NA" if result is None else round(result, 2)

        return data_dict


def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
    request_files = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)
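    # With model_name of the form "org/model", this matches files such as
    # "<requests_path>/org/model_eval_request_*.json" (illustrative path; the exact
    # layout is defined by the submission code).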

    # Select correct request file (precision)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
            if (
                req_content["status"] in ["FINISHED"]
                and req_content["precision"] == precision.split(".")[-1]
            ):
                request_file = tmp_request_file
    return request_file


def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        try:
            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except dateutil.parser.ParserError:
            files = [files[-1]]

        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            # Creation of result
            print(f"\nProcessing file: {model_result_filepath}")
            eval_result = EvalResult.init_from_json_file(model_result_filepath)

            # Skip entries with Unknown/Unknown model name
            if eval_result.full_model == "Unknown/Unknown":
                print(f"Skipping invalid result file: {model_result_filepath}")
                continue

            print(f"Model: {eval_result.full_model}")
            print("Results before update_with_request_file:")
            for benchmark, value in eval_result.results.items():
                print(f"  {benchmark}: {value}")

            eval_result.update_with_request_file(requests_path)

            print("Results after update_with_request_file:")
            for benchmark, value in eval_result.results.items():
                print(f"  {benchmark}: {value}")

            # Store results of same eval together
            eval_name = eval_result.eval_name
            if eval_name in eval_results.keys():
                print(f"Updating existing results for {eval_name}")
                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
            else:
                print(f"Adding new results for {eval_name}")
                eval_results[eval_name] = eval_result
        except Exception as e:
            print(f"Error processing result file {model_result_filepath}: {str(e)}")
            continue

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue

    return results
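

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the leaderboard app: run the module
    # directly against local copies of the results and requests folders. The default
    # paths below are placeholders; the real ones are configured elsewhere in the Space.
    import sys

    results_dir = sys.argv[1] if len(sys.argv) > 1 else "./eval-results"
    requests_dir = sys.argv[2] if len(sys.argv) > 2 else "./eval-queue"
    for eval_result in get_raw_eval_results(results_dir, requests_dir):
        print(eval_result.to_dict())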