Spaces:

evalitahf
/

evalita_llm_leaderboard

Running

App Files Files Community

evalita_llm_leaderboard / src /leaderboard /read_evals.py

rzanoli

Small changes

cae4d0f 4 months ago

raw

history blame

3.36 kB

	import glob
	import json
	import math
	import os
	from dataclasses import dataclass
	from src.display.formatting import make_clickable_model
	from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
	from src.submission.check_validity import is_model_on_hub

	@dataclass
	class EvalResult:
	    """Scores and metadata for one model at one few-shot setting.

	    Instances are normally created with `init_from_json_file`, which parses a
	    single results JSON file.
	    """

	    eval_name: str  # "<model>_<num_fewshot>"
	    full_model: str  # "org/model", or just "model" when there is no org part
	    org: str  # None when the model name has no "org/" prefix
	    model: str
	    revision: str  # config["model_sha"], "" when absent
	    results: dict  # benchmark name -> score rendered as a two-decimal string
	    average_CPS: str  # top-level "average_CPS" rendered as a two-decimal string
	    fewshot: int
	    fewshot_type: FewShotType = FewShotType.Unknown
	    weight_type: WeightType = WeightType.Original
	    architecture: str = "Unknown"
	    license: str = "?"
	    likes: int = 0
	    num_params: int = 0  # parameter count in billions, rounded up
	    date: str = ""
	    still_on_hub: bool = False

	    @staticmethod
	    def _format_score(value) -> str:
	        """Render a score with two decimals; a missing (None) score becomes "0.00"."""
	        return f"{0 if value is None else value:.2f}"

	    @classmethod
	    def init_from_json_file(cls, json_filepath):
	        """Build an EvalResult from one results JSON file.

	        The file is expected to hold a "config" object (model name, sha,
	        few-shot count, parameter count), a "tasks" object mapping benchmark
	        names to metric dicts, and a top-level "average_CPS" number. Missing or
	        null sections are treated as empty and missing scores as 0 instead of
	        raising.

	        Args:
	            json_filepath: Path of the JSON results file to read.

	        Returns:
	            A populated EvalResult.

	        Raises:
	            OSError: If the file cannot be opened.
	            json.JSONDecodeError: If the file is not valid JSON.
	        """
	        with open(json_filepath, encoding="utf-8") as fp:
	            data = json.load(fp)

	        # A missing or null "config" must not break the .get() calls below.
	        config = data.get("config") or {}
	        average_CPS = cls._format_score(data.get("average_CPS"))

	        num_fewshot = int(config.get("num_fewshot") or 0)
	        fewshot_type = FewShotType.from_num_fewshot(num_fewshot)

	        params_billion = config.get("num_params_billion")
	        num_params = math.ceil(params_billion) if params_billion else 0

	        # "org/model" -> ("org", "model"); a bare "model" has no org.
	        org_and_model = (config.get("model_name") or "").split("/", 1)
	        org, model = org_and_model if len(org_and_model) == 2 else (None, org_and_model[0])
	        full_model = "/".join([org, model] if org else [model])

	        still_on_hub, _, model_config = is_model_on_hub(full_model, config.get("model_sha", "main"))

	        # `architectures` may be present but None on a model config, so guard
	        # before joining; "?" marks an unknown architecture.
	        architectures = getattr(model_config, "architectures", None) if model_config else None
	        architecture = ";".join(architectures) if architectures else "?"

	        # NOTE(review): the metric key comes from `task.metric_type` (on the enum
	        # member) while the benchmark name comes from `task.value` - confirm that
	        # Tasks exposes both.
	        task_scores = data.get("tasks") or {}
	        results = {
	            task.value.benchmark: cls._format_score(
	                (task_scores.get(task.value.benchmark) or {}).get(task.metric_type)
	            )
	            for task in Tasks
	        }

	        return cls(
	            eval_name=f"{model}_{num_fewshot}",
	            full_model=full_model,
	            org=org,
	            model=model,
	            results=results,
	            average_CPS=average_CPS,
	            # NOTE(review): `fewshot` is annotated int but receives the
	            # FewShotType value; kept as-is because consumers are not visible
	            # here - confirm whether `num_fewshot` was intended.
	            fewshot=fewshot_type,
	            fewshot_type=fewshot_type,
	            revision=config.get("model_sha", ""),
	            still_on_hub=still_on_hub,
	            architecture=architecture,
	            num_params=num_params,
	        )

	def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
	    """Load every results JSON under `results_path` and merge them per eval name.

	    Args:
	        results_path: Directory tree walked recursively for ".json" files.
	        requests_path: Accepted but not used by this function.

	    Returns:
	        One EvalResult per distinct `eval_name`, keeping only those whose
	        `to_dict()` call does not raise KeyError.
	    """
	    # Collect all paths first. Within each directory, files are ordered by the
	    # text after their last "_" in descending order.
	    json_paths = []
	    for dirpath, _, filenames in os.walk(results_path):
	        ordered = sorted(filenames, key=lambda name: name.split("_")[-1], reverse=True)
	        json_paths.extend(
	            os.path.join(dirpath, name) for name in ordered if name.endswith(".json")
	        )

	    # The first file seen for an eval name provides the record; later files
	    # for the same name only overwrite entries in its `results` dict.
	    merged = {}
	    for path in json_paths:
	        parsed = EvalResult.init_from_json_file(path)
	        known = merged.get(parsed.eval_name)
	        if known is None:
	            merged[parsed.eval_name] = parsed
	        else:
	            known.results.update(parsed.results)

	    # Keep only records whose dict form can be built.
	    # NOTE(review): `to_dict` is not defined on the EvalResult class visible in
	    # this file - confirm where it comes from.
	    complete = []
	    for candidate in merged.values():
	        try:
	            candidate.to_dict()
	        except KeyError:
	            continue
	        else:
	            complete.append(candidate)

	    return complete