File size: 3,360 Bytes
cae4d0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import glob
import json
import math
import os
from dataclasses import dataclass
from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
from src.submission.check_validity import is_model_on_hub

@dataclass
class EvalResult:
    """One model evaluation run, parsed from a single results JSON file.

    Holds both the raw per-benchmark scores (as pre-formatted strings) and
    the Hub metadata needed to render a leaderboard row.
    """

    eval_name: str  # unique key: "{model}_{num_fewshot}"
    full_model: str  # "org/model" identifier on the Hub
    org: str
    model: str
    revision: str  # model commit sha the eval ran against
    results: dict  # benchmark name -> score formatted as "%.2f" string
    average_CPS: str  # overall score, pre-formatted as "%.2f" string
    fewshot: int  # number of few-shot examples used
    fewshot_type: FewShotType = FewShotType.Unknown
    weight_type: WeightType = WeightType.Original
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0  # parameter count, in billions (rounded up)
    date: str = ""  # submission date
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an EvalResult from the results JSON at *json_filepath*.

        Args:
            json_filepath: path to a JSON file with "config", "tasks" and
                "average_CPS" entries.

        Returns:
            A populated EvalResult instance.
        """
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config")
        # Default to 0 so a missing key doesn't make the format spec blow
        # up with a TypeError on None (the .get already signals tolerance).
        average_CPS = f"{data.get('average_CPS', 0):.2f}"

        num_fewshot = int(config.get("num_fewshot", 0))
        fewshot_type = FewShotType.from_num_fewshot(num_fewshot)

        # ceil so e.g. 6.7B reports as 7; 0 when the key is absent or falsy
        num_params = math.ceil(config.get("num_params_billion", 0)) if config.get("num_params_billion") else 0

        # "org/model" splits into both parts; a bare name has no org
        org_and_model = config.get("model_name", "").split("/", 1)
        org, model = (org_and_model if len(org_and_model) == 2 else (None, org_and_model[0]))

        full_model = "/".join([org, model] if org else [model])
        still_on_hub, _, model_config = is_model_on_hub(full_model, config.get("model_sha", "main"))

        architecture = ";".join(getattr(model_config, "architectures", [])) if model_config else "?"

        # NOTE(review): benchmark comes from task.value but metric_type from
        # task directly — assumed intentional per the Tasks enum; confirm.
        results = {
            task.value.benchmark: f"{data.get('tasks', {}).get(task.value.benchmark, {}).get(task.metric_type, 0):.2f}"
            for task in Tasks
        }

        return cls(
            eval_name=f"{model}_{num_fewshot}",
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            average_CPS=average_CPS,
            # fewshot is declared int: pass the count, not the FewShotType
            # enum (the enum already goes to fewshot_type below)
            fewshot=num_fewshot,
            fewshot_type=fewshot_type,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
            num_params=num_params
        )

def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """Collect and merge all evaluation results found under *results_path*.

    Walks the results tree, parses every ``.json`` file (newest first per
    directory, by the trailing ``_``-separated token of the filename), and
    merges entries sharing the same ``eval_name`` by updating their
    per-benchmark results. Entries whose dict form is incomplete are dropped.

    Args:
        results_path: root directory containing result JSON files.
        requests_path: unused here; kept for interface compatibility.

    Returns:
        A list of complete, merged EvalResult objects.
    """
    json_files = []
    for root, _, files in os.walk(results_path):
        newest_first = sorted(files, key=lambda name: name.split("_")[-1], reverse=True)
        json_files.extend(os.path.join(root, f) for f in newest_first if f.endswith(".json"))

    merged = {}
    for filepath in json_files:
        parsed = EvalResult.init_from_json_file(filepath)
        existing = merged.get(parsed.eval_name)
        if existing is None:
            merged[parsed.eval_name] = parsed
        else:
            existing.results.update(parsed.results)

    complete = []
    for candidate in merged.values():
        try:
            # Smoke-test the dict conversion; keep only complete entries.
            # NOTE(review): to_dict is not defined in this view — assumed
            # provided elsewhere on EvalResult; confirm.
            candidate.to_dict()
            complete.append(candidate)
        except KeyError:
            continue

    return complete