Spaces:
Running
Running
Upload 34 files
Browse files- .gitattributes +35 -0
- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- Classificação dos Dataset.txt +12 -0
- Makefile +13 -0
- README.md +46 -0
- app.py +275 -0
- leaderboard_funcionamento.txt +72 -0
- output/leaderboard_data_20250413_002202.csv +13 -0
- output/leaderboard_data_20250413_002339.csv +13 -0
- output/leaderboard_data_20250413_002339.json +650 -0
- output/leaderboard_data_20250413_002339.pkl +3 -0
- output/leaderboard_data_20250413_002339.xlsx +0 -0
- output/leaderboard_info_20250413_002339.txt +81 -0
- pyproject.toml +13 -0
- requirements.txt +9 -11
- src/about.py +103 -0
- src/about.pyZone.Identifier +0 -0
- src/display/css_html_js.py +105 -0
- src/display/css_html_js.pyZone.Identifier +0 -0
- src/display/formatting.py +27 -0
- src/display/formatting.pyZone.Identifier +0 -0
- src/display/utils.py +145 -0
- src/display/utils.pyZone.Identifier +0 -0
- src/envs.py +25 -0
- src/envs.pyZone.Identifier +0 -0
- src/leaderboard/read_evals.py +196 -0
- src/leaderboard/read_evals.pyZone.Identifier +0 -0
- src/populate.py +79 -0
- src/populate.pyZone.Identifier +0 -0
- src/submission/check_validity.py +99 -0
- src/submission/check_validity.pyZone.Identifier +0 -0
- src/submission/submit.py +119 -0
- src/submission/submit.pyZone.Identifier +0 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
auto_evals/
|
2 |
+
venv/
|
3 |
+
__pycache__/
|
4 |
+
.env
|
5 |
+
.ipynb_checkpoints
|
6 |
+
*ipynb
|
7 |
+
.vscode/
|
8 |
+
|
9 |
+
eval-queue/
|
10 |
+
eval-results/
|
11 |
+
eval-queue-bk/
|
12 |
+
eval-results-bk/
|
13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
default_language_version:
|
16 |
+
python: python3
|
17 |
+
|
18 |
+
ci:
|
19 |
+
autofix_prs: true
|
20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
21 |
+
autoupdate_schedule: quarterly
|
22 |
+
|
23 |
+
repos:
|
24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
25 |
+
rev: v4.3.0
|
26 |
+
hooks:
|
27 |
+
- id: check-yaml
|
28 |
+
- id: check-case-conflict
|
29 |
+
- id: detect-private-key
|
30 |
+
- id: check-added-large-files
|
31 |
+
args: ['--maxkb=1000']
|
32 |
+
- id: requirements-txt-fixer
|
33 |
+
- id: end-of-file-fixer
|
34 |
+
- id: trailing-whitespace
|
35 |
+
|
36 |
+
- repo: https://github.com/PyCQA/isort
|
37 |
+
rev: 5.12.0
|
38 |
+
hooks:
|
39 |
+
- id: isort
|
40 |
+
name: Format imports
|
41 |
+
|
42 |
+
- repo: https://github.com/psf/black
|
43 |
+
rev: 22.12.0
|
44 |
+
hooks:
|
45 |
+
- id: black
|
46 |
+
name: Format code
|
47 |
+
additional_dependencies: ['click==8.0.2']
|
48 |
+
|
49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
50 |
+
# Ruff version.
|
51 |
+
rev: 'v0.0.267'
|
52 |
+
hooks:
|
53 |
+
- id: ruff
|
Classificação dos Dataset.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Classificação dos Dataset
|
2 |
+
|
3 |
+
A seguir, serão descritos quais datasets pertencem a quais categorias:
|
4 |
+
|
5 |
+
Área Médica: Revalida, MREX
|
6 |
+
Área do Direito: OAB, ENAM
|
7 |
+
Provas Militares: AFA, ITA, IME
|
8 |
+
Computação: POSCOMP, OBI
|
9 |
+
Discurso de Ódio: HateBR, PT Hate Speech, tweetSentBR
|
10 |
+
Economia e Contabilidade: BCB, CFCES
|
11 |
+
Compreensão de Semântica e Inferência Textual: FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS
|
12 |
+
Provas de Conhecimento Multidisciplinar: ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)
|
Makefile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.PHONY: style format
|
2 |
+
|
3 |
+
|
4 |
+
style:
|
5 |
+
python -m black --line-length 119 .
|
6 |
+
python -m isort .
|
7 |
+
ruff check --fix .
|
8 |
+
|
9 |
+
|
10 |
+
quality:
|
11 |
+
python -m black --check --line-length 119 .
|
12 |
+
python -m isort --check-only .
|
13 |
+
ruff check .
|
README.md
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Cemig
|
3 |
+
emoji: 🥇
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: gradio
|
7 |
+
app_file: app.py
|
8 |
+
pinned: true
|
9 |
+
license: apache-2.0
|
10 |
+
short_description: Teste para criação de uma leaderboard
|
11 |
+
sdk_version: 5.19.0
|
12 |
+
---
|
13 |
+
|
14 |
+
# Start the configuration
|
15 |
+
|
16 |
+
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
17 |
+
|
18 |
+
Results files should have the following format and be stored as json files:
|
19 |
+
```json
|
20 |
+
{
|
21 |
+
"config": {
|
22 |
+
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
23 |
+
"model_name": "path of the model on the hub: org/model",
|
24 |
+
"model_sha": "revision on the hub",
|
25 |
+
},
|
26 |
+
"results": {
|
27 |
+
"task_name": {
|
28 |
+
"metric_name": score,
|
29 |
+
},
|
30 |
+
"task_name2": {
|
31 |
+
"metric_name": score,
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
```
|
36 |
+
|
37 |
+
Request files are created automatically by this tool.
|
38 |
+
|
39 |
+
If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
|
40 |
+
|
41 |
+
# Code logic for more complex edits
|
42 |
+
|
43 |
+
You'll find
|
44 |
+
- the main table' columns names and properties in `src/display/utils.py`
|
45 |
+
- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
|
46 |
+
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
app.py
ADDED
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
3 |
+
import pandas as pd
|
4 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
from src.about import (
|
9 |
+
CITATION_BUTTON_LABEL,
|
10 |
+
CITATION_BUTTON_TEXT,
|
11 |
+
EVALUATION_QUEUE_TEXT,
|
12 |
+
INTRODUCTION_TEXT,
|
13 |
+
TITLE,
|
14 |
+
Tasks
|
15 |
+
)
|
16 |
+
from src.display.css_html_js import custom_css
|
17 |
+
from src.display.utils import (
|
18 |
+
EVAL_COLS,
|
19 |
+
EVAL_TYPES,
|
20 |
+
AutoEvalColumn,
|
21 |
+
ModelType,
|
22 |
+
fields,
|
23 |
+
WeightType,
|
24 |
+
Precision,
|
25 |
+
AREA_DEFINITIONS,
|
26 |
+
AREA_AVG_COLUMN_MAP
|
27 |
+
)
|
28 |
+
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
29 |
+
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
30 |
+
from src.submission.submit import add_new_eval
|
31 |
+
|
32 |
+
|
33 |
+
def restart_space():
|
34 |
+
API.restart_space(repo_id=REPO_ID)
|
35 |
+
|
36 |
+
### Space initialisation
|
37 |
+
try:
|
38 |
+
print(EVAL_REQUESTS_PATH)
|
39 |
+
snapshot_download(
|
40 |
+
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
41 |
+
)
|
42 |
+
except Exception as e:
|
43 |
+
print(f"Erro ao baixar EVAL_REQUESTS: {e}")
|
44 |
+
try:
|
45 |
+
print(EVAL_RESULTS_PATH)
|
46 |
+
snapshot_download(
|
47 |
+
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
48 |
+
)
|
49 |
+
except Exception as e:
|
50 |
+
print(f"Erro ao baixar EVAL_RESULTS: {e}")
|
51 |
+
|
52 |
+
ALL_COLS = [c.name for c in fields(AutoEvalColumn)]
|
53 |
+
|
54 |
+
try:
|
55 |
+
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ALL_COLS)
|
56 |
+
except Exception as e:
|
57 |
+
print(f"Erro ao gerar o DataFrame do Leaderboard: {e}")
|
58 |
+
LEADERBOARD_DF = pd.DataFrame()
|
59 |
+
|
60 |
+
(
|
61 |
+
finished_eval_queue_df,
|
62 |
+
running_eval_queue_df,
|
63 |
+
pending_eval_queue_df,
|
64 |
+
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
65 |
+
|
66 |
+
|
67 |
+
def create_leaderboard_component(dataframe, displayed_cols, hidden_cols=None, cant_deselect_cols=None, title=None):
|
68 |
+
if dataframe is None or dataframe.empty:
|
69 |
+
return gr.Markdown(f"## {title or ''}\nNão há dados para exibir.")
|
70 |
+
|
71 |
+
if hidden_cols is None:
|
72 |
+
hidden_cols = []
|
73 |
+
if cant_deselect_cols is None:
|
74 |
+
cant_deselect_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
|
75 |
+
|
76 |
+
all_required_cols = set(displayed_cols) | set(hidden_cols) | set(cant_deselect_cols) | {AutoEvalColumn.model_type.name, AutoEvalColumn.precision.name, AutoEvalColumn.params.name, AutoEvalColumn.still_on_hub.name}
|
77 |
+
available_cols = [col for col in all_required_cols if col in dataframe.columns]
|
78 |
+
filtered_df = dataframe[available_cols].copy()
|
79 |
+
|
80 |
+
for col in cant_deselect_cols:
|
81 |
+
if col not in filtered_df.columns:
|
82 |
+
filtered_df[col] = np.nan
|
83 |
+
|
84 |
+
return Leaderboard(
|
85 |
+
value=filtered_df,
|
86 |
+
datatype=[c.type for c in fields(AutoEvalColumn) if c.name in filtered_df.columns],
|
87 |
+
select_columns=SelectColumns(
|
88 |
+
default_selection=displayed_cols,
|
89 |
+
cant_deselect=cant_deselect_cols,
|
90 |
+
label="Selecionar Colunas para Exibir:",
|
91 |
+
),
|
92 |
+
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name] if AutoEvalColumn.license.name in filtered_df.columns else [AutoEvalColumn.model.name],
|
93 |
+
hide_columns=[c for c in hidden_cols if c in filtered_df.columns],
|
94 |
+
filter_columns=[
|
95 |
+
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Tipos de Modelo") if AutoEvalColumn.model_type.name in filtered_df.columns else None,
|
96 |
+
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precisão") if AutoEvalColumn.precision.name in filtered_df.columns else None,
|
97 |
+
ColumnFilter(
|
98 |
+
AutoEvalColumn.params.name,
|
99 |
+
type="slider",
|
100 |
+
min=0.01,
|
101 |
+
max=max(150, filtered_df[AutoEvalColumn.params.name].max() if AutoEvalColumn.params.name in filtered_df.columns and not filtered_df[AutoEvalColumn.params.name].empty else 150),
|
102 |
+
label="Selecionar número de parâmetros (B)",
|
103 |
+
) if AutoEvalColumn.params.name in filtered_df.columns else None,
|
104 |
+
ColumnFilter(
|
105 |
+
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deletado/incompleto", default=True
|
106 |
+
) if AutoEvalColumn.still_on_hub.name in filtered_df.columns else None,
|
107 |
+
],
|
108 |
+
bool_checkboxgroup_label="Ocultar modelos",
|
109 |
+
interactive=False,
|
110 |
+
)
|
111 |
+
|
112 |
+
|
113 |
+
demo = gr.Blocks(css=custom_css)
|
114 |
+
with demo:
|
115 |
+
gr.HTML(TITLE)
|
116 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
117 |
+
|
118 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
119 |
+
with gr.TabItem("📊 Benchmark Geral", id=0):
|
120 |
+
general_cols_to_display = [
|
121 |
+
AutoEvalColumn.model_type_symbol.name,
|
122 |
+
AutoEvalColumn.model.name,
|
123 |
+
AutoEvalColumn.average.name,
|
124 |
+
] + list(AREA_AVG_COLUMN_MAP.values())
|
125 |
+
|
126 |
+
general_hidden_cols = [task.name for task in Tasks] + [
|
127 |
+
AutoEvalColumn.model_type.name,
|
128 |
+
AutoEvalColumn.architecture.name,
|
129 |
+
AutoEvalColumn.weight_type.name,
|
130 |
+
AutoEvalColumn.precision.name,
|
131 |
+
AutoEvalColumn.license.name,
|
132 |
+
AutoEvalColumn.params.name,
|
133 |
+
AutoEvalColumn.likes.name,
|
134 |
+
AutoEvalColumn.still_on_hub.name,
|
135 |
+
AutoEvalColumn.revision.name
|
136 |
+
]
|
137 |
+
|
138 |
+
create_leaderboard_component(
|
139 |
+
LEADERBOARD_DF,
|
140 |
+
displayed_cols=general_cols_to_display,
|
141 |
+
hidden_cols=general_hidden_cols,
|
142 |
+
title="Benchmark Geral"
|
143 |
+
)
|
144 |
+
|
145 |
+
tab_index = 1
|
146 |
+
for area_name, tasks_in_area in AREA_DEFINITIONS.items():
|
147 |
+
with gr.TabItem(f"🎓 {area_name}", id=tab_index):
|
148 |
+
area_cols_to_display = [
|
149 |
+
AutoEvalColumn.model_type_symbol.name,
|
150 |
+
AutoEvalColumn.model.name,
|
151 |
+
] + [task.name for task in tasks_in_area]
|
152 |
+
|
153 |
+
area_hidden_cols = list(AREA_AVG_COLUMN_MAP.values()) + [
|
154 |
+
task.name for task in Tasks if task not in tasks_in_area
|
155 |
+
] + [
|
156 |
+
AutoEvalColumn.model_type.name,
|
157 |
+
AutoEvalColumn.architecture.name,
|
158 |
+
AutoEvalColumn.weight_type.name,
|
159 |
+
AutoEvalColumn.precision.name,
|
160 |
+
AutoEvalColumn.license.name,
|
161 |
+
AutoEvalColumn.params.name,
|
162 |
+
AutoEvalColumn.likes.name,
|
163 |
+
AutoEvalColumn.still_on_hub.name,
|
164 |
+
AutoEvalColumn.revision.name
|
165 |
+
]
|
166 |
+
|
167 |
+
create_leaderboard_component(
|
168 |
+
LEADERBOARD_DF,
|
169 |
+
displayed_cols=area_cols_to_display,
|
170 |
+
hidden_cols=[col for col in area_hidden_cols if col != AutoEvalColumn.average.name],
|
171 |
+
title=area_name
|
172 |
+
)
|
173 |
+
tab_index += 1
|
174 |
+
|
175 |
+
with gr.TabItem("🚀 Submit aqui!", id=tab_index):
|
176 |
+
with gr.Column():
|
177 |
+
with gr.Row():
|
178 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
179 |
+
|
180 |
+
with gr.Column():
|
181 |
+
with gr.Accordion(
|
182 |
+
f"✅ Avaliações Concluídas ({len(finished_eval_queue_df)})",
|
183 |
+
open=False,
|
184 |
+
):
|
185 |
+
with gr.Row():
|
186 |
+
finished_eval_table = gr.components.Dataframe(
|
187 |
+
value=finished_eval_queue_df,
|
188 |
+
headers=EVAL_COLS,
|
189 |
+
datatype=EVAL_TYPES,
|
190 |
+
row_count=5,
|
191 |
+
)
|
192 |
+
with gr.Accordion(
|
193 |
+
f"🔄 Fila de Avaliação em Execução ({len(running_eval_queue_df)})",
|
194 |
+
open=False,
|
195 |
+
):
|
196 |
+
with gr.Row():
|
197 |
+
running_eval_table = gr.components.Dataframe(
|
198 |
+
value=running_eval_queue_df,
|
199 |
+
headers=EVAL_COLS,
|
200 |
+
datatype=EVAL_TYPES,
|
201 |
+
row_count=5,
|
202 |
+
)
|
203 |
+
|
204 |
+
with gr.Accordion(
|
205 |
+
f"⏳ Fila de Avaliação Pendente ({len(pending_eval_queue_df)})",
|
206 |
+
open=False,
|
207 |
+
):
|
208 |
+
with gr.Row():
|
209 |
+
pending_eval_table = gr.components.Dataframe(
|
210 |
+
value=pending_eval_queue_df,
|
211 |
+
headers=EVAL_COLS,
|
212 |
+
datatype=EVAL_TYPES,
|
213 |
+
row_count=5,
|
214 |
+
)
|
215 |
+
with gr.Row():
|
216 |
+
gr.Markdown("# ✉️✨ Submeta seu modelo aqui!", elem_classes="markdown-text")
|
217 |
+
|
218 |
+
with gr.Row():
|
219 |
+
with gr.Column():
|
220 |
+
model_name_textbox = gr.Textbox(label="Nome do Modelo")
|
221 |
+
revision_name_textbox = gr.Textbox(label="Commit da Revisão", placeholder="main")
|
222 |
+
model_type = gr.Dropdown(
|
223 |
+
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
224 |
+
label="Tipo do Modelo",
|
225 |
+
multiselect=False,
|
226 |
+
value=None,
|
227 |
+
interactive=True,
|
228 |
+
)
|
229 |
+
|
230 |
+
with gr.Column():
|
231 |
+
precision = gr.Dropdown(
|
232 |
+
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
233 |
+
label="Precisão",
|
234 |
+
multiselect=False,
|
235 |
+
value="float16",
|
236 |
+
interactive=True,
|
237 |
+
)
|
238 |
+
weight_type = gr.Dropdown(
|
239 |
+
choices=[i.value.name for i in WeightType],
|
240 |
+
label="Tipo dos Pesos",
|
241 |
+
multiselect=False,
|
242 |
+
value="Original",
|
243 |
+
interactive=True,
|
244 |
+
)
|
245 |
+
base_model_name_textbox = gr.Textbox(label="Modelo Base (para pesos delta ou adapter)")
|
246 |
+
|
247 |
+
submit_button = gr.Button("Submeter Avaliação")
|
248 |
+
submission_result = gr.Markdown()
|
249 |
+
submit_button.click(
|
250 |
+
add_new_eval,
|
251 |
+
[
|
252 |
+
model_name_textbox,
|
253 |
+
base_model_name_textbox,
|
254 |
+
revision_name_textbox,
|
255 |
+
precision,
|
256 |
+
weight_type,
|
257 |
+
model_type,
|
258 |
+
],
|
259 |
+
submission_result,
|
260 |
+
)
|
261 |
+
|
262 |
+
with gr.Row():
|
263 |
+
with gr.Accordion("📙 Citação", open=False):
|
264 |
+
citation_button = gr.Textbox(
|
265 |
+
value=CITATION_BUTTON_TEXT,
|
266 |
+
label=CITATION_BUTTON_LABEL,
|
267 |
+
lines=20,
|
268 |
+
elem_id="citation-button",
|
269 |
+
show_copy_button=True,
|
270 |
+
)
|
271 |
+
|
272 |
+
scheduler = BackgroundScheduler()
|
273 |
+
scheduler.add_job(restart_space, "interval", seconds=1800)
|
274 |
+
scheduler.start()
|
275 |
+
demo.queue(default_concurrency_limit=40).launch()
|
leaderboard_funcionamento.txt
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Funcionamento da Leaderboard para Avaliação de Modelos
|
2 |
+
|
3 |
+
## Visão Geral
|
4 |
+
Esta leaderboard é uma aplicação web desenvolvida com Gradio que permite avaliar, comparar e submeter modelos de linguagem para benchmarks específicos. O sistema é hospedado na plataforma HuggingFace Spaces e oferece uma interface interativa para visualizar resultados de avaliações de modelos em diferentes tarefas.
|
5 |
+
|
6 |
+
## Estrutura do Aplicativo
|
7 |
+
|
8 |
+
### Abas Principais
|
9 |
+
1. **🏅 LLM Benchmark** - Principal aba que exibe a tabela de classificação dos modelos avaliados
|
10 |
+
2. **📝 About** - Informações sobre a leaderboard, metodologia e funcionamento
|
11 |
+
3. **🚀 Submit here!** - Interface para usuários submeterem seus próprios modelos para avaliação
|
12 |
+
|
13 |
+
### Funcionalidades de Filtragem na Leaderboard
|
14 |
+
A tabela de classificação oferece as seguintes opções de filtragem:
|
15 |
+
|
16 |
+
- **Colunas Selecionáveis** - Permite escolher quais métricas e informações exibir
|
17 |
+
- **Filtros de Tipo de Modelo** - Opção para filtrar por categorias de modelos:
|
18 |
+
- 🟢 Pretrained (Pré-treinados)
|
19 |
+
- 🔶 Fine-tuned (Ajuste fino)
|
20 |
+
- ⭕ Instruction-tuned (Ajustados para instruções)
|
21 |
+
- 🟦 RL-tuned (Ajustados por reinforcement learning)
|
22 |
+
|
23 |
+
- **Filtros de Precisão** - Filtragem por formato de pesos:
|
24 |
+
- float16
|
25 |
+
- bfloat16
|
26 |
+
|
27 |
+
- **Filtro de Parâmetros** - Slider para filtrar por número de parâmetros (0.01B - 150B)
|
28 |
+
- **Filtro de Disponibilidade** - Opção para ocultar modelos excluídos ou incompletos
|
29 |
+
- **Busca por Modelo/Licença** - Campo de busca textual para encontrar modelos específicos
|
30 |
+
|
31 |
+
## Métricas e Benchmarks
|
32 |
+
A leaderboard avalia os modelos em benchmarks específicos:
|
33 |
+
- ANLI (Adversarial Natural Language Inference)
|
34 |
+
- LogiQA (Raciocínio lógico)
|
35 |
+
|
36 |
+
O desempenho final é calculado como a média dos resultados em todas as tarefas avaliadas.
|
37 |
+
|
38 |
+
## Sistema de Submissão
|
39 |
+
O sistema permite que usuários enviem seus modelos para avaliação através do formulário de submissão, que inclui:
|
40 |
+
|
41 |
+
1. **Informações do Modelo:**
|
42 |
+
- Nome do modelo (no formato organization/model)
|
43 |
+
- Revisão/commit específico
|
44 |
+
- Tipo de modelo (pretrained, fine-tuned, etc.)
|
45 |
+
- Precisão (float16, bfloat16)
|
46 |
+
- Tipo de pesos (Original, Adapter, Delta)
|
47 |
+
- Modelo base (para pesos delta ou adapter)
|
48 |
+
|
49 |
+
2. **Filas de Avaliação:**
|
50 |
+
- ✅ Avaliações Concluídas
|
51 |
+
- 🔄 Avaliações em Execução
|
52 |
+
- ⏳ Avaliações Pendentes
|
53 |
+
|
54 |
+
## Requisitos para Submissão
|
55 |
+
Os modelos submetidos devem:
|
56 |
+
1. Ser carregáveis através das classes Auto do Hugging Face
|
57 |
+
2. Preferencialmente usar o formato safetensors para armazenamento de pesos
|
58 |
+
3. Ter uma licença aberta
|
59 |
+
4. Ter um model card devidamente preenchido
|
60 |
+
|
61 |
+
## Backend e Armazenamento
|
62 |
+
A leaderboard utiliza:
|
63 |
+
- Repositórios HuggingFace para armazenar resultados de avaliação e requisições
|
64 |
+
- Datasets HuggingFace para gerenciar as filas de avaliação
|
65 |
+
- Sistema de atualização periódica para manter os dados atualizados
|
66 |
+
|
67 |
+
## Detalhes Técnicos
|
68 |
+
- Implementado usando Gradio para a interface
|
69 |
+
- Utiliza pandas para manipulação e exibição de dados
|
70 |
+
- Componente especializado gradio_leaderboard para a visualização da tabela
|
71 |
+
- Atualização automática da interface a cada 30 minutos
|
72 |
+
- Autenticação via token HF para gerenciamento dos repositórios
|
output/leaderboard_data_20250413_002202.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
T,Modelo,Tipo,Arquitetura,Tipo de Peso,Precisão,Licença,#Params (B),Hub Likes,Disponível no hub,SHA do modelo,Média Geral,Área Médica,Área do Direito,Provas Militares,Computação,Discurso de Ódio,Economia e Contabilidade,Semântica e Inferência,Multidisciplinar,Revalida,MREX,OAB,ENAM,AFA,ITA,IME,POSCOMP,OBI,HateBR,PT Hate Speech,tweetSentBR,BCB,CFCES,FAQUAD NLI,ASSIN2 RTE,ASSIN2 STS,ENEM,BLUEX,CNPU,ENADE,BNDES,CACD (1ª fase),CACD (2ª fase),Datasets Área Médica,Datasets Área do Direito,Datasets Provas Militares,Datasets Computação,Datasets Discurso de Ódio,Datasets Economia e Contabilidade,Datasets Semântica e Inferência,Datasets Multidisciplinar
|
2 |
+
PT,openai/gpt2-portuguese,PT : pré-treinado,,Original,float16,MIT,0.12,268,True,42b7792,0.7105925230941055,0.6188847305300255,0.6701955871546674,0.5883600439376051,0.7344674503873334,0.7475962540883628,0.849576998841669,0.7788317408159661,0.7090867005579059,0.6035585626832006,0.6342108983768503,0.6592469269015914,0.6811442474077435,0.5748457759326684,0.5677492084978396,0.6224851473823073,0.7001717151455216,0.7687631856291454,0.7670186275780199,0.7059318581743916,0.7698382765126768,0.894231731105497,0.8049222665778408,0.7802498017547961,0.744894322166147,0.811351098526955,0.6944001027581704,0.7122388959938982,0.710017915718404,0.7325868004943001,0.6521346635913674,0.7253374839411425,0.7368910414080588,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
3 |
+
PT,rufimelo/bert-large-portuguese-cased,PT : pré-treinado,,Original,bfloat16,MIT,0.34,96,True,b1f4531,0.7663174449695491,0.7602091078571434,0.973711984625266,0.9684507475761012,0.6871687003745621,0.8875668825625945,0.680483320210085,0.7734089183287836,0.6143132582475183,0.8009095654537163,0.7195086502605705,0.99,0.957423969250532,0.9253522427283033,0.99,0.99,0.7048405210681232,0.669496879681001,0.884580080804517,0.8725119298929598,0.9056086369903068,0.6574021331921888,0.7035645072279809,0.7812621953082739,0.7898121521378834,0.7491524075401934,0.6587752839863983,0.6568440899210536,0.5762508105837758,0.5817111877078663,0.6622223097972831,0.6028837938647416,0.5615053318715086,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
4 |
+
FT,unicamp-dl/mbert-portuguese-lener,FT : fine-tuned,,Original,float16,Apache-2.0,0.11,89,True,a764b32,0.5238981255673918,0.46910271433534434,0.6478804077379579,0.4764435027720609,0.6348258377007074,0.5679570688955583,0.545621864602512,0.672668649364131,0.4039265288241587,0.4501996677234772,0.4880057609472114,0.6284122869968649,0.6673485284790508,0.4831702853633705,0.4947379598495716,0.4514222631032407,0.5966991470611256,0.6729525283402894,0.5739315630212218,0.5841033582970794,0.5458362853683737,0.5869583065081572,0.5042854226968669,0.6817845836938061,0.7232768140967074,0.6129445503018796,0.3861249155636754,0.3784890609935717,0.4222874061015168,0.3739044853763201,0.4392823188665487,0.4131461427810349,0.41425137208644314,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
5 |
+
RL,brasileira/llama-2-7b-pt,RL : RL-tuned,,Original,bfloat16,LLAMA 2,7.0,562,True,c24dd37,0.7190013380059581,0.99,0.6277621780492408,0.6710016620169995,0.8100443731646181,0.7199915914170014,0.8883611805594716,0.6228298935870216,0.6546038867904668,0.99,0.99,0.6037868088474728,0.651737547251009,0.7314858380256698,0.6415017399609673,0.6400174080643614,0.8332219046699918,0.7868668416592445,0.7390035975198623,0.737371682844131,0.6835994938870111,0.920605738500256,0.8561166226186872,0.6392000772840843,0.6328464029942036,0.5964432004827769,0.6811458854492797,0.7087276670361912,0.5953130091726074,0.6393706576325692,0.6943992143544933,0.6474751086467531,0.6157956652413729,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
6 |
+
PT,neuralmind/bert-base-portuguese-cased,PT : pré-treinado,,Original,float16,MIT,0.11,153,True,main,0.7903320522382491,0.7525068067882859,0.790212164999971,0.5993185824512812,0.8281524838360852,0.8274517991360355,0.6240401489565062,0.7446158736697582,0.9234267541121511,0.731003703849237,0.7740099097273346,0.7285398327624035,0.8518844972375387,0.566841753054601,0.591417134883207,0.6396968594160356,0.8389798308453639,0.8173251368268065,0.9010121557045129,0.8171834652635162,0.7641597764400775,0.6784864262708603,0.5695938716421521,0.7647064598634123,0.7170997425103216,0.7520414186355404,0.9543505337931829,0.9315956953962545,0.8767306588367528,0.9090563746651121,0.9204153379699338,0.9424590333325819,0.9293796447912399,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
7 |
+
IFT,tulioandrade/mistral-7b-pt-adapter,IFT : instruction-tuned,,Adapter,bfloat16,Apache-2.0,7.2,315,True,main,0.7029740896600845,0.5349382711992174,0.6692960782120554,0.5175172583982789,0.6031745463143081,0.5873322504231425,0.6299480416296037,0.9302316794025772,0.8416315303513803,0.5461004402959746,0.5237761021024602,0.6390972174687846,0.6994949389553262,0.5231758805454357,0.5044300377533704,0.5249458568960303,0.6106598540949609,0.5956892385336552,0.5658410795439047,0.5897717901156058,0.6063838816099171,0.6203659080880489,0.6395301751711585,0.9446092137517881,0.9014725977356242,0.9446132267203191,0.8016106913774061,0.8798065244263629,0.8270460513988156,0.886454918928402,0.7981520314515067,0.8865213636168642,0.8118291312603048,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
8 |
+
IFT,PetroNLP/xlm-roberta-large-portuguese-instruct,IFT : instruction-tuned,,Original,bfloat16,Apache-2.0,0.56,173,True,8a67c19,0.6169427195732342,0.5533574226765212,0.4822753750865628,0.7312559564874549,0.8708286748117062,0.7547818164225487,0.7539473601359346,0.4607262875406695,0.5207879175691644,0.5183490470430046,0.588365798310038,0.47998532940130334,0.4845654207718223,0.6801757920914254,0.7463653840531308,0.7672266933178084,0.891268718690119,0.8503886309332934,0.7692787720488455,0.6844533933473784,0.8106132838714223,0.7078948291402046,0.7999998911316645,0.5057436202874445,0.45824193776195243,0.4181933045726116,0.5408312211672078,0.48116084507473167,0.500552102499582,0.5444194567570633,0.5195728015292668,0.5294734338791449,0.5295055620771545,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
9 |
+
FT,pucpr/biobertpt-bio,FT : fine-tuned,,Original,float16,CC-BY-SA-4.0,0.11,47,True,ab2d4b9,0.6108657601152154,0.702988363094768,0.6872008974588979,0.7613994162478939,0.5591628508904611,0.5708983915370802,0.6267255976277573,0.5200744268291007,0.5645008743970255,0.7228867322736113,0.6830899939159248,0.6957087204307917,0.678693074487004,0.8375613035639826,0.7426540780752489,0.70398286710445,0.5558048959754675,0.5625208058054548,0.5322247419674755,0.5780859026067205,0.6023845300370447,0.6241177598442514,0.6293334354112632,0.5662521904860494,0.49510038883517177,0.49887070116608073,0.6043572001868638,0.6034587214831894,0.5179199478458284,0.5128404234538338,0.5440709445935807,0.6134068257246169,0.5554520574912651,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
10 |
+
IFT,ai-forever/gpt-pequeno-pt,IFT : instruction-tuned,,Original,float16,MIT,1.3,409,True,main,0.75196128994191,0.8670933989159613,0.8348443783260642,0.7223937364955204,0.5604037920638382,0.7773722062427244,0.8326558639935904,0.8139561288452296,0.7022728396080725,0.937260492607467,0.7969263052244557,0.8233168492437696,0.8463719074083589,0.7605339427733732,0.7120343105252683,0.6946129561879192,0.5626662966116917,0.5581412875159847,0.8322948812545272,0.7351328585423854,0.7646888789312607,0.7818810820254256,0.8834306459617549,0.8026792301025911,0.8560128858812226,0.7831762705518751,0.6721405607195572,0.6766199465199311,0.7002599041017649,0.7020562275544762,0.7412478076487755,0.7257566032842915,0.6978288274277119,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
11 |
+
PT,saramago/roberta-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.13,112,True,main,0.9312283237428517,0.6179539103982901,0.99,0.9578333048317907,0.9130800909989987,0.8935172820223088,0.9548532555906575,0.9745700292057768,0.9885633439532743,0.6180836055639037,0.6178242152326763,0.99,0.99,0.9647166009011852,0.9449254684441978,0.9638578451499892,0.9581953587133843,0.8679648232846131,0.9161393680272095,0.8602959374608022,0.9041165405789148,0.9449285867327771,0.9647779244485377,0.99,0.99,0.9437100876173303,0.9799434076729195,0.99,0.99,0.99,0.99,0.99,0.99,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
12 |
+
FT,pierreguillou/bert-base-brpt-clinical,FT : fine-tuned,,Original,float16,MIT,0.11,73,True,c7bef2a,0.5482271789287327,0.4041051952506239,0.6969643618569595,0.6095811315978034,0.6695079353203791,0.6304290306986099,0.4762540627043566,0.5236257225581472,0.48184021849446684,0.37785374915437375,0.43035664134687407,0.6946451229482293,0.69928360076569,0.6208503603101987,0.5800215298462423,0.6278715046369693,0.7053490347771434,0.6336668358636147,0.6278205554878714,0.6237519629728817,0.6397145736350766,0.49769596156757845,0.4548121638411348,0.49858536577398616,0.537775323953586,0.5345164779468695,0.4616904995777591,0.46535002756586913,0.49571926994757104,0.4725184614551914,0.4771470703267811,0.48038288044301486,0.5200733201450809,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
13 |
+
PT,nlp-wyldlab/deberta-v3-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.18,128,True,main,0.7465021220826226,0.5946564630491511,0.6710791004110548,0.8314281273352276,0.6823194837016078,0.7755501949635453,0.8795739094780679,0.7241149481320521,0.7525018864872434,0.5792849969048782,0.6100279291934239,0.6985800323961385,0.6435781684259713,0.7914821771702616,0.8881410293755269,0.8146611754598944,0.6519856198221405,0.712653347581075,0.8175383934276091,0.7687241725532503,0.7403880189097766,0.8527339923251589,0.906413826630977,0.6930474129852162,0.6893290656821874,0.7899683657287526,0.7604902451021204,0.815325076521949,0.6935039827485028,0.7749131065853034,0.6890146652904909,0.7499769597205909,0.7842891694417463,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
output/leaderboard_data_20250413_002339.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
T,Modelo,Tipo,Arquitetura,Tipo de Peso,Precisão,Licença,#Params (B),Hub Likes,Disponível no hub,SHA do modelo,Média Geral,Área Médica,Área do Direito,Provas Militares,Computação,Discurso de Ódio,Economia e Contabilidade,Semântica e Inferência,Multidisciplinar,Revalida,MREX,OAB,ENAM,AFA,ITA,IME,POSCOMP,OBI,HateBR,PT Hate Speech,tweetSentBR,BCB,CFCES,FAQUAD NLI,ASSIN2 RTE,ASSIN2 STS,ENEM,BLUEX,CNPU,ENADE,BNDES,CACD (1ª fase),CACD (2ª fase),Datasets Área Médica,Datasets Área do Direito,Datasets Provas Militares,Datasets Computação,Datasets Discurso de Ódio,Datasets Economia e Contabilidade,Datasets Semântica e Inferência,Datasets Multidisciplinar
|
2 |
+
PT,openai/gpt2-portuguese,PT : pré-treinado,,Original,float16,MIT,0.12,268,True,42b7792,0.7105925230941055,0.6188847305300255,0.6701955871546674,0.5883600439376051,0.7344674503873334,0.7475962540883628,0.849576998841669,0.7788317408159661,0.7090867005579059,0.6035585626832006,0.6342108983768503,0.6592469269015914,0.6811442474077435,0.5748457759326684,0.5677492084978396,0.6224851473823073,0.7001717151455216,0.7687631856291454,0.7670186275780199,0.7059318581743916,0.7698382765126768,0.894231731105497,0.8049222665778408,0.7802498017547961,0.744894322166147,0.811351098526955,0.6944001027581704,0.7122388959938982,0.710017915718404,0.7325868004943001,0.6521346635913674,0.7253374839411425,0.7368910414080588,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
3 |
+
PT,rufimelo/bert-large-portuguese-cased,PT : pré-treinado,,Original,bfloat16,MIT,0.34,96,True,b1f4531,0.7663174449695491,0.7602091078571434,0.973711984625266,0.9684507475761012,0.6871687003745621,0.8875668825625945,0.680483320210085,0.7734089183287836,0.6143132582475183,0.8009095654537163,0.7195086502605705,0.99,0.957423969250532,0.9253522427283033,0.99,0.99,0.7048405210681232,0.669496879681001,0.884580080804517,0.8725119298929598,0.9056086369903068,0.6574021331921888,0.7035645072279809,0.7812621953082739,0.7898121521378834,0.7491524075401934,0.6587752839863983,0.6568440899210536,0.5762508105837758,0.5817111877078663,0.6622223097972831,0.6028837938647416,0.5615053318715086,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
4 |
+
FT,unicamp-dl/mbert-portuguese-lener,FT : fine-tuned,,Original,float16,Apache-2.0,0.11,89,True,a764b32,0.5238981255673918,0.46910271433534434,0.6478804077379579,0.4764435027720609,0.6348258377007074,0.5679570688955583,0.545621864602512,0.672668649364131,0.4039265288241587,0.4501996677234772,0.4880057609472114,0.6284122869968649,0.6673485284790508,0.4831702853633705,0.4947379598495716,0.4514222631032407,0.5966991470611256,0.6729525283402894,0.5739315630212218,0.5841033582970794,0.5458362853683737,0.5869583065081572,0.5042854226968669,0.6817845836938061,0.7232768140967074,0.6129445503018796,0.3861249155636754,0.3784890609935717,0.4222874061015168,0.3739044853763201,0.4392823188665487,0.4131461427810349,0.41425137208644314,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
5 |
+
RL,brasileira/llama-2-7b-pt,RL : RL-tuned,,Original,bfloat16,LLAMA 2,7.0,562,True,c24dd37,0.7190013380059581,0.99,0.6277621780492408,0.6710016620169995,0.8100443731646181,0.7199915914170014,0.8883611805594716,0.6228298935870216,0.6546038867904668,0.99,0.99,0.6037868088474728,0.651737547251009,0.7314858380256698,0.6415017399609673,0.6400174080643614,0.8332219046699918,0.7868668416592445,0.7390035975198623,0.737371682844131,0.6835994938870111,0.920605738500256,0.8561166226186872,0.6392000772840843,0.6328464029942036,0.5964432004827769,0.6811458854492797,0.7087276670361912,0.5953130091726074,0.6393706576325692,0.6943992143544933,0.6474751086467531,0.6157956652413729,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
6 |
+
PT,neuralmind/bert-base-portuguese-cased,PT : pré-treinado,,Original,float16,MIT,0.11,153,True,main,0.7903320522382491,0.7525068067882859,0.790212164999971,0.5993185824512812,0.8281524838360852,0.8274517991360355,0.6240401489565062,0.7446158736697582,0.9234267541121511,0.731003703849237,0.7740099097273346,0.7285398327624035,0.8518844972375387,0.566841753054601,0.591417134883207,0.6396968594160356,0.8389798308453639,0.8173251368268065,0.9010121557045129,0.8171834652635162,0.7641597764400775,0.6784864262708603,0.5695938716421521,0.7647064598634123,0.7170997425103216,0.7520414186355404,0.9543505337931829,0.9315956953962545,0.8767306588367528,0.9090563746651121,0.9204153379699338,0.9424590333325819,0.9293796447912399,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
7 |
+
IFT,tulioandrade/mistral-7b-pt-adapter,IFT : instruction-tuned,,Adapter,bfloat16,Apache-2.0,7.2,315,True,main,0.7029740896600845,0.5349382711992174,0.6692960782120554,0.5175172583982789,0.6031745463143081,0.5873322504231425,0.6299480416296037,0.9302316794025772,0.8416315303513803,0.5461004402959746,0.5237761021024602,0.6390972174687846,0.6994949389553262,0.5231758805454357,0.5044300377533704,0.5249458568960303,0.6106598540949609,0.5956892385336552,0.5658410795439047,0.5897717901156058,0.6063838816099171,0.6203659080880489,0.6395301751711585,0.9446092137517881,0.9014725977356242,0.9446132267203191,0.8016106913774061,0.8798065244263629,0.8270460513988156,0.886454918928402,0.7981520314515067,0.8865213636168642,0.8118291312603048,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
8 |
+
IFT,PetroNLP/xlm-roberta-large-portuguese-instruct,IFT : instruction-tuned,,Original,bfloat16,Apache-2.0,0.56,173,True,8a67c19,0.6169427195732342,0.5533574226765212,0.4822753750865628,0.7312559564874549,0.8708286748117062,0.7547818164225487,0.7539473601359346,0.4607262875406695,0.5207879175691644,0.5183490470430046,0.588365798310038,0.47998532940130334,0.4845654207718223,0.6801757920914254,0.7463653840531308,0.7672266933178084,0.891268718690119,0.8503886309332934,0.7692787720488455,0.6844533933473784,0.8106132838714223,0.7078948291402046,0.7999998911316645,0.5057436202874445,0.45824193776195243,0.4181933045726116,0.5408312211672078,0.48116084507473167,0.500552102499582,0.5444194567570633,0.5195728015292668,0.5294734338791449,0.5295055620771545,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
9 |
+
FT,pucpr/biobertpt-bio,FT : fine-tuned,,Original,float16,CC-BY-SA-4.0,0.11,47,True,ab2d4b9,0.6108657601152154,0.702988363094768,0.6872008974588979,0.7613994162478939,0.5591628508904611,0.5708983915370802,0.6267255976277573,0.5200744268291007,0.5645008743970255,0.7228867322736113,0.6830899939159248,0.6957087204307917,0.678693074487004,0.8375613035639826,0.7426540780752489,0.70398286710445,0.5558048959754675,0.5625208058054548,0.5322247419674755,0.5780859026067205,0.6023845300370447,0.6241177598442514,0.6293334354112632,0.5662521904860494,0.49510038883517177,0.49887070116608073,0.6043572001868638,0.6034587214831894,0.5179199478458284,0.5128404234538338,0.5440709445935807,0.6134068257246169,0.5554520574912651,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
10 |
+
IFT,ai-forever/gpt-pequeno-pt,IFT : instruction-tuned,,Original,float16,MIT,1.3,409,True,main,0.75196128994191,0.8670933989159613,0.8348443783260642,0.7223937364955204,0.5604037920638382,0.7773722062427244,0.8326558639935904,0.8139561288452296,0.7022728396080725,0.937260492607467,0.7969263052244557,0.8233168492437696,0.8463719074083589,0.7605339427733732,0.7120343105252683,0.6946129561879192,0.5626662966116917,0.5581412875159847,0.8322948812545272,0.7351328585423854,0.7646888789312607,0.7818810820254256,0.8834306459617549,0.8026792301025911,0.8560128858812226,0.7831762705518751,0.6721405607195572,0.6766199465199311,0.7002599041017649,0.7020562275544762,0.7412478076487755,0.7257566032842915,0.6978288274277119,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
11 |
+
PT,saramago/roberta-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.13,112,True,main,0.9312283237428517,0.6179539103982901,0.99,0.9578333048317907,0.9130800909989987,0.8935172820223088,0.9548532555906575,0.9745700292057768,0.9885633439532743,0.6180836055639037,0.6178242152326763,0.99,0.99,0.9647166009011852,0.9449254684441978,0.9638578451499892,0.9581953587133843,0.8679648232846131,0.9161393680272095,0.8602959374608022,0.9041165405789148,0.9449285867327771,0.9647779244485377,0.99,0.99,0.9437100876173303,0.9799434076729195,0.99,0.99,0.99,0.99,0.99,0.99,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
12 |
+
FT,pierreguillou/bert-base-brpt-clinical,FT : fine-tuned,,Original,float16,MIT,0.11,73,True,c7bef2a,0.5482271789287327,0.4041051952506239,0.6969643618569595,0.6095811315978034,0.6695079353203791,0.6304290306986099,0.4762540627043566,0.5236257225581472,0.48184021849446684,0.37785374915437375,0.43035664134687407,0.6946451229482293,0.69928360076569,0.6208503603101987,0.5800215298462423,0.6278715046369693,0.7053490347771434,0.6336668358636147,0.6278205554878714,0.6237519629728817,0.6397145736350766,0.49769596156757845,0.4548121638411348,0.49858536577398616,0.537775323953586,0.5345164779468695,0.4616904995777591,0.46535002756586913,0.49571926994757104,0.4725184614551914,0.4771470703267811,0.48038288044301486,0.5200733201450809,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
13 |
+
PT,nlp-wyldlab/deberta-v3-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.18,128,True,main,0.7465021220826226,0.5946564630491511,0.6710791004110548,0.8314281273352276,0.6823194837016078,0.7755501949635453,0.8795739094780679,0.7241149481320521,0.7525018864872434,0.5792849969048782,0.6100279291934239,0.6985800323961385,0.6435781684259713,0.7914821771702616,0.8881410293755269,0.8146611754598944,0.6519856198221405,0.712653347581075,0.8175383934276091,0.7687241725532503,0.7403880189097766,0.8527339923251589,0.906413826630977,0.6930474129852162,0.6893290656821874,0.7899683657287526,0.7604902451021204,0.815325076521949,0.6935039827485028,0.7749131065853034,0.6890146652904909,0.7499769597205909,0.7842891694417463,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
|
output/leaderboard_data_20250413_002339.json
ADDED
@@ -0,0 +1,650 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"T":"PT",
|
4 |
+
"Modelo":"openai\/gpt2-portuguese",
|
5 |
+
"Tipo":"PT : pr\u00e9-treinado",
|
6 |
+
"Arquitetura":"",
|
7 |
+
"Tipo de Peso":"Original",
|
8 |
+
"Precis\u00e3o":"float16",
|
9 |
+
"Licen\u00e7a":"MIT",
|
10 |
+
"#Params (B)":0.12,
|
11 |
+
"Hub Likes":268,
|
12 |
+
"Dispon\u00edvel no hub":true,
|
13 |
+
"SHA do modelo":"42b7792",
|
14 |
+
"M\u00e9dia Geral":0.7105925231,
|
15 |
+
"\u00c1rea M\u00e9dica":0.6188847305,
|
16 |
+
"\u00c1rea do Direito":0.6701955872,
|
17 |
+
"Provas Militares":0.5883600439,
|
18 |
+
"Computa\u00e7\u00e3o":0.7344674504,
|
19 |
+
"Discurso de \u00d3dio":0.7475962541,
|
20 |
+
"Economia e Contabilidade":0.8495769988,
|
21 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.7788317408,
|
22 |
+
"Multidisciplinar":0.7090867006,
|
23 |
+
"Revalida":0.6035585627,
|
24 |
+
"MREX":0.6342108984,
|
25 |
+
"OAB":0.6592469269,
|
26 |
+
"ENAM":0.6811442474,
|
27 |
+
"AFA":0.5748457759,
|
28 |
+
"ITA":0.5677492085,
|
29 |
+
"IME":0.6224851474,
|
30 |
+
"POSCOMP":0.7001717151,
|
31 |
+
"OBI":0.7687631856,
|
32 |
+
"HateBR":0.7670186276,
|
33 |
+
"PT Hate Speech":0.7059318582,
|
34 |
+
"tweetSentBR":0.7698382765,
|
35 |
+
"BCB":0.8942317311,
|
36 |
+
"CFCES":0.8049222666,
|
37 |
+
"FAQUAD NLI":0.7802498018,
|
38 |
+
"ASSIN2 RTE":0.7448943222,
|
39 |
+
"ASSIN2 STS":0.8113510985,
|
40 |
+
"ENEM":0.6944001028,
|
41 |
+
"BLUEX":0.712238896,
|
42 |
+
"CNPU":0.7100179157,
|
43 |
+
"ENADE":0.7325868005,
|
44 |
+
"BNDES":0.6521346636,
|
45 |
+
"CACD (1\u00aa fase)":0.7253374839,
|
46 |
+
"CACD (2\u00aa fase)":0.7368910414,
|
47 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
48 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
49 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
50 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
51 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
52 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
53 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
54 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"T":"PT",
|
58 |
+
"Modelo":"rufimelo\/bert-large-portuguese-cased",
|
59 |
+
"Tipo":"PT : pr\u00e9-treinado",
|
60 |
+
"Arquitetura":"",
|
61 |
+
"Tipo de Peso":"Original",
|
62 |
+
"Precis\u00e3o":"bfloat16",
|
63 |
+
"Licen\u00e7a":"MIT",
|
64 |
+
"#Params (B)":0.34,
|
65 |
+
"Hub Likes":96,
|
66 |
+
"Dispon\u00edvel no hub":true,
|
67 |
+
"SHA do modelo":"b1f4531",
|
68 |
+
"M\u00e9dia Geral":0.766317445,
|
69 |
+
"\u00c1rea M\u00e9dica":0.7602091079,
|
70 |
+
"\u00c1rea do Direito":0.9737119846,
|
71 |
+
"Provas Militares":0.9684507476,
|
72 |
+
"Computa\u00e7\u00e3o":0.6871687004,
|
73 |
+
"Discurso de \u00d3dio":0.8875668826,
|
74 |
+
"Economia e Contabilidade":0.6804833202,
|
75 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.7734089183,
|
76 |
+
"Multidisciplinar":0.6143132582,
|
77 |
+
"Revalida":0.8009095655,
|
78 |
+
"MREX":0.7195086503,
|
79 |
+
"OAB":0.99,
|
80 |
+
"ENAM":0.9574239693,
|
81 |
+
"AFA":0.9253522427,
|
82 |
+
"ITA":0.99,
|
83 |
+
"IME":0.99,
|
84 |
+
"POSCOMP":0.7048405211,
|
85 |
+
"OBI":0.6694968797,
|
86 |
+
"HateBR":0.8845800808,
|
87 |
+
"PT Hate Speech":0.8725119299,
|
88 |
+
"tweetSentBR":0.905608637,
|
89 |
+
"BCB":0.6574021332,
|
90 |
+
"CFCES":0.7035645072,
|
91 |
+
"FAQUAD NLI":0.7812621953,
|
92 |
+
"ASSIN2 RTE":0.7898121521,
|
93 |
+
"ASSIN2 STS":0.7491524075,
|
94 |
+
"ENEM":0.658775284,
|
95 |
+
"BLUEX":0.6568440899,
|
96 |
+
"CNPU":0.5762508106,
|
97 |
+
"ENADE":0.5817111877,
|
98 |
+
"BNDES":0.6622223098,
|
99 |
+
"CACD (1\u00aa fase)":0.6028837939,
|
100 |
+
"CACD (2\u00aa fase)":0.5615053319,
|
101 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
102 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
103 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
104 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
105 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
106 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
107 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
108 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"T":"FT",
|
112 |
+
"Modelo":"unicamp-dl\/mbert-portuguese-lener",
|
113 |
+
"Tipo":"FT : fine-tuned",
|
114 |
+
"Arquitetura":"",
|
115 |
+
"Tipo de Peso":"Original",
|
116 |
+
"Precis\u00e3o":"float16",
|
117 |
+
"Licen\u00e7a":"Apache-2.0",
|
118 |
+
"#Params (B)":0.11,
|
119 |
+
"Hub Likes":89,
|
120 |
+
"Dispon\u00edvel no hub":true,
|
121 |
+
"SHA do modelo":"a764b32",
|
122 |
+
"M\u00e9dia Geral":0.5238981256,
|
123 |
+
"\u00c1rea M\u00e9dica":0.4691027143,
|
124 |
+
"\u00c1rea do Direito":0.6478804077,
|
125 |
+
"Provas Militares":0.4764435028,
|
126 |
+
"Computa\u00e7\u00e3o":0.6348258377,
|
127 |
+
"Discurso de \u00d3dio":0.5679570689,
|
128 |
+
"Economia e Contabilidade":0.5456218646,
|
129 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.6726686494,
|
130 |
+
"Multidisciplinar":0.4039265288,
|
131 |
+
"Revalida":0.4501996677,
|
132 |
+
"MREX":0.4880057609,
|
133 |
+
"OAB":0.628412287,
|
134 |
+
"ENAM":0.6673485285,
|
135 |
+
"AFA":0.4831702854,
|
136 |
+
"ITA":0.4947379598,
|
137 |
+
"IME":0.4514222631,
|
138 |
+
"POSCOMP":0.5966991471,
|
139 |
+
"OBI":0.6729525283,
|
140 |
+
"HateBR":0.573931563,
|
141 |
+
"PT Hate Speech":0.5841033583,
|
142 |
+
"tweetSentBR":0.5458362854,
|
143 |
+
"BCB":0.5869583065,
|
144 |
+
"CFCES":0.5042854227,
|
145 |
+
"FAQUAD NLI":0.6817845837,
|
146 |
+
"ASSIN2 RTE":0.7232768141,
|
147 |
+
"ASSIN2 STS":0.6129445503,
|
148 |
+
"ENEM":0.3861249156,
|
149 |
+
"BLUEX":0.378489061,
|
150 |
+
"CNPU":0.4222874061,
|
151 |
+
"ENADE":0.3739044854,
|
152 |
+
"BNDES":0.4392823189,
|
153 |
+
"CACD (1\u00aa fase)":0.4131461428,
|
154 |
+
"CACD (2\u00aa fase)":0.4142513721,
|
155 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
156 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
157 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
158 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
159 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
160 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
161 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
162 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"T":"RL",
|
166 |
+
"Modelo":"brasileira\/llama-2-7b-pt",
|
167 |
+
"Tipo":"RL : RL-tuned",
|
168 |
+
"Arquitetura":"",
|
169 |
+
"Tipo de Peso":"Original",
|
170 |
+
"Precis\u00e3o":"bfloat16",
|
171 |
+
"Licen\u00e7a":"LLAMA 2",
|
172 |
+
"#Params (B)":7.0,
|
173 |
+
"Hub Likes":562,
|
174 |
+
"Dispon\u00edvel no hub":true,
|
175 |
+
"SHA do modelo":"c24dd37",
|
176 |
+
"M\u00e9dia Geral":0.719001338,
|
177 |
+
"\u00c1rea M\u00e9dica":0.99,
|
178 |
+
"\u00c1rea do Direito":0.627762178,
|
179 |
+
"Provas Militares":0.671001662,
|
180 |
+
"Computa\u00e7\u00e3o":0.8100443732,
|
181 |
+
"Discurso de \u00d3dio":0.7199915914,
|
182 |
+
"Economia e Contabilidade":0.8883611806,
|
183 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.6228298936,
|
184 |
+
"Multidisciplinar":0.6546038868,
|
185 |
+
"Revalida":0.99,
|
186 |
+
"MREX":0.99,
|
187 |
+
"OAB":0.6037868088,
|
188 |
+
"ENAM":0.6517375473,
|
189 |
+
"AFA":0.731485838,
|
190 |
+
"ITA":0.64150174,
|
191 |
+
"IME":0.6400174081,
|
192 |
+
"POSCOMP":0.8332219047,
|
193 |
+
"OBI":0.7868668417,
|
194 |
+
"HateBR":0.7390035975,
|
195 |
+
"PT Hate Speech":0.7373716828,
|
196 |
+
"tweetSentBR":0.6835994939,
|
197 |
+
"BCB":0.9206057385,
|
198 |
+
"CFCES":0.8561166226,
|
199 |
+
"FAQUAD NLI":0.6392000773,
|
200 |
+
"ASSIN2 RTE":0.632846403,
|
201 |
+
"ASSIN2 STS":0.5964432005,
|
202 |
+
"ENEM":0.6811458854,
|
203 |
+
"BLUEX":0.708727667,
|
204 |
+
"CNPU":0.5953130092,
|
205 |
+
"ENADE":0.6393706576,
|
206 |
+
"BNDES":0.6943992144,
|
207 |
+
"CACD (1\u00aa fase)":0.6474751086,
|
208 |
+
"CACD (2\u00aa fase)":0.6157956652,
|
209 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
210 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
211 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
212 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
213 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
214 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
215 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
216 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"T":"PT",
|
220 |
+
"Modelo":"neuralmind\/bert-base-portuguese-cased",
|
221 |
+
"Tipo":"PT : pr\u00e9-treinado",
|
222 |
+
"Arquitetura":"",
|
223 |
+
"Tipo de Peso":"Original",
|
224 |
+
"Precis\u00e3o":"float16",
|
225 |
+
"Licen\u00e7a":"MIT",
|
226 |
+
"#Params (B)":0.11,
|
227 |
+
"Hub Likes":153,
|
228 |
+
"Dispon\u00edvel no hub":true,
|
229 |
+
"SHA do modelo":"main",
|
230 |
+
"M\u00e9dia Geral":0.7903320522,
|
231 |
+
"\u00c1rea M\u00e9dica":0.7525068068,
|
232 |
+
"\u00c1rea do Direito":0.790212165,
|
233 |
+
"Provas Militares":0.5993185825,
|
234 |
+
"Computa\u00e7\u00e3o":0.8281524838,
|
235 |
+
"Discurso de \u00d3dio":0.8274517991,
|
236 |
+
"Economia e Contabilidade":0.624040149,
|
237 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.7446158737,
|
238 |
+
"Multidisciplinar":0.9234267541,
|
239 |
+
"Revalida":0.7310037038,
|
240 |
+
"MREX":0.7740099097,
|
241 |
+
"OAB":0.7285398328,
|
242 |
+
"ENAM":0.8518844972,
|
243 |
+
"AFA":0.5668417531,
|
244 |
+
"ITA":0.5914171349,
|
245 |
+
"IME":0.6396968594,
|
246 |
+
"POSCOMP":0.8389798308,
|
247 |
+
"OBI":0.8173251368,
|
248 |
+
"HateBR":0.9010121557,
|
249 |
+
"PT Hate Speech":0.8171834653,
|
250 |
+
"tweetSentBR":0.7641597764,
|
251 |
+
"BCB":0.6784864263,
|
252 |
+
"CFCES":0.5695938716,
|
253 |
+
"FAQUAD NLI":0.7647064599,
|
254 |
+
"ASSIN2 RTE":0.7170997425,
|
255 |
+
"ASSIN2 STS":0.7520414186,
|
256 |
+
"ENEM":0.9543505338,
|
257 |
+
"BLUEX":0.9315956954,
|
258 |
+
"CNPU":0.8767306588,
|
259 |
+
"ENADE":0.9090563747,
|
260 |
+
"BNDES":0.920415338,
|
261 |
+
"CACD (1\u00aa fase)":0.9424590333,
|
262 |
+
"CACD (2\u00aa fase)":0.9293796448,
|
263 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
264 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
265 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
266 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
267 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
268 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
269 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
270 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"T":"IFT",
|
274 |
+
"Modelo":"tulioandrade\/mistral-7b-pt-adapter",
|
275 |
+
"Tipo":"IFT : instruction-tuned",
|
276 |
+
"Arquitetura":"",
|
277 |
+
"Tipo de Peso":"Adapter",
|
278 |
+
"Precis\u00e3o":"bfloat16",
|
279 |
+
"Licen\u00e7a":"Apache-2.0",
|
280 |
+
"#Params (B)":7.2,
|
281 |
+
"Hub Likes":315,
|
282 |
+
"Dispon\u00edvel no hub":true,
|
283 |
+
"SHA do modelo":"main",
|
284 |
+
"M\u00e9dia Geral":0.7029740897,
|
285 |
+
"\u00c1rea M\u00e9dica":0.5349382712,
|
286 |
+
"\u00c1rea do Direito":0.6692960782,
|
287 |
+
"Provas Militares":0.5175172584,
|
288 |
+
"Computa\u00e7\u00e3o":0.6031745463,
|
289 |
+
"Discurso de \u00d3dio":0.5873322504,
|
290 |
+
"Economia e Contabilidade":0.6299480416,
|
291 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.9302316794,
|
292 |
+
"Multidisciplinar":0.8416315304,
|
293 |
+
"Revalida":0.5461004403,
|
294 |
+
"MREX":0.5237761021,
|
295 |
+
"OAB":0.6390972175,
|
296 |
+
"ENAM":0.699494939,
|
297 |
+
"AFA":0.5231758805,
|
298 |
+
"ITA":0.5044300378,
|
299 |
+
"IME":0.5249458569,
|
300 |
+
"POSCOMP":0.6106598541,
|
301 |
+
"OBI":0.5956892385,
|
302 |
+
"HateBR":0.5658410795,
|
303 |
+
"PT Hate Speech":0.5897717901,
|
304 |
+
"tweetSentBR":0.6063838816,
|
305 |
+
"BCB":0.6203659081,
|
306 |
+
"CFCES":0.6395301752,
|
307 |
+
"FAQUAD NLI":0.9446092138,
|
308 |
+
"ASSIN2 RTE":0.9014725977,
|
309 |
+
"ASSIN2 STS":0.9446132267,
|
310 |
+
"ENEM":0.8016106914,
|
311 |
+
"BLUEX":0.8798065244,
|
312 |
+
"CNPU":0.8270460514,
|
313 |
+
"ENADE":0.8864549189,
|
314 |
+
"BNDES":0.7981520315,
|
315 |
+
"CACD (1\u00aa fase)":0.8865213636,
|
316 |
+
"CACD (2\u00aa fase)":0.8118291313,
|
317 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
318 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
319 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
320 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
321 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
322 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
323 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
324 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"T":"IFT",
|
328 |
+
"Modelo":"PetroNLP\/xlm-roberta-large-portuguese-instruct",
|
329 |
+
"Tipo":"IFT : instruction-tuned",
|
330 |
+
"Arquitetura":"",
|
331 |
+
"Tipo de Peso":"Original",
|
332 |
+
"Precis\u00e3o":"bfloat16",
|
333 |
+
"Licen\u00e7a":"Apache-2.0",
|
334 |
+
"#Params (B)":0.56,
|
335 |
+
"Hub Likes":173,
|
336 |
+
"Dispon\u00edvel no hub":true,
|
337 |
+
"SHA do modelo":"8a67c19",
|
338 |
+
"M\u00e9dia Geral":0.6169427196,
|
339 |
+
"\u00c1rea M\u00e9dica":0.5533574227,
|
340 |
+
"\u00c1rea do Direito":0.4822753751,
|
341 |
+
"Provas Militares":0.7312559565,
|
342 |
+
"Computa\u00e7\u00e3o":0.8708286748,
|
343 |
+
"Discurso de \u00d3dio":0.7547818164,
|
344 |
+
"Economia e Contabilidade":0.7539473601,
|
345 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.4607262875,
|
346 |
+
"Multidisciplinar":0.5207879176,
|
347 |
+
"Revalida":0.518349047,
|
348 |
+
"MREX":0.5883657983,
|
349 |
+
"OAB":0.4799853294,
|
350 |
+
"ENAM":0.4845654208,
|
351 |
+
"AFA":0.6801757921,
|
352 |
+
"ITA":0.7463653841,
|
353 |
+
"IME":0.7672266933,
|
354 |
+
"POSCOMP":0.8912687187,
|
355 |
+
"OBI":0.8503886309,
|
356 |
+
"HateBR":0.769278772,
|
357 |
+
"PT Hate Speech":0.6844533933,
|
358 |
+
"tweetSentBR":0.8106132839,
|
359 |
+
"BCB":0.7078948291,
|
360 |
+
"CFCES":0.7999998911,
|
361 |
+
"FAQUAD NLI":0.5057436203,
|
362 |
+
"ASSIN2 RTE":0.4582419378,
|
363 |
+
"ASSIN2 STS":0.4181933046,
|
364 |
+
"ENEM":0.5408312212,
|
365 |
+
"BLUEX":0.4811608451,
|
366 |
+
"CNPU":0.5005521025,
|
367 |
+
"ENADE":0.5444194568,
|
368 |
+
"BNDES":0.5195728015,
|
369 |
+
"CACD (1\u00aa fase)":0.5294734339,
|
370 |
+
"CACD (2\u00aa fase)":0.5295055621,
|
371 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
372 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
373 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
374 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
375 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
376 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
377 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
378 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
379 |
+
},
|
380 |
+
{
|
381 |
+
"T":"FT",
|
382 |
+
"Modelo":"pucpr\/biobertpt-bio",
|
383 |
+
"Tipo":"FT : fine-tuned",
|
384 |
+
"Arquitetura":"",
|
385 |
+
"Tipo de Peso":"Original",
|
386 |
+
"Precis\u00e3o":"float16",
|
387 |
+
"Licen\u00e7a":"CC-BY-SA-4.0",
|
388 |
+
"#Params (B)":0.11,
|
389 |
+
"Hub Likes":47,
|
390 |
+
"Dispon\u00edvel no hub":true,
|
391 |
+
"SHA do modelo":"ab2d4b9",
|
392 |
+
"M\u00e9dia Geral":0.6108657601,
|
393 |
+
"\u00c1rea M\u00e9dica":0.7029883631,
|
394 |
+
"\u00c1rea do Direito":0.6872008975,
|
395 |
+
"Provas Militares":0.7613994162,
|
396 |
+
"Computa\u00e7\u00e3o":0.5591628509,
|
397 |
+
"Discurso de \u00d3dio":0.5708983915,
|
398 |
+
"Economia e Contabilidade":0.6267255976,
|
399 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.5200744268,
|
400 |
+
"Multidisciplinar":0.5645008744,
|
401 |
+
"Revalida":0.7228867323,
|
402 |
+
"MREX":0.6830899939,
|
403 |
+
"OAB":0.6957087204,
|
404 |
+
"ENAM":0.6786930745,
|
405 |
+
"AFA":0.8375613036,
|
406 |
+
"ITA":0.7426540781,
|
407 |
+
"IME":0.7039828671,
|
408 |
+
"POSCOMP":0.555804896,
|
409 |
+
"OBI":0.5625208058,
|
410 |
+
"HateBR":0.532224742,
|
411 |
+
"PT Hate Speech":0.5780859026,
|
412 |
+
"tweetSentBR":0.60238453,
|
413 |
+
"BCB":0.6241177598,
|
414 |
+
"CFCES":0.6293334354,
|
415 |
+
"FAQUAD NLI":0.5662521905,
|
416 |
+
"ASSIN2 RTE":0.4951003888,
|
417 |
+
"ASSIN2 STS":0.4988707012,
|
418 |
+
"ENEM":0.6043572002,
|
419 |
+
"BLUEX":0.6034587215,
|
420 |
+
"CNPU":0.5179199478,
|
421 |
+
"ENADE":0.5128404235,
|
422 |
+
"BNDES":0.5440709446,
|
423 |
+
"CACD (1\u00aa fase)":0.6134068257,
|
424 |
+
"CACD (2\u00aa fase)":0.5554520575,
|
425 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
426 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
427 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
428 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
429 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
430 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
431 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
432 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
433 |
+
},
|
434 |
+
{
|
435 |
+
"T":"IFT",
|
436 |
+
"Modelo":"ai-forever\/gpt-pequeno-pt",
|
437 |
+
"Tipo":"IFT : instruction-tuned",
|
438 |
+
"Arquitetura":"",
|
439 |
+
"Tipo de Peso":"Original",
|
440 |
+
"Precis\u00e3o":"float16",
|
441 |
+
"Licen\u00e7a":"MIT",
|
442 |
+
"#Params (B)":1.3,
|
443 |
+
"Hub Likes":409,
|
444 |
+
"Dispon\u00edvel no hub":true,
|
445 |
+
"SHA do modelo":"main",
|
446 |
+
"M\u00e9dia Geral":0.7519612899,
|
447 |
+
"\u00c1rea M\u00e9dica":0.8670933989,
|
448 |
+
"\u00c1rea do Direito":0.8348443783,
|
449 |
+
"Provas Militares":0.7223937365,
|
450 |
+
"Computa\u00e7\u00e3o":0.5604037921,
|
451 |
+
"Discurso de \u00d3dio":0.7773722062,
|
452 |
+
"Economia e Contabilidade":0.832655864,
|
453 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.8139561288,
|
454 |
+
"Multidisciplinar":0.7022728396,
|
455 |
+
"Revalida":0.9372604926,
|
456 |
+
"MREX":0.7969263052,
|
457 |
+
"OAB":0.8233168492,
|
458 |
+
"ENAM":0.8463719074,
|
459 |
+
"AFA":0.7605339428,
|
460 |
+
"ITA":0.7120343105,
|
461 |
+
"IME":0.6946129562,
|
462 |
+
"POSCOMP":0.5626662966,
|
463 |
+
"OBI":0.5581412875,
|
464 |
+
"HateBR":0.8322948813,
|
465 |
+
"PT Hate Speech":0.7351328585,
|
466 |
+
"tweetSentBR":0.7646888789,
|
467 |
+
"BCB":0.781881082,
|
468 |
+
"CFCES":0.883430646,
|
469 |
+
"FAQUAD NLI":0.8026792301,
|
470 |
+
"ASSIN2 RTE":0.8560128859,
|
471 |
+
"ASSIN2 STS":0.7831762706,
|
472 |
+
"ENEM":0.6721405607,
|
473 |
+
"BLUEX":0.6766199465,
|
474 |
+
"CNPU":0.7002599041,
|
475 |
+
"ENADE":0.7020562276,
|
476 |
+
"BNDES":0.7412478076,
|
477 |
+
"CACD (1\u00aa fase)":0.7257566033,
|
478 |
+
"CACD (2\u00aa fase)":0.6978288274,
|
479 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
480 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
481 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
482 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
483 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
484 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
485 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
486 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"T":"PT",
|
490 |
+
"Modelo":"saramago\/roberta-base-portuguese",
|
491 |
+
"Tipo":"PT : pr\u00e9-treinado",
|
492 |
+
"Arquitetura":"",
|
493 |
+
"Tipo de Peso":"Original",
|
494 |
+
"Precis\u00e3o":"float16",
|
495 |
+
"Licen\u00e7a":"MIT",
|
496 |
+
"#Params (B)":0.13,
|
497 |
+
"Hub Likes":112,
|
498 |
+
"Dispon\u00edvel no hub":true,
|
499 |
+
"SHA do modelo":"main",
|
500 |
+
"M\u00e9dia Geral":0.9312283237,
|
501 |
+
"\u00c1rea M\u00e9dica":0.6179539104,
|
502 |
+
"\u00c1rea do Direito":0.99,
|
503 |
+
"Provas Militares":0.9578333048,
|
504 |
+
"Computa\u00e7\u00e3o":0.913080091,
|
505 |
+
"Discurso de \u00d3dio":0.893517282,
|
506 |
+
"Economia e Contabilidade":0.9548532556,
|
507 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.9745700292,
|
508 |
+
"Multidisciplinar":0.988563344,
|
509 |
+
"Revalida":0.6180836056,
|
510 |
+
"MREX":0.6178242152,
|
511 |
+
"OAB":0.99,
|
512 |
+
"ENAM":0.99,
|
513 |
+
"AFA":0.9647166009,
|
514 |
+
"ITA":0.9449254684,
|
515 |
+
"IME":0.9638578451,
|
516 |
+
"POSCOMP":0.9581953587,
|
517 |
+
"OBI":0.8679648233,
|
518 |
+
"HateBR":0.916139368,
|
519 |
+
"PT Hate Speech":0.8602959375,
|
520 |
+
"tweetSentBR":0.9041165406,
|
521 |
+
"BCB":0.9449285867,
|
522 |
+
"CFCES":0.9647779244,
|
523 |
+
"FAQUAD NLI":0.99,
|
524 |
+
"ASSIN2 RTE":0.99,
|
525 |
+
"ASSIN2 STS":0.9437100876,
|
526 |
+
"ENEM":0.9799434077,
|
527 |
+
"BLUEX":0.99,
|
528 |
+
"CNPU":0.99,
|
529 |
+
"ENADE":0.99,
|
530 |
+
"BNDES":0.99,
|
531 |
+
"CACD (1\u00aa fase)":0.99,
|
532 |
+
"CACD (2\u00aa fase)":0.99,
|
533 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
534 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
535 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
536 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
537 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
538 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
539 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
540 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
541 |
+
},
|
542 |
+
{
|
543 |
+
"T":"FT",
|
544 |
+
"Modelo":"pierreguillou\/bert-base-brpt-clinical",
|
545 |
+
"Tipo":"FT : fine-tuned",
|
546 |
+
"Arquitetura":"",
|
547 |
+
"Tipo de Peso":"Original",
|
548 |
+
"Precis\u00e3o":"float16",
|
549 |
+
"Licen\u00e7a":"MIT",
|
550 |
+
"#Params (B)":0.11,
|
551 |
+
"Hub Likes":73,
|
552 |
+
"Dispon\u00edvel no hub":true,
|
553 |
+
"SHA do modelo":"c7bef2a",
|
554 |
+
"M\u00e9dia Geral":0.5482271789,
|
555 |
+
"\u00c1rea M\u00e9dica":0.4041051953,
|
556 |
+
"\u00c1rea do Direito":0.6969643619,
|
557 |
+
"Provas Militares":0.6095811316,
|
558 |
+
"Computa\u00e7\u00e3o":0.6695079353,
|
559 |
+
"Discurso de \u00d3dio":0.6304290307,
|
560 |
+
"Economia e Contabilidade":0.4762540627,
|
561 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.5236257226,
|
562 |
+
"Multidisciplinar":0.4818402185,
|
563 |
+
"Revalida":0.3778537492,
|
564 |
+
"MREX":0.4303566413,
|
565 |
+
"OAB":0.6946451229,
|
566 |
+
"ENAM":0.6992836008,
|
567 |
+
"AFA":0.6208503603,
|
568 |
+
"ITA":0.5800215298,
|
569 |
+
"IME":0.6278715046,
|
570 |
+
"POSCOMP":0.7053490348,
|
571 |
+
"OBI":0.6336668359,
|
572 |
+
"HateBR":0.6278205555,
|
573 |
+
"PT Hate Speech":0.623751963,
|
574 |
+
"tweetSentBR":0.6397145736,
|
575 |
+
"BCB":0.4976959616,
|
576 |
+
"CFCES":0.4548121638,
|
577 |
+
"FAQUAD NLI":0.4985853658,
|
578 |
+
"ASSIN2 RTE":0.537775324,
|
579 |
+
"ASSIN2 STS":0.5345164779,
|
580 |
+
"ENEM":0.4616904996,
|
581 |
+
"BLUEX":0.4653500276,
|
582 |
+
"CNPU":0.4957192699,
|
583 |
+
"ENADE":0.4725184615,
|
584 |
+
"BNDES":0.4771470703,
|
585 |
+
"CACD (1\u00aa fase)":0.4803828804,
|
586 |
+
"CACD (2\u00aa fase)":0.5200733201,
|
587 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
588 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
589 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
590 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
591 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
592 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
593 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
594 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
595 |
+
},
|
596 |
+
{
|
597 |
+
"T":"PT",
|
598 |
+
"Modelo":"nlp-wyldlab\/deberta-v3-base-portuguese",
|
599 |
+
"Tipo":"PT : pr\u00e9-treinado",
|
600 |
+
"Arquitetura":"",
|
601 |
+
"Tipo de Peso":"Original",
|
602 |
+
"Precis\u00e3o":"float16",
|
603 |
+
"Licen\u00e7a":"MIT",
|
604 |
+
"#Params (B)":0.18,
|
605 |
+
"Hub Likes":128,
|
606 |
+
"Dispon\u00edvel no hub":true,
|
607 |
+
"SHA do modelo":"main",
|
608 |
+
"M\u00e9dia Geral":0.7465021221,
|
609 |
+
"\u00c1rea M\u00e9dica":0.594656463,
|
610 |
+
"\u00c1rea do Direito":0.6710791004,
|
611 |
+
"Provas Militares":0.8314281273,
|
612 |
+
"Computa\u00e7\u00e3o":0.6823194837,
|
613 |
+
"Discurso de \u00d3dio":0.775550195,
|
614 |
+
"Economia e Contabilidade":0.8795739095,
|
615 |
+
"Sem\u00e2ntica e Infer\u00eancia":0.7241149481,
|
616 |
+
"Multidisciplinar":0.7525018865,
|
617 |
+
"Revalida":0.5792849969,
|
618 |
+
"MREX":0.6100279292,
|
619 |
+
"OAB":0.6985800324,
|
620 |
+
"ENAM":0.6435781684,
|
621 |
+
"AFA":0.7914821772,
|
622 |
+
"ITA":0.8881410294,
|
623 |
+
"IME":0.8146611755,
|
624 |
+
"POSCOMP":0.6519856198,
|
625 |
+
"OBI":0.7126533476,
|
626 |
+
"HateBR":0.8175383934,
|
627 |
+
"PT Hate Speech":0.7687241726,
|
628 |
+
"tweetSentBR":0.7403880189,
|
629 |
+
"BCB":0.8527339923,
|
630 |
+
"CFCES":0.9064138266,
|
631 |
+
"FAQUAD NLI":0.693047413,
|
632 |
+
"ASSIN2 RTE":0.6893290657,
|
633 |
+
"ASSIN2 STS":0.7899683657,
|
634 |
+
"ENEM":0.7604902451,
|
635 |
+
"BLUEX":0.8153250765,
|
636 |
+
"CNPU":0.6935039827,
|
637 |
+
"ENADE":0.7749131066,
|
638 |
+
"BNDES":0.6890146653,
|
639 |
+
"CACD (1\u00aa fase)":0.7499769597,
|
640 |
+
"CACD (2\u00aa fase)":0.7842891694,
|
641 |
+
"Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
|
642 |
+
"Datasets \u00c1rea do Direito":"OAB, ENAM",
|
643 |
+
"Datasets Provas Militares":"AFA, ITA, IME",
|
644 |
+
"Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
|
645 |
+
"Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
|
646 |
+
"Datasets Economia e Contabilidade":"BCB, CFCES",
|
647 |
+
"Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
|
648 |
+
"Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
|
649 |
+
}
|
650 |
+
]
|
output/leaderboard_data_20250413_002339.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a27686dc08775fc43f0b92365c1affd11dfc08026a41a926b2a0c3c22739807d
|
3 |
+
size 7463
|
output/leaderboard_data_20250413_002339.xlsx
ADDED
Binary file (12.3 kB). View file
|
|
output/leaderboard_info_20250413_002339.txt
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
DataFrame Shape: (12, 52)
|
2 |
+
|
3 |
+
Colunas:
|
4 |
+
- T
|
5 |
+
- Modelo
|
6 |
+
- Tipo
|
7 |
+
- Arquitetura
|
8 |
+
- Tipo de Peso
|
9 |
+
- Precisão
|
10 |
+
- Licença
|
11 |
+
- #Params (B)
|
12 |
+
- Hub Likes
|
13 |
+
- Disponível no hub
|
14 |
+
- SHA do modelo
|
15 |
+
- Média Geral
|
16 |
+
- Área Médica
|
17 |
+
- Área do Direito
|
18 |
+
- Provas Militares
|
19 |
+
- Computação
|
20 |
+
- Discurso de Ódio
|
21 |
+
- Economia e Contabilidade
|
22 |
+
- Semântica e Inferência
|
23 |
+
- Multidisciplinar
|
24 |
+
- Revalida
|
25 |
+
- MREX
|
26 |
+
- OAB
|
27 |
+
- ENAM
|
28 |
+
- AFA
|
29 |
+
- ITA
|
30 |
+
- IME
|
31 |
+
- POSCOMP
|
32 |
+
- OBI
|
33 |
+
- HateBR
|
34 |
+
- PT Hate Speech
|
35 |
+
- tweetSentBR
|
36 |
+
- BCB
|
37 |
+
- CFCES
|
38 |
+
- FAQUAD NLI
|
39 |
+
- ASSIN2 RTE
|
40 |
+
- ASSIN2 STS
|
41 |
+
- ENEM
|
42 |
+
- BLUEX
|
43 |
+
- CNPU
|
44 |
+
- ENADE
|
45 |
+
- BNDES
|
46 |
+
- CACD (1ª fase)
|
47 |
+
- CACD (2ª fase)
|
48 |
+
- Datasets Área Médica
|
49 |
+
- Datasets Área do Direito
|
50 |
+
- Datasets Provas Militares
|
51 |
+
- Datasets Computação
|
52 |
+
- Datasets Discurso de Ódio
|
53 |
+
- Datasets Economia e Contabilidade
|
54 |
+
- Datasets Semântica e Inferência
|
55 |
+
- Datasets Multidisciplinar
|
56 |
+
|
57 |
+
Informações por área:
|
58 |
+
|
59 |
+
Área Médica:
|
60 |
+
- Datasets: ['Revalida', 'MREX']
|
61 |
+
|
62 |
+
Área do Direito:
|
63 |
+
- Datasets: ['OAB', 'ENAM']
|
64 |
+
|
65 |
+
Provas Militares:
|
66 |
+
- Datasets: ['AFA', 'ITA', 'IME']
|
67 |
+
|
68 |
+
Computação:
|
69 |
+
- Datasets: ['POSCOMP', 'OBI']
|
70 |
+
|
71 |
+
Discurso de Ódio:
|
72 |
+
- Datasets: ['HateBR', 'PT Hate Speech', 'tweetSentBR']
|
73 |
+
|
74 |
+
Economia e Contabilidade:
|
75 |
+
- Datasets: ['BCB', 'CFCES']
|
76 |
+
|
77 |
+
Semântica e Inferência:
|
78 |
+
- Datasets: ['FAQUAD NLI', 'ASSIN2 RTE', 'ASSIN2 STS']
|
79 |
+
|
80 |
+
Multidisciplinar:
|
81 |
+
- Datasets: ['ENEM', 'BLUEX', 'CNPU', 'ENADE', 'BNDES', 'CACD (1ª fase)', 'CACD (2ª fase)']
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.ruff]
|
2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
3 |
+
select = ["E", "F"]
|
4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
5 |
+
line-length = 119
|
6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
7 |
+
|
8 |
+
[tool.isort]
|
9 |
+
profile = "black"
|
10 |
+
line_length = 119
|
11 |
+
|
12 |
+
[tool.black]
|
13 |
+
line-length = 119
|
requirements.txt
CHANGED
@@ -1,18 +1,16 @@
|
|
1 |
-
APScheduler
|
2 |
black
|
3 |
datasets
|
4 |
-
gradio
|
5 |
gradio[oauth]
|
6 |
-
|
7 |
gradio_client
|
8 |
-
|
9 |
matplotlib
|
10 |
-
numpy
|
11 |
-
pandas
|
12 |
-
python-dateutil
|
13 |
-
streamlit>=1.31.0
|
14 |
tqdm
|
15 |
-
transformers
|
16 |
tokenizers>=0.15.0
|
17 |
-
sentencepiece
|
18 |
-
safetensors>=0.4.0
|
|
|
1 |
+
APScheduler
|
2 |
black
|
3 |
datasets
|
4 |
+
gradio
|
5 |
gradio[oauth]
|
6 |
+
gradio_leaderboard==0.0.13
|
7 |
gradio_client
|
8 |
+
huggingface-hub>=0.18.0
|
9 |
matplotlib
|
10 |
+
numpy
|
11 |
+
pandas
|
12 |
+
python-dateutil
|
|
|
13 |
tqdm
|
14 |
+
transformers
|
15 |
tokenizers>=0.15.0
|
16 |
+
sentencepiece
|
|
src/about.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from enum import Enum
|
3 |
+
|
@dataclass
class Task:
    """One evaluated benchmark: the keys used to read its score from the
    results json files and the column name displayed in the leaderboard."""

    benchmark: str  # task key in the results json
    metric: str     # metric key in the results json
    col_name: str   # user-facing name of the leaderboard column
9 |
+
|
10 |
+
|
11 |
+
# Select your tasks here
|
12 |
+
# ---------------------------------------------------
|
class Tasks(Enum):
    """Enumeration of every benchmark on the leaderboard.

    Each member's value is Task(task_key, metric_key, display_name):
    the task key and metric key as they appear in the results json files,
    plus the column name shown in the leaderboard.
    """

    # Medical domain ("Área Médica")
    REVALIDA = Task("revalida", "acc", "Revalida")
    MREX = Task("mrex", "acc", "MREX")
    # Law domain ("Área do Direito")
    OAB = Task("oab", "acc", "OAB")
    ENAM = Task("enam", "acc", "ENAM")
    # Military entrance exams ("Provas Militares")
    AFA = Task("afa", "acc", "AFA")
    ITA = Task("ita", "acc", "ITA")
    IME = Task("ime", "acc", "IME")
    # Computer science ("Computação")
    POSCOMP = Task("poscomp", "acc", "POSCOMP")
    OBI = Task("obi", "acc", "OBI")
    # Hate speech ("Discurso de Ódio")
    HATEBR = Task("hatebr", "acc", "HateBR")
    PT_HATE_SPEECH = Task("pt_hate_speech", "acc", "PT Hate Speech")
    TWEETSENTBR = Task("tweetsentbr", "acc", "tweetSentBR")
    # Economics and accounting ("Economia e Contabilidade")
    BCB = Task("bcb", "acc", "BCB")
    CFCES = Task("cfces", "acc", "CFCES")
    # Semantics and textual inference ("Semântica e Inferência")
    FAQUAD_NLI = Task("faquad_nli", "acc", "FAQUAD NLI")
    ASSIN2_RTE = Task("assin2_rte", "acc", "ASSIN2 RTE")
    ASSIN2_STS = Task("assin2_sts", "acc", "ASSIN2 STS")
    # Multidisciplinary knowledge exams ("Multidisciplinar")
    ENEM = Task("enem", "acc", "ENEM")
    BLUEX = Task("bluex", "acc", "BLUEX")
    CNPU = Task("cnpu", "acc", "CNPU")
    ENADE = Task("enade", "acc", "ENADE")
    BNDES = Task("bndes", "acc", "BNDES")
    CACD_1 = Task("cacd_1", "acc", "CACD (1ª fase)")
    CACD_2 = Task("cacd_2", "acc", "CACD (2ª fase)")
47 |
+
|
48 |
+
|
49 |
+
NUM_FEWSHOT = 0 # Change with your few shot
|
50 |
+
# ---------------------------------------------------
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
# Your leaderboard name
|
55 |
+
TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
|
56 |
+
|
57 |
+
# What does your leaderboard evaluate?
|
58 |
+
INTRODUCTION_TEXT = """
|
59 |
+
Intro text
|
60 |
+
"""
|
61 |
+
|
62 |
+
# Which evaluations are you running? how can people reproduce what you have?
|
63 |
+
LLM_BENCHMARKS_TEXT = f"""
|
64 |
+
## How it works
|
65 |
+
|
66 |
+
## Reproducibility
|
67 |
+
To reproduce our results, here is the commands you can run:
|
68 |
+
|
69 |
+
"""
|
70 |
+
|
71 |
+
EVALUATION_QUEUE_TEXT = """
|
72 |
+
## Some good practices before submitting a model
|
73 |
+
|
74 |
+
### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
75 |
+
```python
|
76 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
77 |
+
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
78 |
+
model = AutoModel.from_pretrained("your model name", revision=revision)
|
79 |
+
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
80 |
+
```
|
81 |
+
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
82 |
+
|
83 |
+
Note: make sure your model is public!
|
84 |
+
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
85 |
+
|
86 |
+
### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
87 |
+
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
88 |
+
|
89 |
+
### 3) Make sure your model has an open license!
|
90 |
+
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
91 |
+
|
92 |
+
### 4) Fill up your model card
|
93 |
+
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
94 |
+
|
95 |
+
## In case of model failure
|
96 |
+
If your model is displayed in the `FAILED` category, its execution stopped.
|
97 |
+
Make sure you have followed the above steps first.
|
98 |
+
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
99 |
+
"""
|
100 |
+
|
101 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
102 |
+
CITATION_BUTTON_TEXT = r"""
|
103 |
+
"""
|
src/about.pyZone.Identifier
ADDED
File without changes
|
src/display/css_html_js.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
custom_css = """
|
2 |
+
|
3 |
+
.markdown-text {
|
4 |
+
font-size: 16px !important;
|
5 |
+
}
|
6 |
+
|
7 |
+
#models-to-add-text {
|
8 |
+
font-size: 18px !important;
|
9 |
+
}
|
10 |
+
|
11 |
+
#citation-button span {
|
12 |
+
font-size: 16px !important;
|
13 |
+
}
|
14 |
+
|
15 |
+
#citation-button textarea {
|
16 |
+
font-size: 16px !important;
|
17 |
+
}
|
18 |
+
|
19 |
+
#citation-button > label > button {
|
20 |
+
margin: 6px;
|
21 |
+
transform: scale(1.3);
|
22 |
+
}
|
23 |
+
|
24 |
+
#leaderboard-table {
|
25 |
+
margin-top: 15px
|
26 |
+
}
|
27 |
+
|
28 |
+
#leaderboard-table-lite {
|
29 |
+
margin-top: 15px
|
30 |
+
}
|
31 |
+
|
32 |
+
#search-bar-table-box > div:first-child {
|
33 |
+
background: none;
|
34 |
+
border: none;
|
35 |
+
}
|
36 |
+
|
37 |
+
#search-bar {
|
38 |
+
padding: 0px;
|
39 |
+
}
|
40 |
+
|
41 |
+
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
42 |
+
#leaderboard-table td:nth-child(2),
|
43 |
+
#leaderboard-table th:nth-child(2) {
|
44 |
+
max-width: 400px;
|
45 |
+
overflow: auto;
|
46 |
+
white-space: nowrap;
|
47 |
+
}
|
48 |
+
|
49 |
+
.tab-buttons button {
|
50 |
+
font-size: 20px;
|
51 |
+
}
|
52 |
+
|
53 |
+
#scale-logo {
|
54 |
+
border-style: none !important;
|
55 |
+
box-shadow: none;
|
56 |
+
display: block;
|
57 |
+
margin-left: auto;
|
58 |
+
margin-right: auto;
|
59 |
+
max-width: 600px;
|
60 |
+
}
|
61 |
+
|
62 |
+
#scale-logo .download {
|
63 |
+
display: none;
|
64 |
+
}
|
65 |
+
#filter_type{
|
66 |
+
border: 0;
|
67 |
+
padding-left: 0;
|
68 |
+
padding-top: 0;
|
69 |
+
}
|
70 |
+
#filter_type label {
|
71 |
+
display: flex;
|
72 |
+
}
|
73 |
+
#filter_type label > span{
|
74 |
+
margin-top: var(--spacing-lg);
|
75 |
+
margin-right: 0.5em;
|
76 |
+
}
|
77 |
+
#filter_type label > .wrap{
|
78 |
+
width: 103px;
|
79 |
+
}
|
80 |
+
#filter_type label > .wrap .wrap-inner{
|
81 |
+
padding: 2px;
|
82 |
+
}
|
83 |
+
#filter_type label > .wrap .wrap-inner input{
|
84 |
+
width: 1px
|
85 |
+
}
|
86 |
+
#filter-columns-type{
|
87 |
+
border:0;
|
88 |
+
padding:0.5;
|
89 |
+
}
|
90 |
+
#filter-columns-size{
|
91 |
+
border:0;
|
92 |
+
padding:0.5;
|
93 |
+
}
|
94 |
+
#box-filter > .form{
|
95 |
+
border: 0
|
96 |
+
}
|
97 |
+
"""
|
98 |
+
|
99 |
+
get_window_url_params = """
|
100 |
+
function(url_params) {
|
101 |
+
const params = new URLSearchParams(window.location.search);
|
102 |
+
url_params = Object.fromEntries(params);
|
103 |
+
return url_params;
|
104 |
+
}
|
105 |
+
"""
|
src/display/css_html_js.pyZone.Identifier
ADDED
File without changes
|
src/display/formatting.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def model_hyperlink(link, model_name):
    """Render *model_name* as an HTML anchor pointing at *link* (dotted underline style)."""
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
3 |
+
|
4 |
+
|
def make_clickable_model(model_name):
    """Turn a hub repo id into a clickable link to its huggingface.co model page."""
    hub_url = f"https://huggingface.co/{model_name}"
    return model_hyperlink(hub_url, model_name)
8 |
+
|
9 |
+
|
def styled_error(error):
    """Wrap an error message in red, centered paragraph markup for the UI."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
12 |
+
|
13 |
+
|
def styled_warning(warn):
    """Wrap a warning message in orange, centered paragraph markup for the UI."""
    opening = "<p style='color: orange; font-size: 20px; text-align: center;'>"
    return f"{opening}{warn}</p>"
16 |
+
|
17 |
+
|
def styled_message(message):
    """Wrap a success message in green, centered paragraph markup for the UI."""
    return "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>".format(message)
20 |
+
|
21 |
+
|
def has_no_nan_values(df, columns):
    """Row-wise boolean mask: True where none of *columns* holds a NaN."""
    # ~any(isna) is equivalent to all(notna)
    return ~df[columns].isna().any(axis=1)
24 |
+
|
25 |
+
|
def has_nan_values(df, columns):
    """Row-wise boolean mask: True where at least one of *columns* holds a NaN."""
    # ~all(notna) is equivalent to any(isna)
    return ~df[columns].notna().all(axis=1)
src/display/formatting.pyZone.Identifier
ADDED
File without changes
|
src/display/utils.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, make_dataclass
|
2 |
+
from enum import Enum
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from src.about import Tasks
|
7 |
+
|
def fields(raw_class):
    """Collect the values of a class's non-dunder attributes, in declaration order.

    Used on the dynamically built column classes to enumerate their
    ColumnContent entries.
    """
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
10 |
+
|
11 |
+
|
12 |
+
# These classes are for user facing column names,
|
13 |
+
# to avoid having to change them all around the code
|
14 |
+
# when a modif is needed
|
@dataclass
class ColumnContent:
    """Describes one leaderboard column: display name, cell type, and visibility flags."""

    name: str                   # user-facing column header
    type: str                   # cell type tag, e.g. "str", "number", "markdown", "bool"
    displayed_by_default: bool  # shown before the user toggles any column selector
    hidden: bool = False        # presumably excluded from the selector entirely — consumer not visible here
    never_hidden: bool = False  # always displayed (cannot be toggled off)
22 |
+
|
## Leaderboard columns
auto_eval_column_dict = []
# Init: identity columns that can never be hidden
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Média Geral ⬆️", "number", True)])

# Columns holding the average score of each knowledge area
auto_eval_column_dict.append(["area_medica_avg", ColumnContent, ColumnContent("Área Médica", "number", True)])
auto_eval_column_dict.append(["area_direito_avg", ColumnContent, ColumnContent("Área do Direito", "number", True)])
auto_eval_column_dict.append(["provas_militares_avg", ColumnContent, ColumnContent("Provas Militares", "number", True)])
auto_eval_column_dict.append(["computacao_avg", ColumnContent, ColumnContent("Computação", "number", True)])
auto_eval_column_dict.append(["discurso_odio_avg", ColumnContent, ColumnContent("Discurso de Ódio", "number", True)])
auto_eval_column_dict.append(["economia_contabilidade_avg", ColumnContent, ColumnContent("Economia e Contabilidade", "number", True)])
auto_eval_column_dict.append(["semantica_inferencia_avg", ColumnContent, ColumnContent("Semântica e Inferência", "number", True)])
auto_eval_column_dict.append(["multidisciplinar_avg", ColumnContent, ColumnContent("Multidisciplinar", "number", True)])

for task in Tasks:
    # displayed_by_default=False: per-task score columns stay hidden by default in the general tab
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False)])
# Model information
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

# We use make_dataclass to build AutoEvalColumn dynamically so the per-task
# score columns always stay in sync with the Tasks enum.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
56 |
+
|
# Maps each knowledge area to its corresponding Tasks members
AREA_DEFINITIONS = {
    "Área Médica": [Tasks.REVALIDA, Tasks.MREX],
    "Área do Direito": [Tasks.OAB, Tasks.ENAM],
    "Provas Militares": [Tasks.AFA, Tasks.ITA, Tasks.IME],
    "Computação": [Tasks.POSCOMP, Tasks.OBI],
    "Discurso de Ódio": [Tasks.HATEBR, Tasks.PT_HATE_SPEECH, Tasks.TWEETSENTBR],
    "Economia e Contabilidade": [Tasks.BCB, Tasks.CFCES],
    "Semântica e Inferência": [Tasks.FAQUAD_NLI, Tasks.ASSIN2_RTE, Tasks.ASSIN2_STS],
    "Multidisciplinar": [Tasks.ENEM, Tasks.BLUEX, Tasks.CNPU, Tasks.ENADE, Tasks.BNDES, Tasks.CACD_1, Tasks.CACD_2],
}

# Maps each area name to the name of its average-score column in AutoEvalColumn
AREA_AVG_COLUMN_MAP = {
    "Área Médica": AutoEvalColumn.area_medica_avg.name,
    "Área do Direito": AutoEvalColumn.area_direito_avg.name,
    "Provas Militares": AutoEvalColumn.provas_militares_avg.name,
    "Computação": AutoEvalColumn.computacao_avg.name,
    "Discurso de Ódio": AutoEvalColumn.discurso_odio_avg.name,
    "Economia e Contabilidade": AutoEvalColumn.economia_contabilidade_avg.name,
    "Semântica e Inferência": AutoEvalColumn.semantica_inferencia_avg.name,
    "Multidisciplinar": AutoEvalColumn.multidisciplinar_avg.name,
}
80 |
+
|
81 |
+
## For the queue columns in the submission tab
|
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns shown for entries in the submission-queue tab.

    Each attribute is a ColumnContent(name, type, displayed_by_default).
    """

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # Fix: `displayed_by_default` is a bool flag; the string "Original" was passed
    # here by mistake. "Original" is truthy, so True preserves the behavior.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
90 |
+
|
91 |
+
## All the model information that we might need
@dataclass
class ModelDetails:
    """Display metadata for a model category; used as the payload of the
    ModelType / WeightType / Precision enums below."""

    name: str
    display_name: str = ""
    symbol: str = ""  # emoji
|
99 |
+
class ModelType(Enum):
    """How a model was produced, rendered as an emoji plus a label."""

    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Render this type as '<symbol><separator><name>'."""
        details = self.value
        return f"{details.symbol}{separator}{details.name}"

    @staticmethod
    def from_str(type):
        """Parse a free-form string (label or emoji) back into a member.

        Unknown strings map to ModelType.Unknown.
        """
        # The lookup order mirrors the original if-chain: fine-tuned is
        # tested before pretrained, RL-tuned before instruction-tuned.
        lookup = (
            (ModelType.FT, ("fine-tuned", "🔶")),
            (ModelType.PT, ("pretrained", "🟢")),
            (ModelType.RL, ("RL-tuned", "🟦")),
            (ModelType.IFT, ("instruction-tuned", "⭕")),
        )
        for member, needles in lookup:
            if any(needle in type for needle in needles):
                return member
        return ModelType.Unknown
+
|
121 |
+
class WeightType(Enum):
    # How the submitted weights relate to a base model:
    # full weights (Original), an adapter on top (Adapter), or a diff (Delta).
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")
+
|
126 |
+
class Precision(Enum):
    """Numeric precision a model was evaluated in."""

    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        """Map a dtype string (e.g. "torch.float16" or "float16") to a member.

        Marked @staticmethod for consistency with ModelType.from_str; without
        it, calling via an instance would mis-bind `precision` to the member.
        Unrecognized strings map to Precision.Unknown.
        """
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown
+
|
138 |
+
# Column selection
# Names of all leaderboard columns that are not marked hidden.
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

# Names and dtypes of the submission-queue table columns.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# One display column per benchmark task.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+
|
src/display/utils.pyZone.Identifier
ADDED
File without changes
|
src/envs.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

# Hub repositories used by this leaderboard.
REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"      # submission request files (dataset repo)
RESULTS_REPO = f"{OWNER}/results"     # evaluation result files (dataset repo)

# If you setup a cache later, just change HF_HOME
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Shared Hub API client; TOKEN may be None when running locally without auth.
API = HfApi(token=TOKEN)
|
src/envs.pyZone.Identifier
ADDED
File without changes
|
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
import math
|
4 |
+
import os
|
5 |
+
from dataclasses import dataclass
|
6 |
+
|
7 |
+
import dateutil
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from src.display.formatting import make_clickable_model
|
11 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
12 |
+
from src.submission.check_validity import is_model_on_hub
|
13 |
+
|
14 |
+
|
15 |
+
@dataclass
class EvalResult:
    """Represents one full evaluation.

    Built from a combination of the result file (scores) and the request file
    (submission metadata) for a given run.
    """

    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: str
    model: str
    revision: str  # commit hash, "" if main
    results: dict
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Inits the result from the specific model result file.

        Only the scores present in this file are filled in; submission
        metadata is added later via `update_with_request_file`.
        (Fix: the classmethod's first parameter was misleadingly named `self`.)
        """
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config")

        # Precision
        precision = Precision.from_str(config.get("model_dtype"))

        # Get model and org ("org/model" on the hub; the org part may be absent)
        org_and_model = config.get("model_name", config.get("model_args", None))
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        still_on_hub, _, model_config = is_model_on_hub(
            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Extract results available in this file (some results are split in several files)
        results = {}
        for task in Tasks:
            task = task.value

            # We average all scores of a given metric (not all metrics are present in all files)
            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
            if accs.size == 0 or any(acc is None for acc in accs):
                continue

            mean_acc = np.mean(accs) * 100.0
            results[task.benchmark] = mean_acc

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
        )

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it."""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
        except Exception:
            # Best-effort: missing/unreadable request files leave the defaults.
            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display.

        Raises KeyError when a task score is missing (used by callers to
        filter out incomplete evals).
        """
        # NOTE: the denominator is the total task count, so models missing
        # tasks would be penalized; populate.get_leaderboard_df recomputes
        # the average column afterwards anyway.
        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict
|
134 |
+
|
135 |
+
def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED."""
    pattern = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    candidates = sorted(glob.glob(pattern), reverse=True)

    # Scan name-descending; the last FINISHED file with the requested
    # precision wins (same selection as the original loop).
    selected = ""
    for candidate in candidates:
        with open(candidate, "r") as f:
            content = json.load(f)
        right_status = content["status"] in ["FINISHED"]
        right_precision = content["precision"] == precision.split(".")[-1]
        if right_status and right_precision:
            selected = candidate
    return selected
|
156 |
+
|
157 |
+
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results."""
    json_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        if not files or any(not name.endswith(".json") for name in files):
            continue

        # Sort the files by the date embedded in their names
        try:
            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except dateutil.parser._parser.ParserError:
            files = [files[-1]]

        json_filepaths.extend(os.path.join(root, name) for name in files)

    evals_by_name = {}
    for filepath in json_filepaths:
        # Creation of result
        result = EvalResult.init_from_json_file(filepath)
        result.update_with_request_file(requests_path)

        # Merge partial results belonging to the same eval run
        existing = evals_by_name.get(result.eval_name)
        if existing is None:
            evals_by_name[result.eval_name] = result
        else:
            existing.results.update({k: v for k, v in result.results.items() if v is not None})

    complete = []
    for candidate in evals_by_name.values():
        try:
            candidate.to_dict()  # we test if the dict version is complete
        except KeyError:  # not all eval values present
            continue
        complete.append(candidate)

    return complete
|
src/leaderboard/read_evals.pyZone.Identifier
ADDED
File without changes
|
src/populate.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
+
from src.display.utils import AutoEvalColumn, EvalQueueColumn, AREA_DEFINITIONS, AREA_AVG_COLUMN_MAP
|
8 |
+
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
+
from src.about import Tasks
|
10 |
+
|
11 |
+
|
12 |
+
def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results.

    Adds one average column per knowledge area, a global average (mean of the
    area averages), sorts by the global average and keeps only the declared
    display columns.
    """
    from dataclasses import fields  # fix: `fields` was used below but never imported

    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)

    # Per-area averages. Fix: the dataframe columns are the tasks' display
    # names (`task.value.col_name`, see EvalResult.to_dict), not the enum
    # member names, so match on col_name here.
    for area_name, tasks_in_area in AREA_DEFINITIONS.items():
        area_cols = [task.value.col_name for task in tasks_in_area if task.value.col_name in df.columns]
        avg_col_name = AREA_AVG_COLUMN_MAP[area_name]
        if area_cols:  # only average when the area has columns in the dataframe
            df[avg_col_name] = df[area_cols].mean(axis=1)
        else:
            df[avg_col_name] = np.nan

    # Global average, computed over the per-area averages
    avg_area_cols = list(AREA_AVG_COLUMN_MAP.values())
    df[AutoEvalColumn.average.name] = df[avg_area_cols].mean(axis=1)

    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)

    # Keep only declared display columns (in declaration order) and round
    all_display_cols = [c.name for c in fields(AutoEvalColumn)]
    df = df[[col for col in all_display_cols if col in df.columns]]
    df = df.round(decimals=2)

    # Filtering rows with NaN benchmark values was intentionally disabled:
    # df = df[has_no_nan_values(df, [t.name for t in Tasks])]

    return df
+
|
45 |
+
|
46 |
+
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests.

    Returns (finished, running, pending) dataframes, each restricted to `cols`.
    """
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder (one per submitting organisation)
            folder_path = os.path.join(save_path, entry)
            # Fix: os.path.isfile(e) tested the bare filename against the
            # current working directory, so sub-entries were always skipped.
            sub_entries = [
                e
                for e in os.listdir(folder_path)
                if os.path.isfile(os.path.join(folder_path, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(folder_path, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    # Split by status into the three queue tables
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
|
src/populate.pyZone.Identifier
ADDED
File without changes
|
src/submission/check_validity.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
from collections import defaultdict
|
5 |
+
from datetime import datetime, timedelta, timezone
|
6 |
+
|
7 |
+
import huggingface_hub
|
8 |
+
from huggingface_hub import ModelCard
|
9 |
+
from huggingface_hub.hf_api import ModelInfo
|
10 |
+
from transformers import AutoConfig
|
11 |
+
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
12 |
+
|
13 |
+
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Checks if the model card and license exist and have been filled."""
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata: either `license`, or a name/link pair.
    has_license_pair = "license_name" in card.data and "license_link" in card.data
    if card.data.license is None and not has_license_pair:
        return False, (
            "License not found. Please add a license to your model card using the `license` metadata or a"
            " `license_name`/`license_link` pair."
        )

    # Enforce card content: require a minimal description length.
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""
+
|
34 |
+
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
    # Load the config first; failures here decide hub availability.
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
    except ValueError:
        # Custom-code models need trust_remote_code=True, which we refuse
        # for automatic submissions.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None,
        )
    except Exception:
        return False, "was not found on hub!", None

    # Optionally verify the tokenizer loads as well.
    if test_tokenizer:
        try:
            AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
        except ValueError as e:
            return (
                False,
                f"uses a tokenizer which is not in a transformers release: {e}",
                None,
            )
        except Exception:
            return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)

    return True, None, config
+
|
61 |
+
|
62 |
+
def get_model_size(model_info: "ModelInfo", precision: str):
    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
    try:
        size_in_b = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
        return 0

    # GPTQ-quantized checkpoints store fewer bytes per parameter, hence x8.
    is_gptq = precision == "GPTQ" or "gptq" in model_info.modelId.lower()
    size_factor = 8 if is_gptq else 1
    return size_factor * size_in_b
+
|
73 |
+
def get_model_arch(model_info: "ModelInfo"):
    """Gets the model architecture from the configuration."""
    # Falls back to "Unknown" when the config carries no architectures key.
    return model_info.config.get("architectures", "Unknown")
+
|
77 |
+
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict]:
    """Gather a list of already submitted models to avoid duplicates.

    Fix: the return annotation said `set[str]` but the function returns a
    2-tuple: a set of "model_revision_precision" uids and a mapping from
    organisation name to its submission timestamps.
    """
    depth = 1  # request files live one level below the root (one folder per org)
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)
                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

                # Select organisation (skip models without an org prefix or timestamp)
                if info["model"].count("/") == 0 or "submitted_time" not in info:
                    continue
                organisation, _ = info["model"].split("/")
                users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
|
src/submission/check_validity.pyZone.Identifier
ADDED
File without changes
|
src/submission/submit.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from datetime import datetime, timezone
|
4 |
+
|
5 |
+
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
+
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
+
from src.submission.check_validity import (
|
8 |
+
already_submitted_models,
|
9 |
+
check_model_card,
|
10 |
+
get_model_size,
|
11 |
+
is_model_on_hub,
|
12 |
+
)
|
13 |
+
|
14 |
+
# Lazy-loaded caches of prior submissions (filled on first add_new_eval call).
REQUESTED_MODELS = None  # set of "model_revision_precision" uids
USERS_TO_SUBMISSION_DATES = None  # org name -> list of submission timestamps

def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
):
    """Validate a leaderboard submission and push a PENDING request file to the queue repo.

    Returns a styled HTML message (success, warning, or error) for the UI.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    # Split "org/model" into the submitting user and the bare model name.
    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    precision = precision.split(" ")[0]  # keep only the dtype token from the UI label
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    # Is the model on the hub?
    if weight_type in ["Delta", "Adapter"]:
        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
        if not base_model_on_hub:
            return styled_error(f'Base model "{base_model}" {error}')

    if not weight_type == "Adapter":
        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
        if not model_on_hub:
            return styled_error(f'Model "{model}" {error}')

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    model_size = get_model_size(model_info=model_info, precision=precision)

    # Were the model card and license filled?
    try:
        license = model_info.cardData["license"]
    except Exception:
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": license,
        "private": False,
    }

    # Check for duplicate submission
    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    # NOTE(review): assumes EVAL_REQUESTS_PATH contains "eval-queue/" so the
    # split below yields the in-repo path — confirm if CACHE_PATH layout changes.
    API.upload_file(
        path_or_fileobj=out_path,
        path_in_repo=out_path.split("eval-queue/")[1],
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model} to eval queue",
    )

    # Remove the local file
    os.remove(out_path)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )
|
src/submission/submit.pyZone.Identifier
ADDED
File without changes
|