LucasLima committed on
Commit ac500fb · verified · 1 Parent(s): 087d1c0

Upload 34 files

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
+ auto_evals/
+ venv/
+ __pycache__/
+ .env
+ .ipynb_checkpoints
+ *ipynb
+ .vscode/
+
+ eval-queue/
+ eval-results/
+ eval-queue-bk/
+ eval-results-bk/
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ default_language_version:
+   python: python3
+
+ ci:
+   autofix_prs: true
+   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+   autoupdate_schedule: quarterly
+
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.3.0
+     hooks:
+       - id: check-yaml
+       - id: check-case-conflict
+       - id: detect-private-key
+       - id: check-added-large-files
+         args: ['--maxkb=1000']
+       - id: requirements-txt-fixer
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+
+   - repo: https://github.com/PyCQA/isort
+     rev: 5.12.0
+     hooks:
+       - id: isort
+         name: Format imports
+
+   - repo: https://github.com/psf/black
+     rev: 22.12.0
+     hooks:
+       - id: black
+         name: Format code
+         additional_dependencies: ['click==8.0.2']
+
+   - repo: https://github.com/charliermarsh/ruff-pre-commit
+     # Ruff version.
+     rev: 'v0.0.267'
+     hooks:
+       - id: ruff
Classificação dos Dataset.txt ADDED
@@ -0,0 +1,12 @@
+ Dataset Classification
+
+ The datasets below are grouped into the categories used by the leaderboard; the Portuguese category names are kept because they match the leaderboard column labels (a code sketch of this mapping follows the list):
+
+ Área Médica (Medical): Revalida, MREX
+ Área do Direito (Law): OAB, ENAM
+ Provas Militares (Military Exams): AFA, ITA, IME
+ Computação (Computing): POSCOMP, OBI
+ Discurso de Ódio (Hate Speech): HateBR, PT Hate Speech, tweetSentBR
+ Economia e Contabilidade (Economics and Accounting): BCB, CFCES
+ Compreensão de Semântica e Inferência Textual (Semantics and Textual Inference): FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS
+ Provas de Conhecimento Multidisciplinar (Multidisciplinary Knowledge Exams): ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)
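+
+ For reference, app.py consumes this grouping through the AREA_DEFINITIONS mapping imported from src/display/utils.py. A hypothetical sketch of that mapping, using plain strings here (the real module maps each area to Task entries):
+
+ AREA_DEFINITIONS = {
+     "Área Médica": ["Revalida", "MREX"],
+     "Área do Direito": ["OAB", "ENAM"],
+     "Provas Militares": ["AFA", "ITA", "IME"],
+     "Computação": ["POSCOMP", "OBI"],
+     "Discurso de Ódio": ["HateBR", "PT Hate Speech", "tweetSentBR"],
+     "Economia e Contabilidade": ["BCB", "CFCES"],
+     "Compreensão de Semântica e Inferência Textual": ["FAQUAD NLI", "ASSIN2 RTE", "ASSIN2 STS"],
+     "Provas de Conhecimento Multidisciplinar": ["ENEM", "BLUEX", "CNPU", "ENADE", "BNDES", "CACD (1ª fase)", "CACD (2ª fase)"],
+ }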
Makefile ADDED
@@ -0,0 +1,13 @@
+ .PHONY: style quality
+
+
+ style:
+ 	python -m black --line-length 119 .
+ 	python -m isort .
+ 	ruff check --fix .
+
+
+ quality:
+ 	python -m black --check --line-length 119 .
+ 	python -m isort --check-only .
+ 	ruff check .
README.md ADDED
@@ -0,0 +1,46 @@
+ ---
+ title: Cemig
+ emoji: 🥇
+ colorFrom: green
+ colorTo: indigo
+ sdk: gradio
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ short_description: Teste para criação de uma leaderboard
+ sdk_version: 5.19.0
+ ---
+
+ # Start the configuration
+
+ Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+
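+ In the stock template, each task is declared in the `Tasks` enum in `src/about.py` (app.py imports it from there); a minimal sketch with the template's default benchmarks — swap in your own:
+
+ ```python
+ from dataclasses import dataclass
+ from enum import Enum
+
+ @dataclass
+ class Task:
+     benchmark: str  # key under "results" in the result files
+     metric: str     # which metric to read for that task
+     col_name: str   # column header shown in the leaderboard
+
+ class Tasks(Enum):
+     # template defaults; replace with your own datasets
+     task0 = Task("anli_r1", "acc", "ANLI")
+     task1 = Task("logiqa", "acc_norm", "LogiQA")
+ ```
+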
+ Results files should have the following format and be stored as json files:
+ ```json
+ {
+     "config": {
+         "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+         "model_name": "path of the model on the hub: org/model",
+         "model_sha": "revision on the hub",
+     },
+     "results": {
+         "task_name": {
+             "metric_name": score,
+         },
+         "task_name2": {
+             "metric_name": score,
+         }
+     }
+ }
+ ```
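+
+ For a quick sanity check, such a file can be flattened into one table row with pandas; a minimal sketch (the file path is hypothetical):
+
+ ```python
+ import json
+
+ import pandas as pd
+
+ # hypothetical file name; real results live in the results dataset repo
+ with open("results_org__model.json") as f:
+     data = json.load(f)
+
+ row = dict(data["config"])  # model_dtype, model_name, model_sha
+ for task, metrics in data["results"].items():
+     for metric, score in metrics.items():
+         row[task] = score  # one column per task metric
+
+ df = pd.DataFrame([row])
+ print(df)
+ ```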
+
+ Request files are created automatically by this tool.
+
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+ # Code logic for more complex edits
+
+ You'll find:
+ - the main table's column names and properties in `src/display/utils.py`
+ - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,275 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+ import numpy as np
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     TITLE,
+     Tasks
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision,
+     AREA_DEFINITIONS,
+     AREA_AVG_COLUMN_MAP
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception as e:
+     print(f"Erro ao baixar EVAL_REQUESTS: {e}")
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception as e:
+     print(f"Erro ao baixar EVAL_RESULTS: {e}")
+
+ ALL_COLS = [c.name for c in fields(AutoEvalColumn)]
+
+ try:
+     LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ALL_COLS)
+ except Exception as e:
+     print(f"Erro ao gerar o DataFrame do Leaderboard: {e}")
+     LEADERBOARD_DF = pd.DataFrame()
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+ def create_leaderboard_component(dataframe, displayed_cols, hidden_cols=None, cant_deselect_cols=None, title=None):
+     if dataframe is None or dataframe.empty:
+         return gr.Markdown(f"## {title or ''}\nNão há dados para exibir.")
+
+     if hidden_cols is None:
+         hidden_cols = []
+     if cant_deselect_cols is None:
+         cant_deselect_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
+
+     all_required_cols = set(displayed_cols) | set(hidden_cols) | set(cant_deselect_cols) | {AutoEvalColumn.model_type.name, AutoEvalColumn.precision.name, AutoEvalColumn.params.name, AutoEvalColumn.still_on_hub.name}
+     available_cols = [col for col in all_required_cols if col in dataframe.columns]
+     filtered_df = dataframe[available_cols].copy()
+
+     for col in cant_deselect_cols:
+         if col not in filtered_df.columns:
+             filtered_df[col] = np.nan
+
+     return Leaderboard(
+         value=filtered_df,
+         datatype=[c.type for c in fields(AutoEvalColumn) if c.name in filtered_df.columns],
+         select_columns=SelectColumns(
+             default_selection=displayed_cols,
+             cant_deselect=cant_deselect_cols,
+             label="Selecionar Colunas para Exibir:",
+         ),
+         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name] if AutoEvalColumn.license.name in filtered_df.columns else [AutoEvalColumn.model.name],
+         hide_columns=[c for c in hidden_cols if c in filtered_df.columns],
+         filter_columns=[f for f in [
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Tipos de Modelo") if AutoEvalColumn.model_type.name in filtered_df.columns else None,
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precisão") if AutoEvalColumn.precision.name in filtered_df.columns else None,
+             ColumnFilter(
+                 AutoEvalColumn.params.name,
+                 type="slider",
+                 min=0.01,
+                 max=max(150, filtered_df[AutoEvalColumn.params.name].max() if AutoEvalColumn.params.name in filtered_df.columns and not filtered_df[AutoEvalColumn.params.name].empty else 150),
+                 label="Selecionar número de parâmetros (B)",
+             ) if AutoEvalColumn.params.name in filtered_df.columns else None,
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deletado/incompleto", default=True
+             ) if AutoEvalColumn.still_on_hub.name in filtered_df.columns else None,
+         ] if f is not None],  # drop the Nones left by missing columns
+         bool_checkboxgroup_label="Ocultar modelos",
+         interactive=False,
+     )
+
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("📊 Benchmark Geral", id=0):
+             general_cols_to_display = [
+                 AutoEvalColumn.model_type_symbol.name,
+                 AutoEvalColumn.model.name,
+                 AutoEvalColumn.average.name,
+             ] + list(AREA_AVG_COLUMN_MAP.values())
+
+             general_hidden_cols = [task.name for task in Tasks] + [
+                 AutoEvalColumn.model_type.name,
+                 AutoEvalColumn.architecture.name,
+                 AutoEvalColumn.weight_type.name,
+                 AutoEvalColumn.precision.name,
+                 AutoEvalColumn.license.name,
+                 AutoEvalColumn.params.name,
+                 AutoEvalColumn.likes.name,
+                 AutoEvalColumn.still_on_hub.name,
+                 AutoEvalColumn.revision.name
+             ]
+
+             create_leaderboard_component(
+                 LEADERBOARD_DF,
+                 displayed_cols=general_cols_to_display,
+                 hidden_cols=general_hidden_cols,
+                 title="Benchmark Geral"
+             )
+
+         tab_index = 1
+         for area_name, tasks_in_area in AREA_DEFINITIONS.items():
+             with gr.TabItem(f"🎓 {area_name}", id=tab_index):
+                 area_cols_to_display = [
+                     AutoEvalColumn.model_type_symbol.name,
+                     AutoEvalColumn.model.name,
+                 ] + [task.name for task in tasks_in_area]
+
+                 area_hidden_cols = list(AREA_AVG_COLUMN_MAP.values()) + [
+                     task.name for task in Tasks if task not in tasks_in_area
+                 ] + [
+                     AutoEvalColumn.model_type.name,
+                     AutoEvalColumn.architecture.name,
+                     AutoEvalColumn.weight_type.name,
+                     AutoEvalColumn.precision.name,
+                     AutoEvalColumn.license.name,
+                     AutoEvalColumn.params.name,
+                     AutoEvalColumn.likes.name,
+                     AutoEvalColumn.still_on_hub.name,
+                     AutoEvalColumn.revision.name
+                 ]
+
+                 create_leaderboard_component(
+                     LEADERBOARD_DF,
+                     displayed_cols=area_cols_to_display,
+                     hidden_cols=[col for col in area_hidden_cols if col != AutoEvalColumn.average.name],
+                     title=area_name
+                 )
+             tab_index += 1
+
+         with gr.TabItem("🚀 Submit aqui!", id=tab_index):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Avaliações Concluídas ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Fila de Avaliação em Execução ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Fila de Avaliação Pendente ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submeta seu modelo aqui!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Nome do Modelo")
+                     revision_name_textbox = gr.Textbox(label="Commit da Revisão", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Tipo do Modelo",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precisão",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Tipo dos Pesos",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Modelo Base (para pesos delta ou adapter)")
+
+             submit_button = gr.Button("Submeter Avaliação")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citação", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()
leaderboard_funcionamento.txt ADDED
@@ -0,0 +1,72 @@
+ # How the Model Evaluation Leaderboard Works
+
+ ## Overview
+ This leaderboard is a web application built with Gradio for evaluating, comparing, and submitting language models against specific benchmarks. The system is hosted on HuggingFace Spaces and offers an interactive interface for viewing model evaluation results across different tasks.
+
+ ## Application Structure
+
+ ### Main Tabs
+ 1. **🏅 LLM Benchmark** - Main tab displaying the ranking table of evaluated models
+ 2. **📝 About** - Information about the leaderboard, its methodology, and how it works
+ 3. **🚀 Submit here!** - Interface for users to submit their own models for evaluation
+
+ ### Leaderboard Filtering Features
+ The ranking table offers the following filtering options:
+
+ - **Selectable Columns** - Choose which metrics and information to display
+ - **Model Type Filters** - Filter by model category:
+   - 🟢 Pretrained
+   - 🔶 Fine-tuned
+   - ⭕ Instruction-tuned
+   - 🟦 RL-tuned (tuned via reinforcement learning)
+
+ - **Precision Filters** - Filter by weight format:
+   - float16
+   - bfloat16
+
+ - **Parameter Filter** - Slider for filtering by parameter count (0.01B - 150B)
+ - **Availability Filter** - Option to hide deleted or incomplete models
+ - **Model/License Search** - Text search field for finding specific models
+
+ ## Metrics and Benchmarks
+ The leaderboard evaluates models on specific benchmarks:
+ - ANLI (Adversarial Natural Language Inference)
+ - LogiQA (logical reasoning)
+
+ The final score is computed as the mean of the results across all evaluated tasks.
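+
+ A minimal sketch of that average, with hypothetical scores:
+
+ ```python
+ # the overall score is the plain mean of the per-task scores
+ task_scores = {"ANLI": 0.71, "LogiQA": 0.64}  # illustrative values
+ media_geral = sum(task_scores.values()) / len(task_scores)
+ print(f"{media_geral:.3f}")  # 0.675
+ ```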
+
+ ## Submission System
+ Users can submit their models for evaluation through the submission form, which includes (a sketch of the resulting request record follows this list):
+
+ 1. **Model Information:**
+    - Model name (in organization/model format)
+    - Specific revision/commit
+    - Model type (pretrained, fine-tuned, etc.)
+    - Precision (float16, bfloat16)
+    - Weight type (Original, Adapter, Delta)
+    - Base model (for delta or adapter weights)
+
+ 2. **Evaluation Queues:**
+    - ✅ Finished Evaluations
+    - 🔄 Running Evaluations
+    - ⏳ Pending Evaluations
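+
+ A hypothetical sketch of the request record assembled from the form fields above (the actual field names live in `src/submission/submit.py`):
+
+ ```python
+ # illustrative shape only -- built from the submission form fields
+ eval_request = {
+     "model": "organization/model",
+     "revision": "main",
+     "model_type": "pretrained",
+     "precision": "float16",
+     "weight_type": "Original",
+     "base_model": "",          # only for delta/adapter weights
+     "status": "PENDING",       # then RUNNING, then FINISHED
+ }
+ ```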
+
+ ## Submission Requirements
+ Submitted models must:
+ 1. Be loadable through the Hugging Face Auto classes (see the sketch after this list)
+ 2. Preferably use the safetensors format for weight storage
+ 3. Have an open license
+ 4. Have a properly filled-in model card
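+
+ A minimal sketch of the loadability check (the real logic lives in `src/submission/check_validity.py`):
+
+ ```python
+ from transformers import AutoConfig
+
+ def is_loadable(model_name: str, revision: str = "main") -> bool:
+     """True if the model config resolves via the HF Auto classes."""
+     try:
+         AutoConfig.from_pretrained(model_name, revision=revision)
+         return True
+     except Exception:
+         return False
+ ```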
+
+ ## Backend and Storage
+ The leaderboard uses:
+ - HuggingFace repositories to store evaluation results and requests
+ - HuggingFace datasets to manage the evaluation queues
+ - A periodic refresh mechanism to keep the data up to date
+
+ ## Technical Details
+ - Implemented with Gradio for the interface
+ - Uses pandas for data handling and display
+ - The specialized gradio_leaderboard component renders the table
+ - The interface restarts automatically every 30 minutes
+ - HF token authentication for managing the repositories
output/leaderboard_data_20250413_002202.csv ADDED
@@ -0,0 +1,13 @@
+ T,Modelo,Tipo,Arquitetura,Tipo de Peso,Precisão,Licença,#Params (B),Hub Likes,Disponível no hub,SHA do modelo,Média Geral,Área Médica,Área do Direito,Provas Militares,Computação,Discurso de Ódio,Economia e Contabilidade,Semântica e Inferência,Multidisciplinar,Revalida,MREX,OAB,ENAM,AFA,ITA,IME,POSCOMP,OBI,HateBR,PT Hate Speech,tweetSentBR,BCB,CFCES,FAQUAD NLI,ASSIN2 RTE,ASSIN2 STS,ENEM,BLUEX,CNPU,ENADE,BNDES,CACD (1ª fase),CACD (2ª fase),Datasets Área Médica,Datasets Área do Direito,Datasets Provas Militares,Datasets Computação,Datasets Discurso de Ódio,Datasets Economia e Contabilidade,Datasets Semântica e Inferência,Datasets Multidisciplinar
+ PT,openai/gpt2-portuguese,PT : pré-treinado,,Original,float16,MIT,0.12,268,True,42b7792,0.7105925230941055,0.6188847305300255,0.6701955871546674,0.5883600439376051,0.7344674503873334,0.7475962540883628,0.849576998841669,0.7788317408159661,0.7090867005579059,0.6035585626832006,0.6342108983768503,0.6592469269015914,0.6811442474077435,0.5748457759326684,0.5677492084978396,0.6224851473823073,0.7001717151455216,0.7687631856291454,0.7670186275780199,0.7059318581743916,0.7698382765126768,0.894231731105497,0.8049222665778408,0.7802498017547961,0.744894322166147,0.811351098526955,0.6944001027581704,0.7122388959938982,0.710017915718404,0.7325868004943001,0.6521346635913674,0.7253374839411425,0.7368910414080588,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,rufimelo/bert-large-portuguese-cased,PT : pré-treinado,,Original,bfloat16,MIT,0.34,96,True,b1f4531,0.7663174449695491,0.7602091078571434,0.973711984625266,0.9684507475761012,0.6871687003745621,0.8875668825625945,0.680483320210085,0.7734089183287836,0.6143132582475183,0.8009095654537163,0.7195086502605705,0.99,0.957423969250532,0.9253522427283033,0.99,0.99,0.7048405210681232,0.669496879681001,0.884580080804517,0.8725119298929598,0.9056086369903068,0.6574021331921888,0.7035645072279809,0.7812621953082739,0.7898121521378834,0.7491524075401934,0.6587752839863983,0.6568440899210536,0.5762508105837758,0.5817111877078663,0.6622223097972831,0.6028837938647416,0.5615053318715086,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,unicamp-dl/mbert-portuguese-lener,FT : fine-tuned,,Original,float16,Apache-2.0,0.11,89,True,a764b32,0.5238981255673918,0.46910271433534434,0.6478804077379579,0.4764435027720609,0.6348258377007074,0.5679570688955583,0.545621864602512,0.672668649364131,0.4039265288241587,0.4501996677234772,0.4880057609472114,0.6284122869968649,0.6673485284790508,0.4831702853633705,0.4947379598495716,0.4514222631032407,0.5966991470611256,0.6729525283402894,0.5739315630212218,0.5841033582970794,0.5458362853683737,0.5869583065081572,0.5042854226968669,0.6817845836938061,0.7232768140967074,0.6129445503018796,0.3861249155636754,0.3784890609935717,0.4222874061015168,0.3739044853763201,0.4392823188665487,0.4131461427810349,0.41425137208644314,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ RL,brasileira/llama-2-7b-pt,RL : RL-tuned,,Original,bfloat16,LLAMA 2,7.0,562,True,c24dd37,0.7190013380059581,0.99,0.6277621780492408,0.6710016620169995,0.8100443731646181,0.7199915914170014,0.8883611805594716,0.6228298935870216,0.6546038867904668,0.99,0.99,0.6037868088474728,0.651737547251009,0.7314858380256698,0.6415017399609673,0.6400174080643614,0.8332219046699918,0.7868668416592445,0.7390035975198623,0.737371682844131,0.6835994938870111,0.920605738500256,0.8561166226186872,0.6392000772840843,0.6328464029942036,0.5964432004827769,0.6811458854492797,0.7087276670361912,0.5953130091726074,0.6393706576325692,0.6943992143544933,0.6474751086467531,0.6157956652413729,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,neuralmind/bert-base-portuguese-cased,PT : pré-treinado,,Original,float16,MIT,0.11,153,True,main,0.7903320522382491,0.7525068067882859,0.790212164999971,0.5993185824512812,0.8281524838360852,0.8274517991360355,0.6240401489565062,0.7446158736697582,0.9234267541121511,0.731003703849237,0.7740099097273346,0.7285398327624035,0.8518844972375387,0.566841753054601,0.591417134883207,0.6396968594160356,0.8389798308453639,0.8173251368268065,0.9010121557045129,0.8171834652635162,0.7641597764400775,0.6784864262708603,0.5695938716421521,0.7647064598634123,0.7170997425103216,0.7520414186355404,0.9543505337931829,0.9315956953962545,0.8767306588367528,0.9090563746651121,0.9204153379699338,0.9424590333325819,0.9293796447912399,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,tulioandrade/mistral-7b-pt-adapter,IFT : instruction-tuned,,Adapter,bfloat16,Apache-2.0,7.2,315,True,main,0.7029740896600845,0.5349382711992174,0.6692960782120554,0.5175172583982789,0.6031745463143081,0.5873322504231425,0.6299480416296037,0.9302316794025772,0.8416315303513803,0.5461004402959746,0.5237761021024602,0.6390972174687846,0.6994949389553262,0.5231758805454357,0.5044300377533704,0.5249458568960303,0.6106598540949609,0.5956892385336552,0.5658410795439047,0.5897717901156058,0.6063838816099171,0.6203659080880489,0.6395301751711585,0.9446092137517881,0.9014725977356242,0.9446132267203191,0.8016106913774061,0.8798065244263629,0.8270460513988156,0.886454918928402,0.7981520314515067,0.8865213636168642,0.8118291312603048,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,PetroNLP/xlm-roberta-large-portuguese-instruct,IFT : instruction-tuned,,Original,bfloat16,Apache-2.0,0.56,173,True,8a67c19,0.6169427195732342,0.5533574226765212,0.4822753750865628,0.7312559564874549,0.8708286748117062,0.7547818164225487,0.7539473601359346,0.4607262875406695,0.5207879175691644,0.5183490470430046,0.588365798310038,0.47998532940130334,0.4845654207718223,0.6801757920914254,0.7463653840531308,0.7672266933178084,0.891268718690119,0.8503886309332934,0.7692787720488455,0.6844533933473784,0.8106132838714223,0.7078948291402046,0.7999998911316645,0.5057436202874445,0.45824193776195243,0.4181933045726116,0.5408312211672078,0.48116084507473167,0.500552102499582,0.5444194567570633,0.5195728015292668,0.5294734338791449,0.5295055620771545,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pucpr/biobertpt-bio,FT : fine-tuned,,Original,float16,CC-BY-SA-4.0,0.11,47,True,ab2d4b9,0.6108657601152154,0.702988363094768,0.6872008974588979,0.7613994162478939,0.5591628508904611,0.5708983915370802,0.6267255976277573,0.5200744268291007,0.5645008743970255,0.7228867322736113,0.6830899939159248,0.6957087204307917,0.678693074487004,0.8375613035639826,0.7426540780752489,0.70398286710445,0.5558048959754675,0.5625208058054548,0.5322247419674755,0.5780859026067205,0.6023845300370447,0.6241177598442514,0.6293334354112632,0.5662521904860494,0.49510038883517177,0.49887070116608073,0.6043572001868638,0.6034587214831894,0.5179199478458284,0.5128404234538338,0.5440709445935807,0.6134068257246169,0.5554520574912651,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,ai-forever/gpt-pequeno-pt,IFT : instruction-tuned,,Original,float16,MIT,1.3,409,True,main,0.75196128994191,0.8670933989159613,0.8348443783260642,0.7223937364955204,0.5604037920638382,0.7773722062427244,0.8326558639935904,0.8139561288452296,0.7022728396080725,0.937260492607467,0.7969263052244557,0.8233168492437696,0.8463719074083589,0.7605339427733732,0.7120343105252683,0.6946129561879192,0.5626662966116917,0.5581412875159847,0.8322948812545272,0.7351328585423854,0.7646888789312607,0.7818810820254256,0.8834306459617549,0.8026792301025911,0.8560128858812226,0.7831762705518751,0.6721405607195572,0.6766199465199311,0.7002599041017649,0.7020562275544762,0.7412478076487755,0.7257566032842915,0.6978288274277119,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,saramago/roberta-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.13,112,True,main,0.9312283237428517,0.6179539103982901,0.99,0.9578333048317907,0.9130800909989987,0.8935172820223088,0.9548532555906575,0.9745700292057768,0.9885633439532743,0.6180836055639037,0.6178242152326763,0.99,0.99,0.9647166009011852,0.9449254684441978,0.9638578451499892,0.9581953587133843,0.8679648232846131,0.9161393680272095,0.8602959374608022,0.9041165405789148,0.9449285867327771,0.9647779244485377,0.99,0.99,0.9437100876173303,0.9799434076729195,0.99,0.99,0.99,0.99,0.99,0.99,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pierreguillou/bert-base-brpt-clinical,FT : fine-tuned,,Original,float16,MIT,0.11,73,True,c7bef2a,0.5482271789287327,0.4041051952506239,0.6969643618569595,0.6095811315978034,0.6695079353203791,0.6304290306986099,0.4762540627043566,0.5236257225581472,0.48184021849446684,0.37785374915437375,0.43035664134687407,0.6946451229482293,0.69928360076569,0.6208503603101987,0.5800215298462423,0.6278715046369693,0.7053490347771434,0.6336668358636147,0.6278205554878714,0.6237519629728817,0.6397145736350766,0.49769596156757845,0.4548121638411348,0.49858536577398616,0.537775323953586,0.5345164779468695,0.4616904995777591,0.46535002756586913,0.49571926994757104,0.4725184614551914,0.4771470703267811,0.48038288044301486,0.5200733201450809,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,nlp-wyldlab/deberta-v3-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.18,128,True,main,0.7465021220826226,0.5946564630491511,0.6710791004110548,0.8314281273352276,0.6823194837016078,0.7755501949635453,0.8795739094780679,0.7241149481320521,0.7525018864872434,0.5792849969048782,0.6100279291934239,0.6985800323961385,0.6435781684259713,0.7914821771702616,0.8881410293755269,0.8146611754598944,0.6519856198221405,0.712653347581075,0.8175383934276091,0.7687241725532503,0.7403880189097766,0.8527339923251589,0.906413826630977,0.6930474129852162,0.6893290656821874,0.7899683657287526,0.7604902451021204,0.815325076521949,0.6935039827485028,0.7749131065853034,0.6890146652904909,0.7499769597205909,0.7842891694417463,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
output/leaderboard_data_20250413_002339.csv ADDED
@@ -0,0 +1,13 @@
+ T,Modelo,Tipo,Arquitetura,Tipo de Peso,Precisão,Licença,#Params (B),Hub Likes,Disponível no hub,SHA do modelo,Média Geral,Área Médica,Área do Direito,Provas Militares,Computação,Discurso de Ódio,Economia e Contabilidade,Semântica e Inferência,Multidisciplinar,Revalida,MREX,OAB,ENAM,AFA,ITA,IME,POSCOMP,OBI,HateBR,PT Hate Speech,tweetSentBR,BCB,CFCES,FAQUAD NLI,ASSIN2 RTE,ASSIN2 STS,ENEM,BLUEX,CNPU,ENADE,BNDES,CACD (1ª fase),CACD (2ª fase),Datasets Área Médica,Datasets Área do Direito,Datasets Provas Militares,Datasets Computação,Datasets Discurso de Ódio,Datasets Economia e Contabilidade,Datasets Semântica e Inferência,Datasets Multidisciplinar
+ PT,openai/gpt2-portuguese,PT : pré-treinado,,Original,float16,MIT,0.12,268,True,42b7792,0.7105925230941055,0.6188847305300255,0.6701955871546674,0.5883600439376051,0.7344674503873334,0.7475962540883628,0.849576998841669,0.7788317408159661,0.7090867005579059,0.6035585626832006,0.6342108983768503,0.6592469269015914,0.6811442474077435,0.5748457759326684,0.5677492084978396,0.6224851473823073,0.7001717151455216,0.7687631856291454,0.7670186275780199,0.7059318581743916,0.7698382765126768,0.894231731105497,0.8049222665778408,0.7802498017547961,0.744894322166147,0.811351098526955,0.6944001027581704,0.7122388959938982,0.710017915718404,0.7325868004943001,0.6521346635913674,0.7253374839411425,0.7368910414080588,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,rufimelo/bert-large-portuguese-cased,PT : pré-treinado,,Original,bfloat16,MIT,0.34,96,True,b1f4531,0.7663174449695491,0.7602091078571434,0.973711984625266,0.9684507475761012,0.6871687003745621,0.8875668825625945,0.680483320210085,0.7734089183287836,0.6143132582475183,0.8009095654537163,0.7195086502605705,0.99,0.957423969250532,0.9253522427283033,0.99,0.99,0.7048405210681232,0.669496879681001,0.884580080804517,0.8725119298929598,0.9056086369903068,0.6574021331921888,0.7035645072279809,0.7812621953082739,0.7898121521378834,0.7491524075401934,0.6587752839863983,0.6568440899210536,0.5762508105837758,0.5817111877078663,0.6622223097972831,0.6028837938647416,0.5615053318715086,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,unicamp-dl/mbert-portuguese-lener,FT : fine-tuned,,Original,float16,Apache-2.0,0.11,89,True,a764b32,0.5238981255673918,0.46910271433534434,0.6478804077379579,0.4764435027720609,0.6348258377007074,0.5679570688955583,0.545621864602512,0.672668649364131,0.4039265288241587,0.4501996677234772,0.4880057609472114,0.6284122869968649,0.6673485284790508,0.4831702853633705,0.4947379598495716,0.4514222631032407,0.5966991470611256,0.6729525283402894,0.5739315630212218,0.5841033582970794,0.5458362853683737,0.5869583065081572,0.5042854226968669,0.6817845836938061,0.7232768140967074,0.6129445503018796,0.3861249155636754,0.3784890609935717,0.4222874061015168,0.3739044853763201,0.4392823188665487,0.4131461427810349,0.41425137208644314,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ RL,brasileira/llama-2-7b-pt,RL : RL-tuned,,Original,bfloat16,LLAMA 2,7.0,562,True,c24dd37,0.7190013380059581,0.99,0.6277621780492408,0.6710016620169995,0.8100443731646181,0.7199915914170014,0.8883611805594716,0.6228298935870216,0.6546038867904668,0.99,0.99,0.6037868088474728,0.651737547251009,0.7314858380256698,0.6415017399609673,0.6400174080643614,0.8332219046699918,0.7868668416592445,0.7390035975198623,0.737371682844131,0.6835994938870111,0.920605738500256,0.8561166226186872,0.6392000772840843,0.6328464029942036,0.5964432004827769,0.6811458854492797,0.7087276670361912,0.5953130091726074,0.6393706576325692,0.6943992143544933,0.6474751086467531,0.6157956652413729,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,neuralmind/bert-base-portuguese-cased,PT : pré-treinado,,Original,float16,MIT,0.11,153,True,main,0.7903320522382491,0.7525068067882859,0.790212164999971,0.5993185824512812,0.8281524838360852,0.8274517991360355,0.6240401489565062,0.7446158736697582,0.9234267541121511,0.731003703849237,0.7740099097273346,0.7285398327624035,0.8518844972375387,0.566841753054601,0.591417134883207,0.6396968594160356,0.8389798308453639,0.8173251368268065,0.9010121557045129,0.8171834652635162,0.7641597764400775,0.6784864262708603,0.5695938716421521,0.7647064598634123,0.7170997425103216,0.7520414186355404,0.9543505337931829,0.9315956953962545,0.8767306588367528,0.9090563746651121,0.9204153379699338,0.9424590333325819,0.9293796447912399,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,tulioandrade/mistral-7b-pt-adapter,IFT : instruction-tuned,,Adapter,bfloat16,Apache-2.0,7.2,315,True,main,0.7029740896600845,0.5349382711992174,0.6692960782120554,0.5175172583982789,0.6031745463143081,0.5873322504231425,0.6299480416296037,0.9302316794025772,0.8416315303513803,0.5461004402959746,0.5237761021024602,0.6390972174687846,0.6994949389553262,0.5231758805454357,0.5044300377533704,0.5249458568960303,0.6106598540949609,0.5956892385336552,0.5658410795439047,0.5897717901156058,0.6063838816099171,0.6203659080880489,0.6395301751711585,0.9446092137517881,0.9014725977356242,0.9446132267203191,0.8016106913774061,0.8798065244263629,0.8270460513988156,0.886454918928402,0.7981520314515067,0.8865213636168642,0.8118291312603048,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,PetroNLP/xlm-roberta-large-portuguese-instruct,IFT : instruction-tuned,,Original,bfloat16,Apache-2.0,0.56,173,True,8a67c19,0.6169427195732342,0.5533574226765212,0.4822753750865628,0.7312559564874549,0.8708286748117062,0.7547818164225487,0.7539473601359346,0.4607262875406695,0.5207879175691644,0.5183490470430046,0.588365798310038,0.47998532940130334,0.4845654207718223,0.6801757920914254,0.7463653840531308,0.7672266933178084,0.891268718690119,0.8503886309332934,0.7692787720488455,0.6844533933473784,0.8106132838714223,0.7078948291402046,0.7999998911316645,0.5057436202874445,0.45824193776195243,0.4181933045726116,0.5408312211672078,0.48116084507473167,0.500552102499582,0.5444194567570633,0.5195728015292668,0.5294734338791449,0.5295055620771545,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pucpr/biobertpt-bio,FT : fine-tuned,,Original,float16,CC-BY-SA-4.0,0.11,47,True,ab2d4b9,0.6108657601152154,0.702988363094768,0.6872008974588979,0.7613994162478939,0.5591628508904611,0.5708983915370802,0.6267255976277573,0.5200744268291007,0.5645008743970255,0.7228867322736113,0.6830899939159248,0.6957087204307917,0.678693074487004,0.8375613035639826,0.7426540780752489,0.70398286710445,0.5558048959754675,0.5625208058054548,0.5322247419674755,0.5780859026067205,0.6023845300370447,0.6241177598442514,0.6293334354112632,0.5662521904860494,0.49510038883517177,0.49887070116608073,0.6043572001868638,0.6034587214831894,0.5179199478458284,0.5128404234538338,0.5440709445935807,0.6134068257246169,0.5554520574912651,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,ai-forever/gpt-pequeno-pt,IFT : instruction-tuned,,Original,float16,MIT,1.3,409,True,main,0.75196128994191,0.8670933989159613,0.8348443783260642,0.7223937364955204,0.5604037920638382,0.7773722062427244,0.8326558639935904,0.8139561288452296,0.7022728396080725,0.937260492607467,0.7969263052244557,0.8233168492437696,0.8463719074083589,0.7605339427733732,0.7120343105252683,0.6946129561879192,0.5626662966116917,0.5581412875159847,0.8322948812545272,0.7351328585423854,0.7646888789312607,0.7818810820254256,0.8834306459617549,0.8026792301025911,0.8560128858812226,0.7831762705518751,0.6721405607195572,0.6766199465199311,0.7002599041017649,0.7020562275544762,0.7412478076487755,0.7257566032842915,0.6978288274277119,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,saramago/roberta-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.13,112,True,main,0.9312283237428517,0.6179539103982901,0.99,0.9578333048317907,0.9130800909989987,0.8935172820223088,0.9548532555906575,0.9745700292057768,0.9885633439532743,0.6180836055639037,0.6178242152326763,0.99,0.99,0.9647166009011852,0.9449254684441978,0.9638578451499892,0.9581953587133843,0.8679648232846131,0.9161393680272095,0.8602959374608022,0.9041165405789148,0.9449285867327771,0.9647779244485377,0.99,0.99,0.9437100876173303,0.9799434076729195,0.99,0.99,0.99,0.99,0.99,0.99,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pierreguillou/bert-base-brpt-clinical,FT : fine-tuned,,Original,float16,MIT,0.11,73,True,c7bef2a,0.5482271789287327,0.4041051952506239,0.6969643618569595,0.6095811315978034,0.6695079353203791,0.6304290306986099,0.4762540627043566,0.5236257225581472,0.48184021849446684,0.37785374915437375,0.43035664134687407,0.6946451229482293,0.69928360076569,0.6208503603101987,0.5800215298462423,0.6278715046369693,0.7053490347771434,0.6336668358636147,0.6278205554878714,0.6237519629728817,0.6397145736350766,0.49769596156757845,0.4548121638411348,0.49858536577398616,0.537775323953586,0.5345164779468695,0.4616904995777591,0.46535002756586913,0.49571926994757104,0.4725184614551914,0.4771470703267811,0.48038288044301486,0.5200733201450809,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,nlp-wyldlab/deberta-v3-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.18,128,True,main,0.7465021220826226,0.5946564630491511,0.6710791004110548,0.8314281273352276,0.6823194837016078,0.7755501949635453,0.8795739094780679,0.7241149481320521,0.7525018864872434,0.5792849969048782,0.6100279291934239,0.6985800323961385,0.6435781684259713,0.7914821771702616,0.8881410293755269,0.8146611754598944,0.6519856198221405,0.712653347581075,0.8175383934276091,0.7687241725532503,0.7403880189097766,0.8527339923251589,0.906413826630977,0.6930474129852162,0.6893290656821874,0.7899683657287526,0.7604902451021204,0.815325076521949,0.6935039827485028,0.7749131065853034,0.6890146652904909,0.7499769597205909,0.7842891694417463,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
output/leaderboard_data_20250413_002339.json ADDED
@@ -0,0 +1,650 @@
+ [
+   {
+     "T":"PT",
+     "Modelo":"openai\/gpt2-portuguese",
+     "Tipo":"PT : pr\u00e9-treinado",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"MIT",
+     "#Params (B)":0.12,
+     "Hub Likes":268,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"42b7792",
+     "M\u00e9dia Geral":0.7105925231,
+     "\u00c1rea M\u00e9dica":0.6188847305,
+     "\u00c1rea do Direito":0.6701955872,
+     "Provas Militares":0.5883600439,
+     "Computa\u00e7\u00e3o":0.7344674504,
+     "Discurso de \u00d3dio":0.7475962541,
+     "Economia e Contabilidade":0.8495769988,
+     "Sem\u00e2ntica e Infer\u00eancia":0.7788317408,
+     "Multidisciplinar":0.7090867006,
+     "Revalida":0.6035585627,
+     "MREX":0.6342108984,
+     "OAB":0.6592469269,
+     "ENAM":0.6811442474,
+     "AFA":0.5748457759,
+     "ITA":0.5677492085,
+     "IME":0.6224851474,
+     "POSCOMP":0.7001717151,
+     "OBI":0.7687631856,
+     "HateBR":0.7670186276,
+     "PT Hate Speech":0.7059318582,
+     "tweetSentBR":0.7698382765,
+     "BCB":0.8942317311,
+     "CFCES":0.8049222666,
+     "FAQUAD NLI":0.7802498018,
+     "ASSIN2 RTE":0.7448943222,
+     "ASSIN2 STS":0.8113510985,
+     "ENEM":0.6944001028,
+     "BLUEX":0.712238896,
+     "CNPU":0.7100179157,
+     "ENADE":0.7325868005,
+     "BNDES":0.6521346636,
+     "CACD (1\u00aa fase)":0.7253374839,
+     "CACD (2\u00aa fase)":0.7368910414,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"PT",
+     "Modelo":"rufimelo\/bert-large-portuguese-cased",
+     "Tipo":"PT : pr\u00e9-treinado",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"MIT",
+     "#Params (B)":0.34,
+     "Hub Likes":96,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"b1f4531",
+     "M\u00e9dia Geral":0.766317445,
+     "\u00c1rea M\u00e9dica":0.7602091079,
+     "\u00c1rea do Direito":0.9737119846,
+     "Provas Militares":0.9684507476,
+     "Computa\u00e7\u00e3o":0.6871687004,
+     "Discurso de \u00d3dio":0.8875668826,
+     "Economia e Contabilidade":0.6804833202,
+     "Sem\u00e2ntica e Infer\u00eancia":0.7734089183,
+     "Multidisciplinar":0.6143132582,
+     "Revalida":0.8009095655,
+     "MREX":0.7195086503,
+     "OAB":0.99,
+     "ENAM":0.9574239693,
+     "AFA":0.9253522427,
+     "ITA":0.99,
+     "IME":0.99,
+     "POSCOMP":0.7048405211,
+     "OBI":0.6694968797,
+     "HateBR":0.8845800808,
+     "PT Hate Speech":0.8725119299,
+     "tweetSentBR":0.905608637,
+     "BCB":0.6574021332,
+     "CFCES":0.7035645072,
+     "FAQUAD NLI":0.7812621953,
+     "ASSIN2 RTE":0.7898121521,
+     "ASSIN2 STS":0.7491524075,
+     "ENEM":0.658775284,
+     "BLUEX":0.6568440899,
+     "CNPU":0.5762508106,
+     "ENADE":0.5817111877,
+     "BNDES":0.6622223098,
+     "CACD (1\u00aa fase)":0.6028837939,
+     "CACD (2\u00aa fase)":0.5615053319,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"FT",
+     "Modelo":"unicamp-dl\/mbert-portuguese-lener",
+     "Tipo":"FT : fine-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"Apache-2.0",
+     "#Params (B)":0.11,
+     "Hub Likes":89,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"a764b32",
+     "M\u00e9dia Geral":0.5238981256,
+     "\u00c1rea M\u00e9dica":0.4691027143,
+     "\u00c1rea do Direito":0.6478804077,
+     "Provas Militares":0.4764435028,
+     "Computa\u00e7\u00e3o":0.6348258377,
+     "Discurso de \u00d3dio":0.5679570689,
+     "Economia e Contabilidade":0.5456218646,
+     "Sem\u00e2ntica e Infer\u00eancia":0.6726686494,
+     "Multidisciplinar":0.4039265288,
+     "Revalida":0.4501996677,
+     "MREX":0.4880057609,
+     "OAB":0.628412287,
+     "ENAM":0.6673485285,
+     "AFA":0.4831702854,
+     "ITA":0.4947379598,
+     "IME":0.4514222631,
+     "POSCOMP":0.5966991471,
+     "OBI":0.6729525283,
+     "HateBR":0.573931563,
+     "PT Hate Speech":0.5841033583,
+     "tweetSentBR":0.5458362854,
+     "BCB":0.5869583065,
+     "CFCES":0.5042854227,
+     "FAQUAD NLI":0.6817845837,
+     "ASSIN2 RTE":0.7232768141,
+     "ASSIN2 STS":0.6129445503,
+     "ENEM":0.3861249156,
+     "BLUEX":0.378489061,
+     "CNPU":0.4222874061,
+     "ENADE":0.3739044854,
+     "BNDES":0.4392823189,
+     "CACD (1\u00aa fase)":0.4131461428,
+     "CACD (2\u00aa fase)":0.4142513721,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"RL",
+     "Modelo":"brasileira\/llama-2-7b-pt",
+     "Tipo":"RL : RL-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"LLAMA 2",
+     "#Params (B)":7.0,
+     "Hub Likes":562,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"c24dd37",
+     "M\u00e9dia Geral":0.719001338,
+     "\u00c1rea M\u00e9dica":0.99,
+     "\u00c1rea do Direito":0.627762178,
+     "Provas Militares":0.671001662,
+     "Computa\u00e7\u00e3o":0.8100443732,
+     "Discurso de \u00d3dio":0.7199915914,
+     "Economia e Contabilidade":0.8883611806,
+     "Sem\u00e2ntica e Infer\u00eancia":0.6228298936,
+     "Multidisciplinar":0.6546038868,
+     "Revalida":0.99,
+     "MREX":0.99,
+     "OAB":0.6037868088,
+     "ENAM":0.6517375473,
+     "AFA":0.731485838,
+     "ITA":0.64150174,
+     "IME":0.6400174081,
+     "POSCOMP":0.8332219047,
+     "OBI":0.7868668417,
+     "HateBR":0.7390035975,
+     "PT Hate Speech":0.7373716828,
+     "tweetSentBR":0.6835994939,
+     "BCB":0.9206057385,
+     "CFCES":0.8561166226,
+     "FAQUAD NLI":0.6392000773,
+     "ASSIN2 RTE":0.632846403,
+     "ASSIN2 STS":0.5964432005,
+     "ENEM":0.6811458854,
+     "BLUEX":0.708727667,
+     "CNPU":0.5953130092,
+     "ENADE":0.6393706576,
+     "BNDES":0.6943992144,
+     "CACD (1\u00aa fase)":0.6474751086,
+     "CACD (2\u00aa fase)":0.6157956652,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"PT",
+     "Modelo":"neuralmind\/bert-base-portuguese-cased",
+     "Tipo":"PT : pr\u00e9-treinado",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"MIT",
+     "#Params (B)":0.11,
+     "Hub Likes":153,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"main",
+     "M\u00e9dia Geral":0.7903320522,
+     "\u00c1rea M\u00e9dica":0.7525068068,
+     "\u00c1rea do Direito":0.790212165,
+     "Provas Militares":0.5993185825,
+     "Computa\u00e7\u00e3o":0.8281524838,
+     "Discurso de \u00d3dio":0.8274517991,
+     "Economia e Contabilidade":0.624040149,
+     "Sem\u00e2ntica e Infer\u00eancia":0.7446158737,
+     "Multidisciplinar":0.9234267541,
+     "Revalida":0.7310037038,
+     "MREX":0.7740099097,
+     "OAB":0.7285398328,
+     "ENAM":0.8518844972,
+     "AFA":0.5668417531,
+     "ITA":0.5914171349,
+     "IME":0.6396968594,
+     "POSCOMP":0.8389798308,
+     "OBI":0.8173251368,
+     "HateBR":0.9010121557,
+     "PT Hate Speech":0.8171834653,
+     "tweetSentBR":0.7641597764,
+     "BCB":0.6784864263,
+     "CFCES":0.5695938716,
+     "FAQUAD NLI":0.7647064599,
+     "ASSIN2 RTE":0.7170997425,
+     "ASSIN2 STS":0.7520414186,
+     "ENEM":0.9543505338,
+     "BLUEX":0.9315956954,
+     "CNPU":0.8767306588,
+     "ENADE":0.9090563747,
+     "BNDES":0.920415338,
+     "CACD (1\u00aa fase)":0.9424590333,
+     "CACD (2\u00aa fase)":0.9293796448,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"IFT",
+     "Modelo":"tulioandrade\/mistral-7b-pt-adapter",
+     "Tipo":"IFT : instruction-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Adapter",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"Apache-2.0",
+     "#Params (B)":7.2,
+     "Hub Likes":315,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"main",
+     "M\u00e9dia Geral":0.7029740897,
+     "\u00c1rea M\u00e9dica":0.5349382712,
+     "\u00c1rea do Direito":0.6692960782,
+     "Provas Militares":0.5175172584,
+     "Computa\u00e7\u00e3o":0.6031745463,
+     "Discurso de \u00d3dio":0.5873322504,
+     "Economia e Contabilidade":0.6299480416,
+     "Sem\u00e2ntica e Infer\u00eancia":0.9302316794,
+     "Multidisciplinar":0.8416315304,
+     "Revalida":0.5461004403,
+     "MREX":0.5237761021,
+     "OAB":0.6390972175,
+     "ENAM":0.699494939,
+     "AFA":0.5231758805,
+     "ITA":0.5044300378,
+     "IME":0.5249458569,
+     "POSCOMP":0.6106598541,
+     "OBI":0.5956892385,
+     "HateBR":0.5658410795,
+     "PT Hate Speech":0.5897717901,
+     "tweetSentBR":0.6063838816,
+     "BCB":0.6203659081,
+     "CFCES":0.6395301752,
+     "FAQUAD NLI":0.9446092138,
+     "ASSIN2 RTE":0.9014725977,
+     "ASSIN2 STS":0.9446132267,
+     "ENEM":0.8016106914,
+     "BLUEX":0.8798065244,
+     "CNPU":0.8270460514,
+     "ENADE":0.8864549189,
+     "BNDES":0.7981520315,
+     "CACD (1\u00aa fase)":0.8865213636,
+     "CACD (2\u00aa fase)":0.8118291313,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"IFT",
+     "Modelo":"PetroNLP\/xlm-roberta-large-portuguese-instruct",
+     "Tipo":"IFT : instruction-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"Apache-2.0",
+     "#Params (B)":0.56,
+     "Hub Likes":173,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"8a67c19",
+     "M\u00e9dia Geral":0.6169427196,
+     "\u00c1rea M\u00e9dica":0.5533574227,
+     "\u00c1rea do Direito":0.4822753751,
+     "Provas Militares":0.7312559565,
+     "Computa\u00e7\u00e3o":0.8708286748,
+     "Discurso de \u00d3dio":0.7547818164,
+     "Economia e Contabilidade":0.7539473601,
+     "Sem\u00e2ntica e Infer\u00eancia":0.4607262875,
+     "Multidisciplinar":0.5207879176,
+     "Revalida":0.518349047,
+     "MREX":0.5883657983,
+     "OAB":0.4799853294,
+     "ENAM":0.4845654208,
+     "AFA":0.6801757921,
+     "ITA":0.7463653841,
+     "IME":0.7672266933,
+     "POSCOMP":0.8912687187,
+     "OBI":0.8503886309,
+     "HateBR":0.769278772,
+     "PT Hate Speech":0.6844533933,
+     "tweetSentBR":0.8106132839,
+     "BCB":0.7078948291,
+     "CFCES":0.7999998911,
+     "FAQUAD NLI":0.5057436203,
+     "ASSIN2 RTE":0.4582419378,
+     "ASSIN2 STS":0.4181933046,
+     "ENEM":0.5408312212,
+     "BLUEX":0.4811608451,
+     "CNPU":0.5005521025,
+     "ENADE":0.5444194568,
+     "BNDES":0.5195728015,
+     "CACD (1\u00aa fase)":0.5294734339,
+     "CACD (2\u00aa fase)":0.5295055621,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"FT",
+     "Modelo":"pucpr\/biobertpt-bio",
+     "Tipo":"FT : fine-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"CC-BY-SA-4.0",
+     "#Params (B)":0.11,
+     "Hub Likes":47,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"ab2d4b9",
+     "M\u00e9dia Geral":0.6108657601,
+     "\u00c1rea M\u00e9dica":0.7029883631,
+     "\u00c1rea do Direito":0.6872008975,
+     "Provas Militares":0.7613994162,
+     "Computa\u00e7\u00e3o":0.5591628509,
+     "Discurso de \u00d3dio":0.5708983915,
+     "Economia e Contabilidade":0.6267255976,
+     "Sem\u00e2ntica e Infer\u00eancia":0.5200744268,
+     "Multidisciplinar":0.5645008744,
+     "Revalida":0.7228867323,
+     "MREX":0.6830899939,
+     "OAB":0.6957087204,
+     "ENAM":0.6786930745,
+     "AFA":0.8375613036,
+     "ITA":0.7426540781,
+     "IME":0.7039828671,
+     "POSCOMP":0.555804896,
+     "OBI":0.5625208058,
+     "HateBR":0.532224742,
+     "PT Hate Speech":0.5780859026,
+     "tweetSentBR":0.60238453,
413
+ "BCB":0.6241177598,
414
+ "CFCES":0.6293334354,
415
+ "FAQUAD NLI":0.5662521905,
416
+ "ASSIN2 RTE":0.4951003888,
417
+ "ASSIN2 STS":0.4988707012,
418
+ "ENEM":0.6043572002,
419
+ "BLUEX":0.6034587215,
420
+ "CNPU":0.5179199478,
421
+ "ENADE":0.5128404235,
422
+ "BNDES":0.5440709446,
423
+ "CACD (1\u00aa fase)":0.6134068257,
424
+ "CACD (2\u00aa fase)":0.5554520575,
425
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
426
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
427
+ "Datasets Provas Militares":"AFA, ITA, IME",
428
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
429
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
430
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
431
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
432
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
433
+ },
434
+ {
435
+ "T":"IFT",
436
+ "Modelo":"ai-forever\/gpt-pequeno-pt",
437
+ "Tipo":"IFT : instruction-tuned",
438
+ "Arquitetura":"",
439
+ "Tipo de Peso":"Original",
440
+ "Precis\u00e3o":"float16",
441
+ "Licen\u00e7a":"MIT",
442
+ "#Params (B)":1.3,
443
+ "Hub Likes":409,
444
+ "Dispon\u00edvel no hub":true,
445
+ "SHA do modelo":"main",
446
+ "M\u00e9dia Geral":0.7519612899,
447
+ "\u00c1rea M\u00e9dica":0.8670933989,
448
+ "\u00c1rea do Direito":0.8348443783,
449
+ "Provas Militares":0.7223937365,
450
+ "Computa\u00e7\u00e3o":0.5604037921,
451
+ "Discurso de \u00d3dio":0.7773722062,
452
+ "Economia e Contabilidade":0.832655864,
453
+ "Sem\u00e2ntica e Infer\u00eancia":0.8139561288,
454
+ "Multidisciplinar":0.7022728396,
455
+ "Revalida":0.9372604926,
456
+ "MREX":0.7969263052,
457
+ "OAB":0.8233168492,
458
+ "ENAM":0.8463719074,
459
+ "AFA":0.7605339428,
460
+ "ITA":0.7120343105,
461
+ "IME":0.6946129562,
462
+ "POSCOMP":0.5626662966,
463
+ "OBI":0.5581412875,
464
+ "HateBR":0.8322948813,
465
+ "PT Hate Speech":0.7351328585,
466
+ "tweetSentBR":0.7646888789,
467
+ "BCB":0.781881082,
468
+ "CFCES":0.883430646,
469
+ "FAQUAD NLI":0.8026792301,
470
+ "ASSIN2 RTE":0.8560128859,
471
+ "ASSIN2 STS":0.7831762706,
472
+ "ENEM":0.6721405607,
473
+ "BLUEX":0.6766199465,
474
+ "CNPU":0.7002599041,
475
+ "ENADE":0.7020562276,
476
+ "BNDES":0.7412478076,
477
+ "CACD (1\u00aa fase)":0.7257566033,
478
+ "CACD (2\u00aa fase)":0.6978288274,
479
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
480
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
481
+ "Datasets Provas Militares":"AFA, ITA, IME",
482
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
483
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
484
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
485
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
486
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
487
+ },
488
+ {
489
+ "T":"PT",
490
+ "Modelo":"saramago\/roberta-base-portuguese",
491
+ "Tipo":"PT : pr\u00e9-treinado",
492
+ "Arquitetura":"",
493
+ "Tipo de Peso":"Original",
494
+ "Precis\u00e3o":"float16",
495
+ "Licen\u00e7a":"MIT",
496
+ "#Params (B)":0.13,
497
+ "Hub Likes":112,
498
+ "Dispon\u00edvel no hub":true,
499
+ "SHA do modelo":"main",
500
+ "M\u00e9dia Geral":0.9312283237,
501
+ "\u00c1rea M\u00e9dica":0.6179539104,
502
+ "\u00c1rea do Direito":0.99,
503
+ "Provas Militares":0.9578333048,
504
+ "Computa\u00e7\u00e3o":0.913080091,
505
+ "Discurso de \u00d3dio":0.893517282,
506
+ "Economia e Contabilidade":0.9548532556,
507
+ "Sem\u00e2ntica e Infer\u00eancia":0.9745700292,
508
+ "Multidisciplinar":0.988563344,
509
+ "Revalida":0.6180836056,
510
+ "MREX":0.6178242152,
511
+ "OAB":0.99,
512
+ "ENAM":0.99,
513
+ "AFA":0.9647166009,
514
+ "ITA":0.9449254684,
515
+ "IME":0.9638578451,
516
+ "POSCOMP":0.9581953587,
517
+ "OBI":0.8679648233,
518
+ "HateBR":0.916139368,
519
+ "PT Hate Speech":0.8602959375,
520
+ "tweetSentBR":0.9041165406,
521
+ "BCB":0.9449285867,
522
+ "CFCES":0.9647779244,
523
+ "FAQUAD NLI":0.99,
524
+ "ASSIN2 RTE":0.99,
525
+ "ASSIN2 STS":0.9437100876,
526
+ "ENEM":0.9799434077,
527
+ "BLUEX":0.99,
528
+ "CNPU":0.99,
529
+ "ENADE":0.99,
530
+ "BNDES":0.99,
531
+ "CACD (1\u00aa fase)":0.99,
532
+ "CACD (2\u00aa fase)":0.99,
533
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
534
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
535
+ "Datasets Provas Militares":"AFA, ITA, IME",
536
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
537
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
538
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
539
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
540
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
541
+ },
542
+ {
543
+ "T":"FT",
544
+ "Modelo":"pierreguillou\/bert-base-brpt-clinical",
545
+ "Tipo":"FT : fine-tuned",
546
+ "Arquitetura":"",
547
+ "Tipo de Peso":"Original",
548
+ "Precis\u00e3o":"float16",
549
+ "Licen\u00e7a":"MIT",
550
+ "#Params (B)":0.11,
551
+ "Hub Likes":73,
552
+ "Dispon\u00edvel no hub":true,
553
+ "SHA do modelo":"c7bef2a",
554
+ "M\u00e9dia Geral":0.5482271789,
555
+ "\u00c1rea M\u00e9dica":0.4041051953,
556
+ "\u00c1rea do Direito":0.6969643619,
557
+ "Provas Militares":0.6095811316,
558
+ "Computa\u00e7\u00e3o":0.6695079353,
559
+ "Discurso de \u00d3dio":0.6304290307,
560
+ "Economia e Contabilidade":0.4762540627,
561
+ "Sem\u00e2ntica e Infer\u00eancia":0.5236257226,
562
+ "Multidisciplinar":0.4818402185,
563
+ "Revalida":0.3778537492,
564
+ "MREX":0.4303566413,
565
+ "OAB":0.6946451229,
566
+ "ENAM":0.6992836008,
567
+ "AFA":0.6208503603,
568
+ "ITA":0.5800215298,
569
+ "IME":0.6278715046,
570
+ "POSCOMP":0.7053490348,
571
+ "OBI":0.6336668359,
572
+ "HateBR":0.6278205555,
573
+ "PT Hate Speech":0.623751963,
574
+ "tweetSentBR":0.6397145736,
575
+ "BCB":0.4976959616,
576
+ "CFCES":0.4548121638,
577
+ "FAQUAD NLI":0.4985853658,
578
+ "ASSIN2 RTE":0.537775324,
579
+ "ASSIN2 STS":0.5345164779,
580
+ "ENEM":0.4616904996,
581
+ "BLUEX":0.4653500276,
582
+ "CNPU":0.4957192699,
583
+ "ENADE":0.4725184615,
584
+ "BNDES":0.4771470703,
585
+ "CACD (1\u00aa fase)":0.4803828804,
586
+ "CACD (2\u00aa fase)":0.5200733201,
587
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
588
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
589
+ "Datasets Provas Militares":"AFA, ITA, IME",
590
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
591
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
592
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
593
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
594
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
595
+ },
596
+ {
597
+ "T":"PT",
598
+ "Modelo":"nlp-wyldlab\/deberta-v3-base-portuguese",
599
+ "Tipo":"PT : pr\u00e9-treinado",
600
+ "Arquitetura":"",
601
+ "Tipo de Peso":"Original",
602
+ "Precis\u00e3o":"float16",
603
+ "Licen\u00e7a":"MIT",
604
+ "#Params (B)":0.18,
605
+ "Hub Likes":128,
606
+ "Dispon\u00edvel no hub":true,
607
+ "SHA do modelo":"main",
608
+ "M\u00e9dia Geral":0.7465021221,
609
+ "\u00c1rea M\u00e9dica":0.594656463,
610
+ "\u00c1rea do Direito":0.6710791004,
611
+ "Provas Militares":0.8314281273,
612
+ "Computa\u00e7\u00e3o":0.6823194837,
613
+ "Discurso de \u00d3dio":0.775550195,
614
+ "Economia e Contabilidade":0.8795739095,
615
+ "Sem\u00e2ntica e Infer\u00eancia":0.7241149481,
616
+ "Multidisciplinar":0.7525018865,
617
+ "Revalida":0.5792849969,
618
+ "MREX":0.6100279292,
619
+ "OAB":0.6985800324,
620
+ "ENAM":0.6435781684,
621
+ "AFA":0.7914821772,
622
+ "ITA":0.8881410294,
623
+ "IME":0.8146611755,
624
+ "POSCOMP":0.6519856198,
625
+ "OBI":0.7126533476,
626
+ "HateBR":0.8175383934,
627
+ "PT Hate Speech":0.7687241726,
628
+ "tweetSentBR":0.7403880189,
629
+ "BCB":0.8527339923,
630
+ "CFCES":0.9064138266,
631
+ "FAQUAD NLI":0.693047413,
632
+ "ASSIN2 RTE":0.6893290657,
633
+ "ASSIN2 STS":0.7899683657,
634
+ "ENEM":0.7604902451,
635
+ "BLUEX":0.8153250765,
636
+ "CNPU":0.6935039827,
637
+ "ENADE":0.7749131066,
638
+ "BNDES":0.6890146653,
639
+ "CACD (1\u00aa fase)":0.7499769597,
640
+ "CACD (2\u00aa fase)":0.7842891694,
641
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
642
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
643
+ "Datasets Provas Militares":"AFA, ITA, IME",
644
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
645
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
646
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
647
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
648
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
649
+ }
650
+ ]
output/leaderboard_data_20250413_002339.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a27686dc08775fc43f0b92365c1affd11dfc08026a41a926b2a0c3c22739807d
3
+ size 7463
output/leaderboard_data_20250413_002339.xlsx ADDED
Binary file (12.3 kB). View file
 
output/leaderboard_info_20250413_002339.txt ADDED
@@ -0,0 +1,81 @@
1
+ DataFrame Shape: (12, 52)
2
+
3
+ Colunas:
4
+ - T
5
+ - Modelo
6
+ - Tipo
7
+ - Arquitetura
8
+ - Tipo de Peso
9
+ - Precisão
10
+ - Licença
11
+ - #Params (B)
12
+ - Hub Likes
13
+ - Disponível no hub
14
+ - SHA do modelo
15
+ - Média Geral
16
+ - Área Médica
17
+ - Área do Direito
18
+ - Provas Militares
19
+ - Computação
20
+ - Discurso de Ódio
21
+ - Economia e Contabilidade
22
+ - Semântica e Inferência
23
+ - Multidisciplinar
24
+ - Revalida
25
+ - MREX
26
+ - OAB
27
+ - ENAM
28
+ - AFA
29
+ - ITA
30
+ - IME
31
+ - POSCOMP
32
+ - OBI
33
+ - HateBR
34
+ - PT Hate Speech
35
+ - tweetSentBR
36
+ - BCB
37
+ - CFCES
38
+ - FAQUAD NLI
39
+ - ASSIN2 RTE
40
+ - ASSIN2 STS
41
+ - ENEM
42
+ - BLUEX
43
+ - CNPU
44
+ - ENADE
45
+ - BNDES
46
+ - CACD (1ª fase)
47
+ - CACD (2ª fase)
48
+ - Datasets Área Médica
49
+ - Datasets Área do Direito
50
+ - Datasets Provas Militares
51
+ - Datasets Computação
52
+ - Datasets Discurso de Ódio
53
+ - Datasets Economia e Contabilidade
54
+ - Datasets Semântica e Inferência
55
+ - Datasets Multidisciplinar
56
+
57
+ Informações por área:
58
+
59
+ Área Médica:
60
+ - Datasets: ['Revalida', 'MREX']
61
+
62
+ Área do Direito:
63
+ - Datasets: ['OAB', 'ENAM']
64
+
65
+ Provas Militares:
66
+ - Datasets: ['AFA', 'ITA', 'IME']
67
+
68
+ Computação:
69
+ - Datasets: ['POSCOMP', 'OBI']
70
+
71
+ Discurso de Ódio:
72
+ - Datasets: ['HateBR', 'PT Hate Speech', 'tweetSentBR']
73
+
74
+ Economia e Contabilidade:
75
+ - Datasets: ['BCB', 'CFCES']
76
+
77
+ Semântica e Inferência:
78
+ - Datasets: ['FAQUAD NLI', 'ASSIN2 RTE', 'ASSIN2 STS']
79
+
80
+ Multidisciplinar:
81
+ - Datasets: ['ENEM', 'BLUEX', 'CNPU', 'ENADE', 'BNDES', 'CACD (1ª fase)', 'CACD (2ª fase)']
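
The info file above records the exported DataFrame's shape, (12, 52), and the area-to-dataset mapping. As a quick sanity check, the exported pickle could be inspected as follows — a minimal sketch, assuming the pickle stores the DataFrame directly (the file path is the one added in this commit):

```python
# Minimal sketch: load the exported pickle and cross-check one area average.
import pandas as pd

df = pd.read_pickle("output/leaderboard_data_20250413_002339.pkl")
print(df.shape)  # expected: (12, 52)

# Recompute "Área Médica" from its dataset columns (Revalida, MREX).
recomputed = df[["Revalida", "MREX"]].mean(axis=1)
print((recomputed - df["Área Médica"]).abs().max())  # ~0 if the export is consistent
```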
pyproject.toml ADDED
@@ -0,0 +1,13 @@
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt CHANGED
@@ -1,18 +1,16 @@
1
- APScheduler>=3.10.0
2
  black
3
  datasets
4
- gradio>=3.50.0
5
  gradio[oauth]
6
- gradio-leaderboard>=0.0.13
7
  gradio_client
8
- huggingface_hub>=0.19.0
9
  matplotlib
10
- numpy>=1.24.0
11
- pandas>=2.0.0
12
- python-dateutil>=2.8.2
13
- streamlit>=1.31.0
14
  tqdm
15
- transformers>=4.30.0
16
  tokenizers>=0.15.0
17
- sentencepiece
18
- safetensors>=0.4.0
 
1
+ APScheduler
2
  black
3
  datasets
4
+ gradio
5
  gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
  gradio_client
8
+ huggingface-hub>=0.18.0
9
  matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
 
13
  tqdm
14
+ transformers
15
  tokenizers>=0.15.0
16
+ sentencepiece
 
src/about.py ADDED
@@ -0,0 +1,103 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ col_name: str
9
+
10
+
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
+ class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ # Área Médica
16
+ REVALIDA = Task("revalida", "acc", "Revalida")
17
+ MREX = Task("mrex", "acc", "MREX")
18
+ # Área do Direito
19
+ OAB = Task("oab", "acc", "OAB")
20
+ ENAM = Task("enam", "acc", "ENAM")
21
+ # Provas Militares
22
+ AFA = Task("afa", "acc", "AFA")
23
+ ITA = Task("ita", "acc", "ITA")
24
+ IME = Task("ime", "acc", "IME")
25
+ # Computação
26
+ POSCOMP = Task("poscomp", "acc", "POSCOMP")
27
+ OBI = Task("obi", "acc", "OBI")
28
+ # Discurso de Ódio
29
+ HATEBR = Task("hatebr", "acc", "HateBR")
30
+ PT_HATE_SPEECH = Task("pt_hate_speech", "acc", "PT Hate Speech")
31
+ TWEETSENTBR = Task("tweetsentbr", "acc", "tweetSentBR")
32
+ # Economia e Contabilidade
33
+ BCB = Task("bcb", "acc", "BCB")
34
+ CFCES = Task("cfces", "acc", "CFCES")
35
+ # Compreensão de Semântica e Inferência Textual
36
+ FAQUAD_NLI = Task("faquad_nli", "acc", "FAQUAD NLI")
37
+ ASSIN2_RTE = Task("assin2_rte", "acc", "ASSIN2 RTE")
38
+ ASSIN2_STS = Task("assin2_sts", "acc", "ASSIN2 STS")
39
+ # Provas de Conhecimento Multidisciplinar
40
+ ENEM = Task("enem", "acc", "ENEM")
41
+ BLUEX = Task("bluex", "acc", "BLUEX")
42
+ CNPU = Task("cnpu", "acc", "CNPU")
43
+ ENADE = Task("enade", "acc", "ENADE")
44
+ BNDES = Task("bndes", "acc", "BNDES")
45
+ CACD_1 = Task("cacd_1", "acc", "CACD (1ª fase)")
46
+ CACD_2 = Task("cacd_2", "acc", "CACD (2ª fase)")
47
+
48
+
49
+ NUM_FEWSHOT = 0 # Change with your few shot
50
+ # ---------------------------------------------------
51
+
52
+
53
+
54
+ # Your leaderboard name
55
+ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
56
+
57
+ # What does your leaderboard evaluate?
58
+ INTRODUCTION_TEXT = """
59
+ Intro text
60
+ """
61
+
62
+ # Which evaluations are you running? how can people reproduce what you have?
63
+ LLM_BENCHMARKS_TEXT = f"""
64
+ ## How it works
65
+
66
+ ## Reproducibility
67
+ To reproduce our results, here are the commands you can run:
68
+
69
+ """
70
+
71
+ EVALUATION_QUEUE_TEXT = """
72
+ ## Some good practices before submitting a model
73
+
74
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
75
+ ```python
76
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
77
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
78
+ model = AutoModel.from_pretrained("your model name", revision=revision)
79
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
80
+ ```
81
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
82
+
83
+ Note: make sure your model is public!
84
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it; stay posted!
85
+
86
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
87
+ It's a newer format for storing weights that is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
88
+
89
+ ### 3) Make sure your model has an open license!
90
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
91
+
92
+ ### 4) Fill up your model card
93
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
94
+
95
+ ## In case of model failure
96
+ If your model is displayed in the `FAILED` category, its execution stopped.
97
+ Make sure you have followed the above steps first.
98
+ If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
99
+ """
100
+
101
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
102
+ CITATION_BUTTON_TEXT = r"""
103
+ """
src/about.pyZone.Identifier ADDED
File without changes
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
1
+ custom_css = """
2
+
3
+ .markdown-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ #citation-button span {
12
+ font-size: 16px !important;
13
+ }
14
+
15
+ #citation-button textarea {
16
+ font-size: 16px !important;
17
+ }
18
+
19
+ #citation-button > label > button {
20
+ margin: 6px;
21
+ transform: scale(1.3);
22
+ }
23
+
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
+ }
31
+
32
+ #search-bar-table-box > div:first-child {
33
+ background: none;
34
+ border: none;
35
+ }
36
+
37
+ #search-bar {
38
+ padding: 0px;
39
+ }
40
+
41
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ #leaderboard-table td:nth-child(2),
43
+ #leaderboard-table th:nth-child(2) {
44
+ max-width: 400px;
45
+ overflow: auto;
46
+ white-space: nowrap;
47
+ }
48
+
49
+ .tab-buttons button {
50
+ font-size: 20px;
51
+ }
52
+
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
+ }
61
+
62
+ #scale-logo .download {
63
+ display: none;
64
+ }
65
+ #filter_type{
66
+ border: 0;
67
+ padding-left: 0;
68
+ padding-top: 0;
69
+ }
70
+ #filter_type label {
71
+ display: flex;
72
+ }
73
+ #filter_type label > span{
74
+ margin-top: var(--spacing-lg);
75
+ margin-right: 0.5em;
76
+ }
77
+ #filter_type label > .wrap{
78
+ width: 103px;
79
+ }
80
+ #filter_type label > .wrap .wrap-inner{
81
+ padding: 2px;
82
+ }
83
+ #filter_type label > .wrap .wrap-inner input{
84
+ width: 1px
85
+ }
86
+ #filter-columns-type{
87
+ border:0;
88
+ padding:0.5;
89
+ }
90
+ #filter-columns-size{
91
+ border:0;
92
+ padding:0.5;
93
+ }
94
+ #box-filter > .form{
95
+ border: 0
96
+ }
97
+ """
98
+
99
+ get_window_url_params = """
100
+ function(url_params) {
101
+ const params = new URLSearchParams(window.location.search);
102
+ url_params = Object.fromEntries(params);
103
+ return url_params;
104
+ }
105
+ """
src/display/css_html_js.pyZone.Identifier ADDED
File without changes
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
1
+ def model_hyperlink(link, model_name):
2
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
+
4
+
5
+ def make_clickable_model(model_name):
6
+ link = f"https://huggingface.co/{model_name}"
7
+ return model_hyperlink(link, model_name)
8
+
9
+
10
+ def styled_error(error):
11
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
+
13
+
14
+ def styled_warning(warn):
15
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
+
17
+
18
+ def styled_message(message):
19
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
+
21
+
22
+ def has_no_nan_values(df, columns):
23
+ return df[columns].notna().all(axis=1)
24
+
25
+
26
+ def has_nan_values(df, columns):
27
+ return df[columns].isna().any(axis=1)
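
These helpers wrap leaderboard values in HTML for Gradio's markdown columns. A short usage sketch, illustrative and not part of the module itself:

```python
# Illustrative usage of the formatting helpers defined above.
from src.display.formatting import make_clickable_model, styled_error

html = make_clickable_model("neuralmind/bert-base-portuguese-cased")
# html is an <a> tag pointing at https://huggingface.co/neuralmind/bert-base-portuguese-cased
print(styled_error("Model not found"))  # red, centered error paragraph
```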
src/display/formatting.pyZone.Identifier ADDED
File without changes
src/display/utils.py ADDED
@@ -0,0 +1,145 @@
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
+ def fields(raw_class):
9
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
+ @dataclass
16
+ class ColumnContent:
17
+ name: str
18
+ type: str
19
+ displayed_by_default: bool
20
+ hidden: bool = False
21
+ never_hidden: bool = False
22
+
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
26
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
+ #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Média Geral ⬆️", "number", True)])
30
+
31
+ # Add columns for the per-area averages
32
+ auto_eval_column_dict.append(["area_medica_avg", ColumnContent, ColumnContent("Área Médica", "number", True)])
33
+ auto_eval_column_dict.append(["area_direito_avg", ColumnContent, ColumnContent("Área do Direito", "number", True)])
34
+ auto_eval_column_dict.append(["provas_militares_avg", ColumnContent, ColumnContent("Provas Militares", "number", True)])
35
+ auto_eval_column_dict.append(["computacao_avg", ColumnContent, ColumnContent("Computação", "number", True)])
36
+ auto_eval_column_dict.append(["discurso_odio_avg", ColumnContent, ColumnContent("Discurso de Ódio", "number", True)])
37
+ auto_eval_column_dict.append(["economia_contabilidade_avg", ColumnContent, ColumnContent("Economia e Contabilidade", "number", True)])
38
+ auto_eval_column_dict.append(["semantica_inferencia_avg", ColumnContent, ColumnContent("Semântica e Inferência", "number", True)])
39
+ auto_eval_column_dict.append(["multidisciplinar_avg", ColumnContent, ColumnContent("Multidisciplinar", "number", True)])
40
+
41
+ for task in Tasks:
42
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False)]) # Set to False so individual tasks are hidden by default on the main tab
43
+ # Model information
44
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
45
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
46
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
47
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
48
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
49
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
50
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
51
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
52
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
53
+
54
+ # We use make dataclass to dynamically fill the scores from Tasks
55
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
56
+
57
+ # Mapping from knowledge areas to their corresponding Tasks
58
+ AREA_DEFINITIONS = {
59
+ "Área Médica": [Tasks.REVALIDA, Tasks.MREX],
60
+ "Área do Direito": [Tasks.OAB, Tasks.ENAM],
61
+ "Provas Militares": [Tasks.AFA, Tasks.ITA, Tasks.IME],
62
+ "Computação": [Tasks.POSCOMP, Tasks.OBI],
63
+ "Discurso de Ódio": [Tasks.HATEBR, Tasks.PT_HATE_SPEECH, Tasks.TWEETSENTBR],
64
+ "Economia e Contabilidade": [Tasks.BCB, Tasks.CFCES],
65
+ "Semântica e Inferência": [Tasks.FAQUAD_NLI, Tasks.ASSIN2_RTE, Tasks.ASSIN2_STS],
66
+ "Multidisciplinar": [Tasks.ENEM, Tasks.BLUEX, Tasks.CNPU, Tasks.ENADE, Tasks.BNDES, Tasks.CACD_1, Tasks.CACD_2],
67
+ }
68
+
69
+ # Mapping from area names to their corresponding average columns
70
+ AREA_AVG_COLUMN_MAP = {
71
+ "Área Médica": AutoEvalColumn.area_medica_avg.name,
72
+ "Área do Direito": AutoEvalColumn.area_direito_avg.name,
73
+ "Provas Militares": AutoEvalColumn.provas_militares_avg.name,
74
+ "Computação": AutoEvalColumn.computacao_avg.name,
75
+ "Discurso de Ódio": AutoEvalColumn.discurso_odio_avg.name,
76
+ "Economia e Contabilidade": AutoEvalColumn.economia_contabilidade_avg.name,
77
+ "Semântica e Inferência": AutoEvalColumn.semantica_inferencia_avg.name,
78
+ "Multidisciplinar": AutoEvalColumn.multidisciplinar_avg.name,
79
+ }
80
+
81
+ ## For the queue columns in the submission tab
82
+ @dataclass(frozen=True)
83
+ class EvalQueueColumn: # Queue column
84
+ model = ColumnContent("model", "markdown", True)
85
+ revision = ColumnContent("revision", "str", True)
86
+ private = ColumnContent("private", "bool", True)
87
+ precision = ColumnContent("precision", "str", True)
88
+ weight_type = ColumnContent("weight_type", "str", True) # displayed_by_default expects a bool, not the string "Original"
89
+ status = ColumnContent("status", "str", True)
90
+
91
+ ## All the model information that we might need
92
+ @dataclass
93
+ class ModelDetails:
94
+ name: str
95
+ display_name: str = ""
96
+ symbol: str = "" # emoji
97
+
98
+
99
+ class ModelType(Enum):
100
+ PT = ModelDetails(name="pretrained", symbol="🟢")
101
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
102
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
103
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
104
+ Unknown = ModelDetails(name="", symbol="?")
105
+
106
+ def to_str(self, separator=" "):
107
+ return f"{self.value.symbol}{separator}{self.value.name}"
108
+
109
+ @staticmethod
110
+ def from_str(type):
111
+ if "fine-tuned" in type or "🔶" in type:
112
+ return ModelType.FT
113
+ if "pretrained" in type or "🟢" in type:
114
+ return ModelType.PT
115
+ if "RL-tuned" in type or "🟦" in type:
116
+ return ModelType.RL
117
+ if "instruction-tuned" in type or "⭕" in type:
118
+ return ModelType.IFT
119
+ return ModelType.Unknown
120
+
121
+ class WeightType(Enum):
122
+ Adapter = ModelDetails("Adapter")
123
+ Original = ModelDetails("Original")
124
+ Delta = ModelDetails("Delta")
125
+
126
+ class Precision(Enum):
127
+ float16 = ModelDetails("float16")
128
+ bfloat16 = ModelDetails("bfloat16")
129
+ Unknown = ModelDetails("?")
130
+
131
+ @staticmethod
+ def from_str(precision):
132
+ if precision in ["torch.float16", "float16"]:
133
+ return Precision.float16
134
+ if precision in ["torch.bfloat16", "bfloat16"]:
135
+ return Precision.bfloat16
136
+ return Precision.Unknown
137
+
138
+ # Column selection
139
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
140
+
141
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
142
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
143
+
144
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
145
+
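
The two mappings above are what tie individual tasks to their per-area average columns. A small sketch of how they line up, purely illustrative:

```python
# Print which task columns feed each area-average column; illustrative only.
from src.display.utils import AREA_DEFINITIONS, AREA_AVG_COLUMN_MAP

for area, tasks in AREA_DEFINITIONS.items():
    task_cols = [t.value.col_name for t in tasks]  # display names of the task columns
    avg_col = AREA_AVG_COLUMN_MAP[area]            # column that stores the area mean
    print(f"{avg_col} = mean({', '.join(task_cols)})")
```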
src/display/utils.pyZone.Identifier ADDED
File without changes
src/envs.py ADDED
@@ -0,0 +1,25 @@
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
+
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ # ----------------------------------
11
+
12
+ REPO_ID = f"{OWNER}/leaderboard"
13
+ QUEUE_REPO = f"{OWNER}/requests"
14
+ RESULTS_REPO = f"{OWNER}/results"
15
+
16
+ # If you setup a cache later, just change HF_HOME
17
+ CACHE_PATH = os.getenv("HF_HOME", ".")
18
+
19
+ # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
+
25
+ API = HfApi(token=TOKEN)
src/envs.pyZone.Identifier ADDED
File without changes
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,196 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
+ from src.submission.check_validity import is_model_on_hub
13
+
14
+
15
+ @dataclass
16
+ class EvalResult:
17
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
+ """
19
+ eval_name: str # org_model_precision (uid)
20
+ full_model: str # org/model (path on hub)
21
+ org: str
22
+ model: str
23
+ revision: str # commit hash, "" if main
24
+ results: dict
25
+ precision: Precision = Precision.Unknown
26
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
+ weight_type: WeightType = WeightType.Original # Original or Adapter
28
+ architecture: str = "Unknown"
29
+ license: str = "?"
30
+ likes: int = 0
31
+ num_params: int = 0
32
+ date: str = "" # submission date of request file
33
+ still_on_hub: bool = False
34
+
35
+ @classmethod
36
+ def init_from_json_file(cls, json_filepath):
37
+ """Inits the result from the specific model result file"""
38
+ with open(json_filepath) as fp:
39
+ data = json.load(fp)
40
+
41
+ config = data.get("config")
42
+
43
+ # Precision
44
+ precision = Precision.from_str(config.get("model_dtype"))
45
+
46
+ # Get model and org
47
+ org_and_model = config.get("model_name", config.get("model_args", None))
48
+ org_and_model = org_and_model.split("/", 1)
49
+
50
+ if len(org_and_model) == 1:
51
+ org = None
52
+ model = org_and_model[0]
53
+ result_key = f"{model}_{precision.value.name}"
54
+ else:
55
+ org = org_and_model[0]
56
+ model = org_and_model[1]
57
+ result_key = f"{org}_{model}_{precision.value.name}"
58
+ full_model = "/".join(org_and_model)
59
+
60
+ still_on_hub, _, model_config = is_model_on_hub(
61
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
+ )
63
+ architecture = "?"
64
+ if model_config is not None:
65
+ architectures = getattr(model_config, "architectures", None)
66
+ if architectures:
67
+ architecture = ";".join(architectures)
68
+
69
+ # Extract results available in this file (some results are split in several files)
70
+ results = {}
71
+ for task in Tasks:
72
+ task = task.value
73
+
74
+ # We average all scores of a given metric (not all metrics are present in all files)
75
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
+ if accs.size == 0 or any([acc is None for acc in accs]):
77
+ continue
78
+
79
+ mean_acc = np.mean(accs) * 100.0
80
+ results[task.benchmark] = mean_acc
81
+
82
+ return cls(
83
+ eval_name=result_key,
84
+ full_model=full_model,
85
+ org=org,
86
+ model=model,
87
+ results=results,
88
+ precision=precision,
89
+ revision= config.get("model_sha", ""),
90
+ still_on_hub=still_on_hub,
91
+ architecture=architecture
92
+ )
93
+
94
+ def update_with_request_file(self, requests_path):
95
+ """Finds the relevant request file for the current model and updates info with it"""
96
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
+
98
+ try:
99
+ with open(request_file, "r") as f:
100
+ request = json.load(f)
101
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
102
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
103
+ self.license = request.get("license", "?")
104
+ self.likes = request.get("likes", 0)
105
+ self.num_params = request.get("params", 0)
106
+ self.date = request.get("submitted_time", "")
107
+ except Exception:
108
+ print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
+
110
+ def to_dict(self):
111
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
112
+ average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
+ data_dict = {
114
+ "eval_name": self.eval_name, # not a column, just a save name,
115
+ AutoEvalColumn.precision.name: self.precision.value.name,
116
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
117
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
+ AutoEvalColumn.architecture.name: self.architecture,
120
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
+ AutoEvalColumn.revision.name: self.revision,
122
+ AutoEvalColumn.average.name: average,
123
+ AutoEvalColumn.license.name: self.license,
124
+ AutoEvalColumn.likes.name: self.likes,
125
+ AutoEvalColumn.params.name: self.num_params,
126
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
+ }
128
+
129
+ for task in Tasks:
130
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
+
132
+ return data_dict
133
+
134
+
135
+ def get_request_file_for_model(requests_path, model_name, precision):
136
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
+ request_files = os.path.join(
138
+ requests_path,
139
+ f"{model_name}_eval_request_*.json",
140
+ )
141
+ request_files = glob.glob(request_files)
142
+
143
+ # Select correct request file (precision)
144
+ request_file = ""
145
+ request_files = sorted(request_files, reverse=True)
146
+ for tmp_request_file in request_files:
147
+ with open(tmp_request_file, "r") as f:
148
+ req_content = json.load(f)
149
+ if (
150
+ req_content["status"] in ["FINISHED"]
151
+ and req_content["precision"] == precision.split(".")[-1]
152
+ ):
153
+ request_file = tmp_request_file
154
+ return request_file
155
+
156
+
157
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
+ """From the path of the results folder root, extract all needed info for results"""
159
+ model_result_filepaths = []
160
+
161
+ for root, _, files in os.walk(results_path):
162
+ # We should only have json files in model results
163
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
+ continue
165
+
166
+ # Sort the files by date
167
+ try:
168
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
+ except dateutil.parser._parser.ParserError:
170
+ files = [files[-1]]
171
+
172
+ for file in files:
173
+ model_result_filepaths.append(os.path.join(root, file))
174
+
175
+ eval_results = {}
176
+ for model_result_filepath in model_result_filepaths:
177
+ # Creation of result
178
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
+ eval_result.update_with_request_file(requests_path)
180
+
181
+ # Store results of same eval together
182
+ eval_name = eval_result.eval_name
183
+ if eval_name in eval_results.keys():
184
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
+ else:
186
+ eval_results[eval_name] = eval_result
187
+
188
+ results = []
189
+ for v in eval_results.values():
190
+ try:
191
+ v.to_dict() # we test if the dict version is complete
192
+ results.append(v)
193
+ except KeyError: # not all eval values present
194
+ continue
195
+
196
+ return results
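
`init_from_json_file` reads the model identity from `config` and the per-task scores from `results`. A minimal result file it would accept looks like this — the scores are invented, but the field names are taken from the parser above:

```python
# Sketch of the smallest results file the parser above accepts; scores are made up.
import json

example = {
    "config": {
        "model_name": "neuralmind/bert-base-portuguese-cased",
        "model_dtype": "float16",
        "model_sha": "main",
    },
    "results": {
        # keyed by task.benchmark, each holding {task.metric: score}
        "revalida": {"acc": 0.73},
        "oab": {"acc": 0.72},
    },
}
with open("results_example.json", "w") as fp:
    json.dump(example, fp)
# EvalResult.init_from_json_file("results_example.json") would then parse it
# (note: the parser also calls is_model_on_hub, so it needs network access).
```

Note that the parser multiplies each score by 100, so stored accuracies are expected in [0, 1].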
src/leaderboard/read_evals.pyZone.Identifier ADDED
File without changes
src/populate.py ADDED
@@ -0,0 +1,79 @@
1
+ import json
2
+ import os
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, AREA_DEFINITIONS, AREA_AVG_COLUMN_MAP, fields
8
+ from src.leaderboard.read_evals import get_raw_eval_results
9
+ from src.about import Tasks
10
+
11
+
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
13
+ """Creates a dataframe from all the individual experiment results"""
14
+ raw_data = get_raw_eval_results(results_path, requests_path)
15
+ all_data_json = [v.to_dict() for v in raw_data]
16
+
17
+ df = pd.DataFrame.from_records(all_data_json)
18
+
19
+ # Compute the per-area averages
20
+ for area_name, tasks_in_area in AREA_DEFINITIONS.items():
21
+ area_cols = [task.value.col_name for task in tasks_in_area if task.value.col_name in df.columns] # the dataframe columns use the display names (col_name), not the enum member names
22
+ avg_col_name = AREA_AVG_COLUMN_MAP[area_name]
23
+ if area_cols: # Only compute the mean if the area's columns are present in the DataFrame
24
+ df[avg_col_name] = df[area_cols].mean(axis=1)
25
+ else:
26
+ df[avg_col_name] = np.nan # Set to NaN when none of the area's columns are present
27
+
28
+ # Compute the overall average (now based on the per-area means)
29
+ avg_area_cols = list(AREA_AVG_COLUMN_MAP.values())
30
+ df[AutoEvalColumn.average.name] = df[avg_area_cols].mean(axis=1)
31
+
32
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
33
+
34
+ # Select and round the columns
35
+ all_display_cols = [c.name for c in fields(AutoEvalColumn)] # Collect every defined column
36
+ df = df[[col for col in all_display_cols if col in df.columns]] # Keep only the columns present in the df
37
+ df = df.round(decimals=2)
38
+
39
+ # Filter out rows with NaN values in the original benchmark columns (if needed)
40
+ # benchmark_cols = [t.name for t in Tasks] # Uncomment to restore the original filter
41
+ # df = df[has_no_nan_values(df, benchmark_cols)]
42
+
43
+ return df
44
+
45
+
46
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
47
+ """Creates the different dataframes for the evaluation queues requestes"""
48
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
49
+ all_evals = []
50
+
51
+ for entry in entries:
52
+ if ".json" in entry:
53
+ file_path = os.path.join(save_path, entry)
54
+ with open(file_path) as fp:
55
+ data = json.load(fp)
56
+
57
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
58
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
59
+
60
+ all_evals.append(data)
61
+ elif ".md" not in entry:
62
+ # this is a folder
63
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")] # check the full path, not just the bare filename
64
+ for sub_entry in sub_entries:
65
+ file_path = os.path.join(save_path, entry, sub_entry)
66
+ with open(file_path) as fp:
67
+ data = json.load(fp)
68
+
69
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
70
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
71
+ all_evals.append(data)
72
+
73
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
74
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
75
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
76
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
77
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
78
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
79
+ return df_finished[cols], df_running[cols], df_pending[cols]
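
The averaging scheme in `get_leaderboard_df` computes per-area means first and only then averages those means, so every area weighs equally regardless of how many datasets it contains. A toy demonstration with arbitrary numbers:

```python
# Toy demo of the two-level averaging used above; the scores are arbitrary.
import pandas as pd

df = pd.DataFrame({"Revalida": [0.73], "MREX": [0.77], "OAB": [0.72], "ENAM": [0.85]})
df["Área Médica"] = df[["Revalida", "MREX"]].mean(axis=1)                 # 0.7500
df["Área do Direito"] = df[["OAB", "ENAM"]].mean(axis=1)                  # 0.7850
df["Média Geral"] = df[["Área Médica", "Área do Direito"]].mean(axis=1)  # 0.7675
print(df.round(4))
```

A flat mean over the four datasets happens to give 0.7675 here too, but only because both areas have two datasets each; with unequal area sizes the two schemes diverge.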
src/populate.pyZone.Identifier ADDED
File without changes
src/submission/check_validity.py ADDED
@@ -0,0 +1,99 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
14
+ """Checks if the model card and license exist and have been filled"""
15
+ try:
16
+ card = ModelCard.load(repo_id)
17
+ except huggingface_hub.utils.EntryNotFoundError:
18
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
+
20
+ # Enforce license metadata
21
+ if card.data.license is None:
22
+ if not ("license_name" in card.data and "license_link" in card.data):
23
+ return False, (
24
+ "License not found. Please add a license to your model card using the `license` metadata or a"
25
+ " `license_name`/`license_link` pair."
26
+ )
27
+
28
+ # Enforce card content
29
+ if len(card.text) < 200:
30
+ return False, "Please add a description to your model card, it is too short."
31
+
32
+ return True, ""
33
+
34
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, object]:
35
+ """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
+ try:
37
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
+ if test_tokenizer:
39
+ try:
40
+ tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
+ except ValueError as e:
42
+ return (
43
+ False,
44
+ f"uses a tokenizer which is not in a transformers release: {e}",
45
+ None
46
+ )
47
+ except Exception as e:
48
+ return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
+ return True, None, config
50
+
51
+ except ValueError:
52
+ return (
53
+ False,
54
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
+ None
56
+ )
57
+
58
+ except Exception as e:
59
+ return False, "was not found on hub!", None
60
+
61
+
62
+ def get_model_size(model_info: ModelInfo, precision: str):
63
+ """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
+ try:
65
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
+ except (AttributeError, TypeError):
67
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
+
69
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
+ model_size = size_factor * model_size
71
+ return model_size
72
+
73
+ def get_model_arch(model_info: ModelInfo):
74
+ """Gets the model architecture from the configuration"""
75
+ return model_info.config.get("architectures", "Unknown")
76
+
77
+ def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
78
+ """Gather a list of already submitted models to avoid duplicates"""
79
+ depth = 1
80
+ file_names = []
81
+ users_to_submission_dates = defaultdict(list)
82
+
83
+ for root, _, files in os.walk(requested_models_dir):
84
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
+ if current_depth == depth:
86
+ for file in files:
87
+ if not file.endswith(".json"):
88
+ continue
89
+ with open(os.path.join(root, file), "r") as f:
90
+ info = json.load(f)
91
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
+
93
+ # Select organisation
94
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
95
+ continue
96
+ organisation, _ = info["model"].split("/")
97
+ users_to_submission_dates[organisation].append(info["submitted_time"])
98
+
99
+ return set(file_names), users_to_submission_dates
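
Together these helpers gate submissions on Hub presence, loadability, model card content, and size. A hedged usage sketch — the repo id is just an example taken from the leaderboard data above, and both calls hit the Hub:

```python
# Illustrative pre-submission checks using the helpers defined above.
from src.submission.check_validity import check_model_card, is_model_on_hub

on_hub, error, config = is_model_on_hub(
    "neuralmind/bert-base-portuguese-cased", revision="main", test_tokenizer=True
)
if not on_hub:
    print(f"Rejected: model {error}")

card_ok, message = check_model_card("neuralmind/bert-base-portuguese-cased")
if not card_ok:
    print(f"Rejected: {message}")
```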
src/submission/check_validity.pyZone.Identifier ADDED
File without changes
src/submission/submit.py ADDED
@@ -0,0 +1,119 @@
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ from src.submission.check_validity import (
8
+ already_submitted_models,
9
+ check_model_card,
10
+ get_model_size,
11
+ is_model_on_hub,
12
+ )
13
+
14
+ REQUESTED_MODELS = None
15
+ USERS_TO_SUBMISSION_DATES = None
16
+
17
+ def add_new_eval(
18
+ model: str,
19
+ base_model: str,
20
+ revision: str,
21
+ precision: str,
22
+ weight_type: str,
23
+ model_type: str,
24
+ ):
25
+ global REQUESTED_MODELS
26
+ global USERS_TO_SUBMISSION_DATES
27
+ if not REQUESTED_MODELS:
28
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
+
30
+ user_name = ""
31
+ model_path = model
32
+ if "/" in model:
33
+ user_name = model.split("/")[0]
34
+ model_path = model.split("/")[1]
35
+
36
+ precision = precision.split(" ")[0]
37
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
+
39
+ if model_type is None or model_type == "":
40
+ return styled_error("Please select a model type.")
41
+
42
+ # Does the model actually exist?
43
+ if revision == "":
44
+ revision = "main"
45
+
46
+ # Is the model on the hub?
47
+ if weight_type in ["Delta", "Adapter"]:
48
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
+ if not base_model_on_hub:
50
+ return styled_error(f'Base model "{base_model}" {error}')
51
+
52
+ if not weight_type == "Adapter":
53
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
+ if not model_on_hub:
55
+ return styled_error(f'Model "{model}" {error}')
56
+
57
+ # Is the model info correctly filled?
58
+ try:
59
+ model_info = API.model_info(repo_id=model, revision=revision)
60
+ except Exception:
61
+ return styled_error("Could not get your model information. Please fill it up properly.")
62
+
63
+ model_size = get_model_size(model_info=model_info, precision=precision)
64
+
65
+ # Were the model card and license filled?
66
+ try:
67
+ license = model_info.cardData["license"]
68
+ except Exception:
69
+ return styled_error("Please select a license for your model")
70
+
71
+ modelcard_OK, error_msg = check_model_card(model)
72
+ if not modelcard_OK:
73
+ return styled_error(error_msg)
74
+
75
+ # Seems good, creating the eval
76
+ print("Adding new eval")
77
+
78
+ eval_entry = {
79
+ "model": model,
80
+ "base_model": base_model,
81
+ "revision": revision,
82
+ "precision": precision,
83
+ "weight_type": weight_type,
84
+ "status": "PENDING",
85
+ "submitted_time": current_time,
86
+ "model_type": model_type,
87
+ "likes": model_info.likes,
88
+ "params": model_size,
89
+ "license": license,
90
+ "private": False,
91
+ }
92
+
93
+ # Check for duplicate submission
94
+ if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
+ return styled_warning("This model has been already submitted.")
96
+
97
+ print("Creating eval file")
98
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
+ os.makedirs(OUT_DIR, exist_ok=True)
100
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
+
102
+ with open(out_path, "w") as f:
103
+ f.write(json.dumps(eval_entry))
104
+
105
+ print("Uploading eval file")
106
+ API.upload_file(
107
+ path_or_fileobj=out_path,
108
+ path_in_repo=out_path.split("eval-queue/")[1],
109
+ repo_id=QUEUE_REPO,
110
+ repo_type="dataset",
111
+ commit_message=f"Add {model} to eval queue",
112
+ )
113
+
114
+ # Remove the local file
115
+ os.remove(out_path)
116
+
117
+ return styled_message(
118
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
+ )
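
`add_new_eval` validates the submission, writes a `*_eval_request_*.json` file locally, uploads it to the queue dataset repo, and deletes the local copy. A sketch of calling it directly — it requires `HF_TOKEN` and the queue repo from `src/envs.py` to exist, and `"your-org/your-model"` is a placeholder, so treat it as illustrative:

```python
# Illustrative direct call to the submission flow defined above.
from src.submission.submit import add_new_eval

html_status = add_new_eval(
    model="your-org/your-model",   # placeholder repo id
    base_model="",                 # only needed for Delta/Adapter weights
    revision="main",
    precision="float16",
    weight_type="Original",
    model_type="FT : fine-tuned",  # parsed by ModelType.from_str
)
print(html_status)  # styled HTML message (success, warning, or error)
```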
src/submission/submit.pyZone.Identifier ADDED
File without changes