LucasLima committed on
Commit ac500fb · verified · 1 Parent(s): 087d1c0

Upload 34 files

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
+ auto_evals/
+ venv/
+ __pycache__/
+ .env
+ .ipynb_checkpoints
+ *ipynb
+ .vscode/
+
+ eval-queue/
+ eval-results/
+ eval-queue-bk/
+ eval-results-bk/
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ default_language_version:
+   python: python3
+
+ ci:
+   autofix_prs: true
+   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+   autoupdate_schedule: quarterly
+
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.3.0
+     hooks:
+       - id: check-yaml
+       - id: check-case-conflict
+       - id: detect-private-key
+       - id: check-added-large-files
+         args: ['--maxkb=1000']
+       - id: requirements-txt-fixer
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+
+   - repo: https://github.com/PyCQA/isort
+     rev: 5.12.0
+     hooks:
+       - id: isort
+         name: Format imports
+
+   - repo: https://github.com/psf/black
+     rev: 22.12.0
+     hooks:
+       - id: black
+         name: Format code
+         additional_dependencies: ['click==8.0.2']
+
+   - repo: https://github.com/charliermarsh/ruff-pre-commit
+     # Ruff version.
+     rev: 'v0.0.267'
+     hooks:
+       - id: ruff
Classificação dos Dataset.txt ADDED
@@ -0,0 +1,12 @@
+ Dataset Classification
+
+ The datasets below are grouped into the categories used by the leaderboard; the Portuguese category names are kept because they match the leaderboard column labels (a code sketch of this mapping follows the list):
+
+ Área Médica (Medical): Revalida, MREX
+ Área do Direito (Law): OAB, ENAM
+ Provas Militares (Military Exams): AFA, ITA, IME
+ Computação (Computing): POSCOMP, OBI
+ Discurso de Ódio (Hate Speech): HateBR, PT Hate Speech, tweetSentBR
+ Economia e Contabilidade (Economics and Accounting): BCB, CFCES
+ Compreensão de Semântica e Inferência Textual (Semantics and Textual Inference): FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS
+ Provas de Conhecimento Multidisciplinar (Multidisciplinary Knowledge Exams): ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)
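+
+ For reference, app.py consumes this grouping through the AREA_DEFINITIONS mapping imported from src/display/utils.py. A hypothetical sketch of that mapping, using plain strings here (the real module maps each area to Task entries):
+
+ AREA_DEFINITIONS = {
+     "Área Médica": ["Revalida", "MREX"],
+     "Área do Direito": ["OAB", "ENAM"],
+     "Provas Militares": ["AFA", "ITA", "IME"],
+     "Computação": ["POSCOMP", "OBI"],
+     "Discurso de Ódio": ["HateBR", "PT Hate Speech", "tweetSentBR"],
+     "Economia e Contabilidade": ["BCB", "CFCES"],
+     "Compreensão de Semântica e Inferência Textual": ["FAQUAD NLI", "ASSIN2 RTE", "ASSIN2 STS"],
+     "Provas de Conhecimento Multidisciplinar": ["ENEM", "BLUEX", "CNPU", "ENADE", "BNDES", "CACD (1ª fase)", "CACD (2ª fase)"],
+ }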
Makefile ADDED
@@ -0,0 +1,13 @@
+ .PHONY: style quality
+
+
+ style:
+ 	python -m black --line-length 119 .
+ 	python -m isort .
+ 	ruff check --fix .
+
+
+ quality:
+ 	python -m black --check --line-length 119 .
+ 	python -m isort --check-only .
+ 	ruff check .
README.md ADDED
@@ -0,0 +1,46 @@
+ ---
+ title: Cemig
+ emoji: 🥇
+ colorFrom: green
+ colorTo: indigo
+ sdk: gradio
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ short_description: Teste para criação de uma leaderboard
+ sdk_version: 5.19.0
+ ---
+
+ # Start the configuration
+
+ Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+
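+ In the stock template, each task is declared in the `Tasks` enum in `src/about.py` (app.py imports it from there); a minimal sketch with the template's default benchmarks — swap in your own:
+
+ ```python
+ from dataclasses import dataclass
+ from enum import Enum
+
+ @dataclass
+ class Task:
+     benchmark: str  # key under "results" in the result files
+     metric: str     # which metric to read for that task
+     col_name: str   # column header shown in the leaderboard
+
+ class Tasks(Enum):
+     # template defaults; replace with your own datasets
+     task0 = Task("anli_r1", "acc", "ANLI")
+     task1 = Task("logiqa", "acc_norm", "LogiQA")
+ ```
+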
+ Results files should have the following format and be stored as json files:
+ ```json
+ {
+     "config": {
+         "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+         "model_name": "path of the model on the hub: org/model",
+         "model_sha": "revision on the hub",
+     },
+     "results": {
+         "task_name": {
+             "metric_name": score,
+         },
+         "task_name2": {
+             "metric_name": score,
+         }
+     }
+ }
+ ```
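+
+ For a quick sanity check, such a file can be flattened into one table row with pandas; a minimal sketch (the file path is hypothetical):
+
+ ```python
+ import json
+
+ import pandas as pd
+
+ # hypothetical file name; real results live in the results dataset repo
+ with open("results_org__model.json") as f:
+     data = json.load(f)
+
+ row = dict(data["config"])  # model_dtype, model_name, model_sha
+ for task, metrics in data["results"].items():
+     for metric, score in metrics.items():
+         row[task] = score  # one column per task metric
+
+ df = pd.DataFrame([row])
+ print(df)
+ ```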
+
+ Request files are created automatically by this tool.
+
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+ # Code logic for more complex edits
+
+ You'll find:
+ - the main table's column names and properties in `src/display/utils.py`
+ - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,275 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+ import numpy as np
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     TITLE,
+     Tasks
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision,
+     AREA_DEFINITIONS,
+     AREA_AVG_COLUMN_MAP
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception as e:
+     print(f"Erro ao baixar EVAL_REQUESTS: {e}")
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception as e:
+     print(f"Erro ao baixar EVAL_RESULTS: {e}")
+
+ ALL_COLS = [c.name for c in fields(AutoEvalColumn)]
+
+ try:
+     LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ALL_COLS)
+ except Exception as e:
+     print(f"Erro ao gerar o DataFrame do Leaderboard: {e}")
+     LEADERBOARD_DF = pd.DataFrame()
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+ def create_leaderboard_component(dataframe, displayed_cols, hidden_cols=None, cant_deselect_cols=None, title=None):
+     if dataframe is None or dataframe.empty:
+         return gr.Markdown(f"## {title or ''}\nNão há dados para exibir.")
+
+     if hidden_cols is None:
+         hidden_cols = []
+     if cant_deselect_cols is None:
+         cant_deselect_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
+
+     all_required_cols = set(displayed_cols) | set(hidden_cols) | set(cant_deselect_cols) | {AutoEvalColumn.model_type.name, AutoEvalColumn.precision.name, AutoEvalColumn.params.name, AutoEvalColumn.still_on_hub.name}
+     available_cols = [col for col in all_required_cols if col in dataframe.columns]
+     filtered_df = dataframe[available_cols].copy()
+
+     for col in cant_deselect_cols:
+         if col not in filtered_df.columns:
+             filtered_df[col] = np.nan
+
+     return Leaderboard(
+         value=filtered_df,
+         datatype=[c.type for c in fields(AutoEvalColumn) if c.name in filtered_df.columns],
+         select_columns=SelectColumns(
+             default_selection=displayed_cols,
+             cant_deselect=cant_deselect_cols,
+             label="Selecionar Colunas para Exibir:",
+         ),
+         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name] if AutoEvalColumn.license.name in filtered_df.columns else [AutoEvalColumn.model.name],
+         hide_columns=[c for c in hidden_cols if c in filtered_df.columns],
+         filter_columns=[f for f in [
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Tipos de Modelo") if AutoEvalColumn.model_type.name in filtered_df.columns else None,
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precisão") if AutoEvalColumn.precision.name in filtered_df.columns else None,
+             ColumnFilter(
+                 AutoEvalColumn.params.name,
+                 type="slider",
+                 min=0.01,
+                 max=max(150, filtered_df[AutoEvalColumn.params.name].max() if AutoEvalColumn.params.name in filtered_df.columns and not filtered_df[AutoEvalColumn.params.name].empty else 150),
+                 label="Selecionar número de parâmetros (B)",
+             ) if AutoEvalColumn.params.name in filtered_df.columns else None,
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deletado/incompleto", default=True
+             ) if AutoEvalColumn.still_on_hub.name in filtered_df.columns else None,
+         ] if f is not None],  # drop the Nones left by missing columns
+         bool_checkboxgroup_label="Ocultar modelos",
+         interactive=False,
+     )
+
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("📊 Benchmark Geral", id=0):
+             general_cols_to_display = [
+                 AutoEvalColumn.model_type_symbol.name,
+                 AutoEvalColumn.model.name,
+                 AutoEvalColumn.average.name,
+             ] + list(AREA_AVG_COLUMN_MAP.values())
+
+             general_hidden_cols = [task.name for task in Tasks] + [
+                 AutoEvalColumn.model_type.name,
+                 AutoEvalColumn.architecture.name,
+                 AutoEvalColumn.weight_type.name,
+                 AutoEvalColumn.precision.name,
+                 AutoEvalColumn.license.name,
+                 AutoEvalColumn.params.name,
+                 AutoEvalColumn.likes.name,
+                 AutoEvalColumn.still_on_hub.name,
+                 AutoEvalColumn.revision.name
+             ]
+
+             create_leaderboard_component(
+                 LEADERBOARD_DF,
+                 displayed_cols=general_cols_to_display,
+                 hidden_cols=general_hidden_cols,
+                 title="Benchmark Geral"
+             )
+
+         tab_index = 1
+         for area_name, tasks_in_area in AREA_DEFINITIONS.items():
+             with gr.TabItem(f"🎓 {area_name}", id=tab_index):
+                 area_cols_to_display = [
+                     AutoEvalColumn.model_type_symbol.name,
+                     AutoEvalColumn.model.name,
+                 ] + [task.name for task in tasks_in_area]
+
+                 area_hidden_cols = list(AREA_AVG_COLUMN_MAP.values()) + [
+                     task.name for task in Tasks if task not in tasks_in_area
+                 ] + [
+                     AutoEvalColumn.model_type.name,
+                     AutoEvalColumn.architecture.name,
+                     AutoEvalColumn.weight_type.name,
+                     AutoEvalColumn.precision.name,
+                     AutoEvalColumn.license.name,
+                     AutoEvalColumn.params.name,
+                     AutoEvalColumn.likes.name,
+                     AutoEvalColumn.still_on_hub.name,
+                     AutoEvalColumn.revision.name
+                 ]
+
+                 create_leaderboard_component(
+                     LEADERBOARD_DF,
+                     displayed_cols=area_cols_to_display,
+                     hidden_cols=[col for col in area_hidden_cols if col != AutoEvalColumn.average.name],
+                     title=area_name
+                 )
+             tab_index += 1
+
+         with gr.TabItem("🚀 Submit aqui!", id=tab_index):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Avaliações Concluídas ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Fila de Avaliação em Execução ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Fila de Avaliação Pendente ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submeta seu modelo aqui!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Nome do Modelo")
+                     revision_name_textbox = gr.Textbox(label="Commit da Revisão", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Tipo do Modelo",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precisão",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Tipo dos Pesos",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Modelo Base (para pesos delta ou adapter)")
+
+             submit_button = gr.Button("Submeter Avaliação")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citação", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()
leaderboard_funcionamento.txt ADDED
@@ -0,0 +1,72 @@
+ # How the Model Evaluation Leaderboard Works
+
+ ## Overview
+ This leaderboard is a web application built with Gradio for evaluating, comparing, and submitting language models against specific benchmarks. The system is hosted on HuggingFace Spaces and offers an interactive interface for viewing model evaluation results across different tasks.
+
+ ## Application Structure
+
+ ### Main Tabs
+ 1. **🏅 LLM Benchmark** - Main tab displaying the ranking table of evaluated models
+ 2. **📝 About** - Information about the leaderboard, its methodology, and how it works
+ 3. **🚀 Submit here!** - Interface for users to submit their own models for evaluation
+
+ ### Leaderboard Filtering Features
+ The ranking table offers the following filtering options:
+
+ - **Selectable Columns** - Choose which metrics and information to display
+ - **Model Type Filters** - Filter by model category:
+   - 🟢 Pretrained
+   - 🔶 Fine-tuned
+   - ⭕ Instruction-tuned
+   - 🟦 RL-tuned (tuned via reinforcement learning)
+
+ - **Precision Filters** - Filter by weight format:
+   - float16
+   - bfloat16
+
+ - **Parameter Filter** - Slider for filtering by parameter count (0.01B - 150B)
+ - **Availability Filter** - Option to hide deleted or incomplete models
+ - **Model/License Search** - Text search field for finding specific models
+
+ ## Metrics and Benchmarks
+ The leaderboard evaluates models on specific benchmarks:
+ - ANLI (Adversarial Natural Language Inference)
+ - LogiQA (logical reasoning)
+
+ The final score is computed as the mean of the results across all evaluated tasks.
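+
+ A minimal sketch of that average, with hypothetical scores:
+
+ ```python
+ # the overall score is the plain mean of the per-task scores
+ task_scores = {"ANLI": 0.71, "LogiQA": 0.64}  # illustrative values
+ media_geral = sum(task_scores.values()) / len(task_scores)
+ print(f"{media_geral:.3f}")  # 0.675
+ ```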
+
+ ## Submission System
+ Users can submit their models for evaluation through the submission form, which includes (a sketch of the resulting request record follows this list):
+
+ 1. **Model Information:**
+    - Model name (in organization/model format)
+    - Specific revision/commit
+    - Model type (pretrained, fine-tuned, etc.)
+    - Precision (float16, bfloat16)
+    - Weight type (Original, Adapter, Delta)
+    - Base model (for delta or adapter weights)
+
+ 2. **Evaluation Queues:**
+    - ✅ Finished Evaluations
+    - 🔄 Running Evaluations
+    - ⏳ Pending Evaluations
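+
+ A hypothetical sketch of the request record assembled from the form fields above (the actual field names live in `src/submission/submit.py`):
+
+ ```python
+ # illustrative shape only -- built from the submission form fields
+ eval_request = {
+     "model": "organization/model",
+     "revision": "main",
+     "model_type": "pretrained",
+     "precision": "float16",
+     "weight_type": "Original",
+     "base_model": "",          # only for delta/adapter weights
+     "status": "PENDING",       # then RUNNING, then FINISHED
+ }
+ ```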
+
+ ## Submission Requirements
+ Submitted models must:
+ 1. Be loadable through the Hugging Face Auto classes (see the sketch after this list)
+ 2. Preferably use the safetensors format for weight storage
+ 3. Have an open license
+ 4. Have a properly filled-in model card
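+
+ A minimal sketch of the loadability check (the real logic lives in `src/submission/check_validity.py`):
+
+ ```python
+ from transformers import AutoConfig
+
+ def is_loadable(model_name: str, revision: str = "main") -> bool:
+     """True if the model config resolves via the HF Auto classes."""
+     try:
+         AutoConfig.from_pretrained(model_name, revision=revision)
+         return True
+     except Exception:
+         return False
+ ```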
+
+ ## Backend and Storage
+ The leaderboard uses:
+ - HuggingFace repositories to store evaluation results and requests
+ - HuggingFace datasets to manage the evaluation queues
+ - A periodic refresh mechanism to keep the data up to date
+
+ ## Technical Details
+ - Implemented with Gradio for the interface
+ - Uses pandas for data handling and display
+ - The specialized gradio_leaderboard component renders the table
+ - The interface restarts automatically every 30 minutes
+ - HF token authentication for managing the repositories
output/leaderboard_data_20250413_002202.csv ADDED
@@ -0,0 +1,13 @@
+ T,Modelo,Tipo,Arquitetura,Tipo de Peso,Precisão,Licença,#Params (B),Hub Likes,Disponível no hub,SHA do modelo,Média Geral,Área Médica,Área do Direito,Provas Militares,Computação,Discurso de Ódio,Economia e Contabilidade,Semântica e Inferência,Multidisciplinar,Revalida,MREX,OAB,ENAM,AFA,ITA,IME,POSCOMP,OBI,HateBR,PT Hate Speech,tweetSentBR,BCB,CFCES,FAQUAD NLI,ASSIN2 RTE,ASSIN2 STS,ENEM,BLUEX,CNPU,ENADE,BNDES,CACD (1ª fase),CACD (2ª fase),Datasets Área Médica,Datasets Área do Direito,Datasets Provas Militares,Datasets Computação,Datasets Discurso de Ódio,Datasets Economia e Contabilidade,Datasets Semântica e Inferência,Datasets Multidisciplinar
+ PT,openai/gpt2-portuguese,PT : pré-treinado,,Original,float16,MIT,0.12,268,True,42b7792,0.7105925230941055,0.6188847305300255,0.6701955871546674,0.5883600439376051,0.7344674503873334,0.7475962540883628,0.849576998841669,0.7788317408159661,0.7090867005579059,0.6035585626832006,0.6342108983768503,0.6592469269015914,0.6811442474077435,0.5748457759326684,0.5677492084978396,0.6224851473823073,0.7001717151455216,0.7687631856291454,0.7670186275780199,0.7059318581743916,0.7698382765126768,0.894231731105497,0.8049222665778408,0.7802498017547961,0.744894322166147,0.811351098526955,0.6944001027581704,0.7122388959938982,0.710017915718404,0.7325868004943001,0.6521346635913674,0.7253374839411425,0.7368910414080588,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,rufimelo/bert-large-portuguese-cased,PT : pré-treinado,,Original,bfloat16,MIT,0.34,96,True,b1f4531,0.7663174449695491,0.7602091078571434,0.973711984625266,0.9684507475761012,0.6871687003745621,0.8875668825625945,0.680483320210085,0.7734089183287836,0.6143132582475183,0.8009095654537163,0.7195086502605705,0.99,0.957423969250532,0.9253522427283033,0.99,0.99,0.7048405210681232,0.669496879681001,0.884580080804517,0.8725119298929598,0.9056086369903068,0.6574021331921888,0.7035645072279809,0.7812621953082739,0.7898121521378834,0.7491524075401934,0.6587752839863983,0.6568440899210536,0.5762508105837758,0.5817111877078663,0.6622223097972831,0.6028837938647416,0.5615053318715086,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,unicamp-dl/mbert-portuguese-lener,FT : fine-tuned,,Original,float16,Apache-2.0,0.11,89,True,a764b32,0.5238981255673918,0.46910271433534434,0.6478804077379579,0.4764435027720609,0.6348258377007074,0.5679570688955583,0.545621864602512,0.672668649364131,0.4039265288241587,0.4501996677234772,0.4880057609472114,0.6284122869968649,0.6673485284790508,0.4831702853633705,0.4947379598495716,0.4514222631032407,0.5966991470611256,0.6729525283402894,0.5739315630212218,0.5841033582970794,0.5458362853683737,0.5869583065081572,0.5042854226968669,0.6817845836938061,0.7232768140967074,0.6129445503018796,0.3861249155636754,0.3784890609935717,0.4222874061015168,0.3739044853763201,0.4392823188665487,0.4131461427810349,0.41425137208644314,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ RL,brasileira/llama-2-7b-pt,RL : RL-tuned,,Original,bfloat16,LLAMA 2,7.0,562,True,c24dd37,0.7190013380059581,0.99,0.6277621780492408,0.6710016620169995,0.8100443731646181,0.7199915914170014,0.8883611805594716,0.6228298935870216,0.6546038867904668,0.99,0.99,0.6037868088474728,0.651737547251009,0.7314858380256698,0.6415017399609673,0.6400174080643614,0.8332219046699918,0.7868668416592445,0.7390035975198623,0.737371682844131,0.6835994938870111,0.920605738500256,0.8561166226186872,0.6392000772840843,0.6328464029942036,0.5964432004827769,0.6811458854492797,0.7087276670361912,0.5953130091726074,0.6393706576325692,0.6943992143544933,0.6474751086467531,0.6157956652413729,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,neuralmind/bert-base-portuguese-cased,PT : pré-treinado,,Original,float16,MIT,0.11,153,True,main,0.7903320522382491,0.7525068067882859,0.790212164999971,0.5993185824512812,0.8281524838360852,0.8274517991360355,0.6240401489565062,0.7446158736697582,0.9234267541121511,0.731003703849237,0.7740099097273346,0.7285398327624035,0.8518844972375387,0.566841753054601,0.591417134883207,0.6396968594160356,0.8389798308453639,0.8173251368268065,0.9010121557045129,0.8171834652635162,0.7641597764400775,0.6784864262708603,0.5695938716421521,0.7647064598634123,0.7170997425103216,0.7520414186355404,0.9543505337931829,0.9315956953962545,0.8767306588367528,0.9090563746651121,0.9204153379699338,0.9424590333325819,0.9293796447912399,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,tulioandrade/mistral-7b-pt-adapter,IFT : instruction-tuned,,Adapter,bfloat16,Apache-2.0,7.2,315,True,main,0.7029740896600845,0.5349382711992174,0.6692960782120554,0.5175172583982789,0.6031745463143081,0.5873322504231425,0.6299480416296037,0.9302316794025772,0.8416315303513803,0.5461004402959746,0.5237761021024602,0.6390972174687846,0.6994949389553262,0.5231758805454357,0.5044300377533704,0.5249458568960303,0.6106598540949609,0.5956892385336552,0.5658410795439047,0.5897717901156058,0.6063838816099171,0.6203659080880489,0.6395301751711585,0.9446092137517881,0.9014725977356242,0.9446132267203191,0.8016106913774061,0.8798065244263629,0.8270460513988156,0.886454918928402,0.7981520314515067,0.8865213636168642,0.8118291312603048,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,PetroNLP/xlm-roberta-large-portuguese-instruct,IFT : instruction-tuned,,Original,bfloat16,Apache-2.0,0.56,173,True,8a67c19,0.6169427195732342,0.5533574226765212,0.4822753750865628,0.7312559564874549,0.8708286748117062,0.7547818164225487,0.7539473601359346,0.4607262875406695,0.5207879175691644,0.5183490470430046,0.588365798310038,0.47998532940130334,0.4845654207718223,0.6801757920914254,0.7463653840531308,0.7672266933178084,0.891268718690119,0.8503886309332934,0.7692787720488455,0.6844533933473784,0.8106132838714223,0.7078948291402046,0.7999998911316645,0.5057436202874445,0.45824193776195243,0.4181933045726116,0.5408312211672078,0.48116084507473167,0.500552102499582,0.5444194567570633,0.5195728015292668,0.5294734338791449,0.5295055620771545,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pucpr/biobertpt-bio,FT : fine-tuned,,Original,float16,CC-BY-SA-4.0,0.11,47,True,ab2d4b9,0.6108657601152154,0.702988363094768,0.6872008974588979,0.7613994162478939,0.5591628508904611,0.5708983915370802,0.6267255976277573,0.5200744268291007,0.5645008743970255,0.7228867322736113,0.6830899939159248,0.6957087204307917,0.678693074487004,0.8375613035639826,0.7426540780752489,0.70398286710445,0.5558048959754675,0.5625208058054548,0.5322247419674755,0.5780859026067205,0.6023845300370447,0.6241177598442514,0.6293334354112632,0.5662521904860494,0.49510038883517177,0.49887070116608073,0.6043572001868638,0.6034587214831894,0.5179199478458284,0.5128404234538338,0.5440709445935807,0.6134068257246169,0.5554520574912651,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,ai-forever/gpt-pequeno-pt,IFT : instruction-tuned,,Original,float16,MIT,1.3,409,True,main,0.75196128994191,0.8670933989159613,0.8348443783260642,0.7223937364955204,0.5604037920638382,0.7773722062427244,0.8326558639935904,0.8139561288452296,0.7022728396080725,0.937260492607467,0.7969263052244557,0.8233168492437696,0.8463719074083589,0.7605339427733732,0.7120343105252683,0.6946129561879192,0.5626662966116917,0.5581412875159847,0.8322948812545272,0.7351328585423854,0.7646888789312607,0.7818810820254256,0.8834306459617549,0.8026792301025911,0.8560128858812226,0.7831762705518751,0.6721405607195572,0.6766199465199311,0.7002599041017649,0.7020562275544762,0.7412478076487755,0.7257566032842915,0.6978288274277119,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,saramago/roberta-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.13,112,True,main,0.9312283237428517,0.6179539103982901,0.99,0.9578333048317907,0.9130800909989987,0.8935172820223088,0.9548532555906575,0.9745700292057768,0.9885633439532743,0.6180836055639037,0.6178242152326763,0.99,0.99,0.9647166009011852,0.9449254684441978,0.9638578451499892,0.9581953587133843,0.8679648232846131,0.9161393680272095,0.8602959374608022,0.9041165405789148,0.9449285867327771,0.9647779244485377,0.99,0.99,0.9437100876173303,0.9799434076729195,0.99,0.99,0.99,0.99,0.99,0.99,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pierreguillou/bert-base-brpt-clinical,FT : fine-tuned,,Original,float16,MIT,0.11,73,True,c7bef2a,0.5482271789287327,0.4041051952506239,0.6969643618569595,0.6095811315978034,0.6695079353203791,0.6304290306986099,0.4762540627043566,0.5236257225581472,0.48184021849446684,0.37785374915437375,0.43035664134687407,0.6946451229482293,0.69928360076569,0.6208503603101987,0.5800215298462423,0.6278715046369693,0.7053490347771434,0.6336668358636147,0.6278205554878714,0.6237519629728817,0.6397145736350766,0.49769596156757845,0.4548121638411348,0.49858536577398616,0.537775323953586,0.5345164779468695,0.4616904995777591,0.46535002756586913,0.49571926994757104,0.4725184614551914,0.4771470703267811,0.48038288044301486,0.5200733201450809,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,nlp-wyldlab/deberta-v3-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.18,128,True,main,0.7465021220826226,0.5946564630491511,0.6710791004110548,0.8314281273352276,0.6823194837016078,0.7755501949635453,0.8795739094780679,0.7241149481320521,0.7525018864872434,0.5792849969048782,0.6100279291934239,0.6985800323961385,0.6435781684259713,0.7914821771702616,0.8881410293755269,0.8146611754598944,0.6519856198221405,0.712653347581075,0.8175383934276091,0.7687241725532503,0.7403880189097766,0.8527339923251589,0.906413826630977,0.6930474129852162,0.6893290656821874,0.7899683657287526,0.7604902451021204,0.815325076521949,0.6935039827485028,0.7749131065853034,0.6890146652904909,0.7499769597205909,0.7842891694417463,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
output/leaderboard_data_20250413_002339.csv ADDED
@@ -0,0 +1,13 @@
+ T,Modelo,Tipo,Arquitetura,Tipo de Peso,Precisão,Licença,#Params (B),Hub Likes,Disponível no hub,SHA do modelo,Média Geral,Área Médica,Área do Direito,Provas Militares,Computação,Discurso de Ódio,Economia e Contabilidade,Semântica e Inferência,Multidisciplinar,Revalida,MREX,OAB,ENAM,AFA,ITA,IME,POSCOMP,OBI,HateBR,PT Hate Speech,tweetSentBR,BCB,CFCES,FAQUAD NLI,ASSIN2 RTE,ASSIN2 STS,ENEM,BLUEX,CNPU,ENADE,BNDES,CACD (1ª fase),CACD (2ª fase),Datasets Área Médica,Datasets Área do Direito,Datasets Provas Militares,Datasets Computação,Datasets Discurso de Ódio,Datasets Economia e Contabilidade,Datasets Semântica e Inferência,Datasets Multidisciplinar
+ PT,openai/gpt2-portuguese,PT : pré-treinado,,Original,float16,MIT,0.12,268,True,42b7792,0.7105925230941055,0.6188847305300255,0.6701955871546674,0.5883600439376051,0.7344674503873334,0.7475962540883628,0.849576998841669,0.7788317408159661,0.7090867005579059,0.6035585626832006,0.6342108983768503,0.6592469269015914,0.6811442474077435,0.5748457759326684,0.5677492084978396,0.6224851473823073,0.7001717151455216,0.7687631856291454,0.7670186275780199,0.7059318581743916,0.7698382765126768,0.894231731105497,0.8049222665778408,0.7802498017547961,0.744894322166147,0.811351098526955,0.6944001027581704,0.7122388959938982,0.710017915718404,0.7325868004943001,0.6521346635913674,0.7253374839411425,0.7368910414080588,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,rufimelo/bert-large-portuguese-cased,PT : pré-treinado,,Original,bfloat16,MIT,0.34,96,True,b1f4531,0.7663174449695491,0.7602091078571434,0.973711984625266,0.9684507475761012,0.6871687003745621,0.8875668825625945,0.680483320210085,0.7734089183287836,0.6143132582475183,0.8009095654537163,0.7195086502605705,0.99,0.957423969250532,0.9253522427283033,0.99,0.99,0.7048405210681232,0.669496879681001,0.884580080804517,0.8725119298929598,0.9056086369903068,0.6574021331921888,0.7035645072279809,0.7812621953082739,0.7898121521378834,0.7491524075401934,0.6587752839863983,0.6568440899210536,0.5762508105837758,0.5817111877078663,0.6622223097972831,0.6028837938647416,0.5615053318715086,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,unicamp-dl/mbert-portuguese-lener,FT : fine-tuned,,Original,float16,Apache-2.0,0.11,89,True,a764b32,0.5238981255673918,0.46910271433534434,0.6478804077379579,0.4764435027720609,0.6348258377007074,0.5679570688955583,0.545621864602512,0.672668649364131,0.4039265288241587,0.4501996677234772,0.4880057609472114,0.6284122869968649,0.6673485284790508,0.4831702853633705,0.4947379598495716,0.4514222631032407,0.5966991470611256,0.6729525283402894,0.5739315630212218,0.5841033582970794,0.5458362853683737,0.5869583065081572,0.5042854226968669,0.6817845836938061,0.7232768140967074,0.6129445503018796,0.3861249155636754,0.3784890609935717,0.4222874061015168,0.3739044853763201,0.4392823188665487,0.4131461427810349,0.41425137208644314,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ RL,brasileira/llama-2-7b-pt,RL : RL-tuned,,Original,bfloat16,LLAMA 2,7.0,562,True,c24dd37,0.7190013380059581,0.99,0.6277621780492408,0.6710016620169995,0.8100443731646181,0.7199915914170014,0.8883611805594716,0.6228298935870216,0.6546038867904668,0.99,0.99,0.6037868088474728,0.651737547251009,0.7314858380256698,0.6415017399609673,0.6400174080643614,0.8332219046699918,0.7868668416592445,0.7390035975198623,0.737371682844131,0.6835994938870111,0.920605738500256,0.8561166226186872,0.6392000772840843,0.6328464029942036,0.5964432004827769,0.6811458854492797,0.7087276670361912,0.5953130091726074,0.6393706576325692,0.6943992143544933,0.6474751086467531,0.6157956652413729,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,neuralmind/bert-base-portuguese-cased,PT : pré-treinado,,Original,float16,MIT,0.11,153,True,main,0.7903320522382491,0.7525068067882859,0.790212164999971,0.5993185824512812,0.8281524838360852,0.8274517991360355,0.6240401489565062,0.7446158736697582,0.9234267541121511,0.731003703849237,0.7740099097273346,0.7285398327624035,0.8518844972375387,0.566841753054601,0.591417134883207,0.6396968594160356,0.8389798308453639,0.8173251368268065,0.9010121557045129,0.8171834652635162,0.7641597764400775,0.6784864262708603,0.5695938716421521,0.7647064598634123,0.7170997425103216,0.7520414186355404,0.9543505337931829,0.9315956953962545,0.8767306588367528,0.9090563746651121,0.9204153379699338,0.9424590333325819,0.9293796447912399,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,tulioandrade/mistral-7b-pt-adapter,IFT : instruction-tuned,,Adapter,bfloat16,Apache-2.0,7.2,315,True,main,0.7029740896600845,0.5349382711992174,0.6692960782120554,0.5175172583982789,0.6031745463143081,0.5873322504231425,0.6299480416296037,0.9302316794025772,0.8416315303513803,0.5461004402959746,0.5237761021024602,0.6390972174687846,0.6994949389553262,0.5231758805454357,0.5044300377533704,0.5249458568960303,0.6106598540949609,0.5956892385336552,0.5658410795439047,0.5897717901156058,0.6063838816099171,0.6203659080880489,0.6395301751711585,0.9446092137517881,0.9014725977356242,0.9446132267203191,0.8016106913774061,0.8798065244263629,0.8270460513988156,0.886454918928402,0.7981520314515067,0.8865213636168642,0.8118291312603048,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,PetroNLP/xlm-roberta-large-portuguese-instruct,IFT : instruction-tuned,,Original,bfloat16,Apache-2.0,0.56,173,True,8a67c19,0.6169427195732342,0.5533574226765212,0.4822753750865628,0.7312559564874549,0.8708286748117062,0.7547818164225487,0.7539473601359346,0.4607262875406695,0.5207879175691644,0.5183490470430046,0.588365798310038,0.47998532940130334,0.4845654207718223,0.6801757920914254,0.7463653840531308,0.7672266933178084,0.891268718690119,0.8503886309332934,0.7692787720488455,0.6844533933473784,0.8106132838714223,0.7078948291402046,0.7999998911316645,0.5057436202874445,0.45824193776195243,0.4181933045726116,0.5408312211672078,0.48116084507473167,0.500552102499582,0.5444194567570633,0.5195728015292668,0.5294734338791449,0.5295055620771545,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pucpr/biobertpt-bio,FT : fine-tuned,,Original,float16,CC-BY-SA-4.0,0.11,47,True,ab2d4b9,0.6108657601152154,0.702988363094768,0.6872008974588979,0.7613994162478939,0.5591628508904611,0.5708983915370802,0.6267255976277573,0.5200744268291007,0.5645008743970255,0.7228867322736113,0.6830899939159248,0.6957087204307917,0.678693074487004,0.8375613035639826,0.7426540780752489,0.70398286710445,0.5558048959754675,0.5625208058054548,0.5322247419674755,0.5780859026067205,0.6023845300370447,0.6241177598442514,0.6293334354112632,0.5662521904860494,0.49510038883517177,0.49887070116608073,0.6043572001868638,0.6034587214831894,0.5179199478458284,0.5128404234538338,0.5440709445935807,0.6134068257246169,0.5554520574912651,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ IFT,ai-forever/gpt-pequeno-pt,IFT : instruction-tuned,,Original,float16,MIT,1.3,409,True,main,0.75196128994191,0.8670933989159613,0.8348443783260642,0.7223937364955204,0.5604037920638382,0.7773722062427244,0.8326558639935904,0.8139561288452296,0.7022728396080725,0.937260492607467,0.7969263052244557,0.8233168492437696,0.8463719074083589,0.7605339427733732,0.7120343105252683,0.6946129561879192,0.5626662966116917,0.5581412875159847,0.8322948812545272,0.7351328585423854,0.7646888789312607,0.7818810820254256,0.8834306459617549,0.8026792301025911,0.8560128858812226,0.7831762705518751,0.6721405607195572,0.6766199465199311,0.7002599041017649,0.7020562275544762,0.7412478076487755,0.7257566032842915,0.6978288274277119,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,saramago/roberta-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.13,112,True,main,0.9312283237428517,0.6179539103982901,0.99,0.9578333048317907,0.9130800909989987,0.8935172820223088,0.9548532555906575,0.9745700292057768,0.9885633439532743,0.6180836055639037,0.6178242152326763,0.99,0.99,0.9647166009011852,0.9449254684441978,0.9638578451499892,0.9581953587133843,0.8679648232846131,0.9161393680272095,0.8602959374608022,0.9041165405789148,0.9449285867327771,0.9647779244485377,0.99,0.99,0.9437100876173303,0.9799434076729195,0.99,0.99,0.99,0.99,0.99,0.99,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ FT,pierreguillou/bert-base-brpt-clinical,FT : fine-tuned,,Original,float16,MIT,0.11,73,True,c7bef2a,0.5482271789287327,0.4041051952506239,0.6969643618569595,0.6095811315978034,0.6695079353203791,0.6304290306986099,0.4762540627043566,0.5236257225581472,0.48184021849446684,0.37785374915437375,0.43035664134687407,0.6946451229482293,0.69928360076569,0.6208503603101987,0.5800215298462423,0.6278715046369693,0.7053490347771434,0.6336668358636147,0.6278205554878714,0.6237519629728817,0.6397145736350766,0.49769596156757845,0.4548121638411348,0.49858536577398616,0.537775323953586,0.5345164779468695,0.4616904995777591,0.46535002756586913,0.49571926994757104,0.4725184614551914,0.4771470703267811,0.48038288044301486,0.5200733201450809,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
+ PT,nlp-wyldlab/deberta-v3-base-portuguese,PT : pré-treinado,,Original,float16,MIT,0.18,128,True,main,0.7465021220826226,0.5946564630491511,0.6710791004110548,0.8314281273352276,0.6823194837016078,0.7755501949635453,0.8795739094780679,0.7241149481320521,0.7525018864872434,0.5792849969048782,0.6100279291934239,0.6985800323961385,0.6435781684259713,0.7914821771702616,0.8881410293755269,0.8146611754598944,0.6519856198221405,0.712653347581075,0.8175383934276091,0.7687241725532503,0.7403880189097766,0.8527339923251589,0.906413826630977,0.6930474129852162,0.6893290656821874,0.7899683657287526,0.7604902451021204,0.815325076521949,0.6935039827485028,0.7749131065853034,0.6890146652904909,0.7499769597205909,0.7842891694417463,"Revalida, MREX","OAB, ENAM","AFA, ITA, IME","POSCOMP, OBI","HateBR, PT Hate Speech, tweetSentBR","BCB, CFCES","FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS","ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1ª fase), CACD (2ª fase)"
output/leaderboard_data_20250413_002339.json ADDED
@@ -0,0 +1,650 @@
+ [
+   {
+     "T":"PT",
+     "Modelo":"openai\/gpt2-portuguese",
+     "Tipo":"PT : pr\u00e9-treinado",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"MIT",
+     "#Params (B)":0.12,
+     "Hub Likes":268,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"42b7792",
+     "M\u00e9dia Geral":0.7105925231,
+     "\u00c1rea M\u00e9dica":0.6188847305,
+     "\u00c1rea do Direito":0.6701955872,
+     "Provas Militares":0.5883600439,
+     "Computa\u00e7\u00e3o":0.7344674504,
+     "Discurso de \u00d3dio":0.7475962541,
+     "Economia e Contabilidade":0.8495769988,
+     "Sem\u00e2ntica e Infer\u00eancia":0.7788317408,
+     "Multidisciplinar":0.7090867006,
+     "Revalida":0.6035585627,
+     "MREX":0.6342108984,
+     "OAB":0.6592469269,
+     "ENAM":0.6811442474,
+     "AFA":0.5748457759,
+     "ITA":0.5677492085,
+     "IME":0.6224851474,
+     "POSCOMP":0.7001717151,
+     "OBI":0.7687631856,
+     "HateBR":0.7670186276,
+     "PT Hate Speech":0.7059318582,
+     "tweetSentBR":0.7698382765,
+     "BCB":0.8942317311,
+     "CFCES":0.8049222666,
+     "FAQUAD NLI":0.7802498018,
+     "ASSIN2 RTE":0.7448943222,
+     "ASSIN2 STS":0.8113510985,
+     "ENEM":0.6944001028,
+     "BLUEX":0.712238896,
+     "CNPU":0.7100179157,
+     "ENADE":0.7325868005,
+     "BNDES":0.6521346636,
+     "CACD (1\u00aa fase)":0.7253374839,
+     "CACD (2\u00aa fase)":0.7368910414,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"PT",
+     "Modelo":"rufimelo\/bert-large-portuguese-cased",
+     "Tipo":"PT : pr\u00e9-treinado",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"MIT",
+     "#Params (B)":0.34,
+     "Hub Likes":96,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"b1f4531",
+     "M\u00e9dia Geral":0.766317445,
+     "\u00c1rea M\u00e9dica":0.7602091079,
+     "\u00c1rea do Direito":0.9737119846,
+     "Provas Militares":0.9684507476,
+     "Computa\u00e7\u00e3o":0.6871687004,
+     "Discurso de \u00d3dio":0.8875668826,
+     "Economia e Contabilidade":0.6804833202,
+     "Sem\u00e2ntica e Infer\u00eancia":0.7734089183,
+     "Multidisciplinar":0.6143132582,
+     "Revalida":0.8009095655,
+     "MREX":0.7195086503,
+     "OAB":0.99,
+     "ENAM":0.9574239693,
+     "AFA":0.9253522427,
+     "ITA":0.99,
+     "IME":0.99,
+     "POSCOMP":0.7048405211,
+     "OBI":0.6694968797,
+     "HateBR":0.8845800808,
+     "PT Hate Speech":0.8725119299,
+     "tweetSentBR":0.905608637,
+     "BCB":0.6574021332,
+     "CFCES":0.7035645072,
+     "FAQUAD NLI":0.7812621953,
+     "ASSIN2 RTE":0.7898121521,
+     "ASSIN2 STS":0.7491524075,
+     "ENEM":0.658775284,
+     "BLUEX":0.6568440899,
+     "CNPU":0.5762508106,
+     "ENADE":0.5817111877,
+     "BNDES":0.6622223098,
+     "CACD (1\u00aa fase)":0.6028837939,
+     "CACD (2\u00aa fase)":0.5615053319,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"FT",
+     "Modelo":"unicamp-dl\/mbert-portuguese-lener",
+     "Tipo":"FT : fine-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"Apache-2.0",
+     "#Params (B)":0.11,
+     "Hub Likes":89,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"a764b32",
+     "M\u00e9dia Geral":0.5238981256,
+     "\u00c1rea M\u00e9dica":0.4691027143,
+     "\u00c1rea do Direito":0.6478804077,
+     "Provas Militares":0.4764435028,
+     "Computa\u00e7\u00e3o":0.6348258377,
+     "Discurso de \u00d3dio":0.5679570689,
+     "Economia e Contabilidade":0.5456218646,
+     "Sem\u00e2ntica e Infer\u00eancia":0.6726686494,
+     "Multidisciplinar":0.4039265288,
+     "Revalida":0.4501996677,
+     "MREX":0.4880057609,
+     "OAB":0.628412287,
+     "ENAM":0.6673485285,
+     "AFA":0.4831702854,
+     "ITA":0.4947379598,
+     "IME":0.4514222631,
+     "POSCOMP":0.5966991471,
+     "OBI":0.6729525283,
+     "HateBR":0.573931563,
+     "PT Hate Speech":0.5841033583,
+     "tweetSentBR":0.5458362854,
+     "BCB":0.5869583065,
+     "CFCES":0.5042854227,
+     "FAQUAD NLI":0.6817845837,
+     "ASSIN2 RTE":0.7232768141,
+     "ASSIN2 STS":0.6129445503,
+     "ENEM":0.3861249156,
+     "BLUEX":0.378489061,
+     "CNPU":0.4222874061,
+     "ENADE":0.3739044854,
+     "BNDES":0.4392823189,
+     "CACD (1\u00aa fase)":0.4131461428,
+     "CACD (2\u00aa fase)":0.4142513721,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"RL",
+     "Modelo":"brasileira\/llama-2-7b-pt",
+     "Tipo":"RL : RL-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"LLAMA 2",
+     "#Params (B)":7.0,
+     "Hub Likes":562,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"c24dd37",
+     "M\u00e9dia Geral":0.719001338,
+     "\u00c1rea M\u00e9dica":0.99,
+     "\u00c1rea do Direito":0.627762178,
+     "Provas Militares":0.671001662,
+     "Computa\u00e7\u00e3o":0.8100443732,
+     "Discurso de \u00d3dio":0.7199915914,
+     "Economia e Contabilidade":0.8883611806,
+     "Sem\u00e2ntica e Infer\u00eancia":0.6228298936,
+     "Multidisciplinar":0.6546038868,
+     "Revalida":0.99,
+     "MREX":0.99,
+     "OAB":0.6037868088,
+     "ENAM":0.6517375473,
+     "AFA":0.731485838,
+     "ITA":0.64150174,
+     "IME":0.6400174081,
+     "POSCOMP":0.8332219047,
+     "OBI":0.7868668417,
+     "HateBR":0.7390035975,
+     "PT Hate Speech":0.7373716828,
+     "tweetSentBR":0.6835994939,
+     "BCB":0.9206057385,
+     "CFCES":0.8561166226,
+     "FAQUAD NLI":0.6392000773,
+     "ASSIN2 RTE":0.632846403,
+     "ASSIN2 STS":0.5964432005,
+     "ENEM":0.6811458854,
+     "BLUEX":0.708727667,
+     "CNPU":0.5953130092,
+     "ENADE":0.6393706576,
+     "BNDES":0.6943992144,
+     "CACD (1\u00aa fase)":0.6474751086,
+     "CACD (2\u00aa fase)":0.6157956652,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"PT",
+     "Modelo":"neuralmind\/bert-base-portuguese-cased",
+     "Tipo":"PT : pr\u00e9-treinado",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"MIT",
+     "#Params (B)":0.11,
+     "Hub Likes":153,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"main",
+     "M\u00e9dia Geral":0.7903320522,
+     "\u00c1rea M\u00e9dica":0.7525068068,
+     "\u00c1rea do Direito":0.790212165,
+     "Provas Militares":0.5993185825,
+     "Computa\u00e7\u00e3o":0.8281524838,
+     "Discurso de \u00d3dio":0.8274517991,
+     "Economia e Contabilidade":0.624040149,
+     "Sem\u00e2ntica e Infer\u00eancia":0.7446158737,
+     "Multidisciplinar":0.9234267541,
+     "Revalida":0.7310037038,
+     "MREX":0.7740099097,
+     "OAB":0.7285398328,
+     "ENAM":0.8518844972,
+     "AFA":0.5668417531,
+     "ITA":0.5914171349,
+     "IME":0.6396968594,
+     "POSCOMP":0.8389798308,
+     "OBI":0.8173251368,
+     "HateBR":0.9010121557,
+     "PT Hate Speech":0.8171834653,
+     "tweetSentBR":0.7641597764,
+     "BCB":0.6784864263,
+     "CFCES":0.5695938716,
+     "FAQUAD NLI":0.7647064599,
+     "ASSIN2 RTE":0.7170997425,
+     "ASSIN2 STS":0.7520414186,
+     "ENEM":0.9543505338,
+     "BLUEX":0.9315956954,
+     "CNPU":0.8767306588,
+     "ENADE":0.9090563747,
+     "BNDES":0.920415338,
+     "CACD (1\u00aa fase)":0.9424590333,
+     "CACD (2\u00aa fase)":0.9293796448,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"IFT",
+     "Modelo":"tulioandrade\/mistral-7b-pt-adapter",
+     "Tipo":"IFT : instruction-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Adapter",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"Apache-2.0",
+     "#Params (B)":7.2,
+     "Hub Likes":315,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"main",
+     "M\u00e9dia Geral":0.7029740897,
+     "\u00c1rea M\u00e9dica":0.5349382712,
+     "\u00c1rea do Direito":0.6692960782,
+     "Provas Militares":0.5175172584,
+     "Computa\u00e7\u00e3o":0.6031745463,
+     "Discurso de \u00d3dio":0.5873322504,
+     "Economia e Contabilidade":0.6299480416,
+     "Sem\u00e2ntica e Infer\u00eancia":0.9302316794,
+     "Multidisciplinar":0.8416315304,
+     "Revalida":0.5461004403,
+     "MREX":0.5237761021,
+     "OAB":0.6390972175,
+     "ENAM":0.699494939,
+     "AFA":0.5231758805,
+     "ITA":0.5044300378,
+     "IME":0.5249458569,
+     "POSCOMP":0.6106598541,
+     "OBI":0.5956892385,
+     "HateBR":0.5658410795,
+     "PT Hate Speech":0.5897717901,
+     "tweetSentBR":0.6063838816,
+     "BCB":0.6203659081,
+     "CFCES":0.6395301752,
+     "FAQUAD NLI":0.9446092138,
+     "ASSIN2 RTE":0.9014725977,
+     "ASSIN2 STS":0.9446132267,
+     "ENEM":0.8016106914,
+     "BLUEX":0.8798065244,
+     "CNPU":0.8270460514,
+     "ENADE":0.8864549189,
+     "BNDES":0.7981520315,
+     "CACD (1\u00aa fase)":0.8865213636,
+     "CACD (2\u00aa fase)":0.8118291313,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"IFT",
+     "Modelo":"PetroNLP\/xlm-roberta-large-portuguese-instruct",
+     "Tipo":"IFT : instruction-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"bfloat16",
+     "Licen\u00e7a":"Apache-2.0",
+     "#Params (B)":0.56,
+     "Hub Likes":173,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"8a67c19",
+     "M\u00e9dia Geral":0.6169427196,
+     "\u00c1rea M\u00e9dica":0.5533574227,
+     "\u00c1rea do Direito":0.4822753751,
+     "Provas Militares":0.7312559565,
+     "Computa\u00e7\u00e3o":0.8708286748,
+     "Discurso de \u00d3dio":0.7547818164,
+     "Economia e Contabilidade":0.7539473601,
+     "Sem\u00e2ntica e Infer\u00eancia":0.4607262875,
+     "Multidisciplinar":0.5207879176,
+     "Revalida":0.518349047,
+     "MREX":0.5883657983,
+     "OAB":0.4799853294,
+     "ENAM":0.4845654208,
+     "AFA":0.6801757921,
+     "ITA":0.7463653841,
+     "IME":0.7672266933,
+     "POSCOMP":0.8912687187,
+     "OBI":0.8503886309,
+     "HateBR":0.769278772,
+     "PT Hate Speech":0.6844533933,
+     "tweetSentBR":0.8106132839,
+     "BCB":0.7078948291,
+     "CFCES":0.7999998911,
+     "FAQUAD NLI":0.5057436203,
+     "ASSIN2 RTE":0.4582419378,
+     "ASSIN2 STS":0.4181933046,
+     "ENEM":0.5408312212,
+     "BLUEX":0.4811608451,
+     "CNPU":0.5005521025,
+     "ENADE":0.5444194568,
+     "BNDES":0.5195728015,
+     "CACD (1\u00aa fase)":0.5294734339,
+     "CACD (2\u00aa fase)":0.5295055621,
+     "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
+     "Datasets \u00c1rea do Direito":"OAB, ENAM",
+     "Datasets Provas Militares":"AFA, ITA, IME",
+     "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
+     "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
+     "Datasets Economia e Contabilidade":"BCB, CFCES",
+     "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
+     "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
+   },
+   {
+     "T":"FT",
+     "Modelo":"pucpr\/biobertpt-bio",
+     "Tipo":"FT : fine-tuned",
+     "Arquitetura":"",
+     "Tipo de Peso":"Original",
+     "Precis\u00e3o":"float16",
+     "Licen\u00e7a":"CC-BY-SA-4.0",
+     "#Params (B)":0.11,
+     "Hub Likes":47,
+     "Dispon\u00edvel no hub":true,
+     "SHA do modelo":"ab2d4b9",
+     "M\u00e9dia Geral":0.6108657601,
+     "\u00c1rea M\u00e9dica":0.7029883631,
+     "\u00c1rea do Direito":0.6872008975,
+     "Provas Militares":0.7613994162,
+     "Computa\u00e7\u00e3o":0.5591628509,
+     "Discurso de \u00d3dio":0.5708983915,
+     "Economia e Contabilidade":0.6267255976,
+     "Sem\u00e2ntica e Infer\u00eancia":0.5200744268,
+     "Multidisciplinar":0.5645008744,
+     "Revalida":0.7228867323,
+     "MREX":0.6830899939,
+     "OAB":0.6957087204,
+     "ENAM":0.6786930745,
+     "AFA":0.8375613036,
+     "ITA":0.7426540781,
+     "IME":0.7039828671,
+     "POSCOMP":0.555804896,
+     "OBI":0.5625208058,
+     "HateBR":0.532224742,
+     "PT Hate Speech":0.5780859026,
+     "tweetSentBR":0.60238453,
413
+ "BCB":0.6241177598,
414
+ "CFCES":0.6293334354,
415
+ "FAQUAD NLI":0.5662521905,
416
+ "ASSIN2 RTE":0.4951003888,
417
+ "ASSIN2 STS":0.4988707012,
418
+ "ENEM":0.6043572002,
419
+ "BLUEX":0.6034587215,
420
+ "CNPU":0.5179199478,
421
+ "ENADE":0.5128404235,
422
+ "BNDES":0.5440709446,
423
+ "CACD (1\u00aa fase)":0.6134068257,
424
+ "CACD (2\u00aa fase)":0.5554520575,
425
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
426
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
427
+ "Datasets Provas Militares":"AFA, ITA, IME",
428
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
429
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
430
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
431
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
432
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
433
+ },
434
+ {
435
+ "T":"IFT",
436
+ "Modelo":"ai-forever\/gpt-pequeno-pt",
437
+ "Tipo":"IFT : instruction-tuned",
438
+ "Arquitetura":"",
439
+ "Tipo de Peso":"Original",
440
+ "Precis\u00e3o":"float16",
441
+ "Licen\u00e7a":"MIT",
442
+ "#Params (B)":1.3,
443
+ "Hub Likes":409,
444
+ "Dispon\u00edvel no hub":true,
445
+ "SHA do modelo":"main",
446
+ "M\u00e9dia Geral":0.7519612899,
447
+ "\u00c1rea M\u00e9dica":0.8670933989,
448
+ "\u00c1rea do Direito":0.8348443783,
449
+ "Provas Militares":0.7223937365,
450
+ "Computa\u00e7\u00e3o":0.5604037921,
451
+ "Discurso de \u00d3dio":0.7773722062,
452
+ "Economia e Contabilidade":0.832655864,
453
+ "Sem\u00e2ntica e Infer\u00eancia":0.8139561288,
454
+ "Multidisciplinar":0.7022728396,
455
+ "Revalida":0.9372604926,
456
+ "MREX":0.7969263052,
457
+ "OAB":0.8233168492,
458
+ "ENAM":0.8463719074,
459
+ "AFA":0.7605339428,
460
+ "ITA":0.7120343105,
461
+ "IME":0.6946129562,
462
+ "POSCOMP":0.5626662966,
463
+ "OBI":0.5581412875,
464
+ "HateBR":0.8322948813,
465
+ "PT Hate Speech":0.7351328585,
466
+ "tweetSentBR":0.7646888789,
467
+ "BCB":0.781881082,
468
+ "CFCES":0.883430646,
469
+ "FAQUAD NLI":0.8026792301,
470
+ "ASSIN2 RTE":0.8560128859,
471
+ "ASSIN2 STS":0.7831762706,
472
+ "ENEM":0.6721405607,
473
+ "BLUEX":0.6766199465,
474
+ "CNPU":0.7002599041,
475
+ "ENADE":0.7020562276,
476
+ "BNDES":0.7412478076,
477
+ "CACD (1\u00aa fase)":0.7257566033,
478
+ "CACD (2\u00aa fase)":0.6978288274,
479
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
480
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
481
+ "Datasets Provas Militares":"AFA, ITA, IME",
482
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
483
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
484
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
485
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
486
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
487
+ },
488
+ {
489
+ "T":"PT",
490
+ "Modelo":"saramago\/roberta-base-portuguese",
491
+ "Tipo":"PT : pr\u00e9-treinado",
492
+ "Arquitetura":"",
493
+ "Tipo de Peso":"Original",
494
+ "Precis\u00e3o":"float16",
495
+ "Licen\u00e7a":"MIT",
496
+ "#Params (B)":0.13,
497
+ "Hub Likes":112,
498
+ "Dispon\u00edvel no hub":true,
499
+ "SHA do modelo":"main",
500
+ "M\u00e9dia Geral":0.9312283237,
501
+ "\u00c1rea M\u00e9dica":0.6179539104,
502
+ "\u00c1rea do Direito":0.99,
503
+ "Provas Militares":0.9578333048,
504
+ "Computa\u00e7\u00e3o":0.913080091,
505
+ "Discurso de \u00d3dio":0.893517282,
506
+ "Economia e Contabilidade":0.9548532556,
507
+ "Sem\u00e2ntica e Infer\u00eancia":0.9745700292,
508
+ "Multidisciplinar":0.988563344,
509
+ "Revalida":0.6180836056,
510
+ "MREX":0.6178242152,
511
+ "OAB":0.99,
512
+ "ENAM":0.99,
513
+ "AFA":0.9647166009,
514
+ "ITA":0.9449254684,
515
+ "IME":0.9638578451,
516
+ "POSCOMP":0.9581953587,
517
+ "OBI":0.8679648233,
518
+ "HateBR":0.916139368,
519
+ "PT Hate Speech":0.8602959375,
520
+ "tweetSentBR":0.9041165406,
521
+ "BCB":0.9449285867,
522
+ "CFCES":0.9647779244,
523
+ "FAQUAD NLI":0.99,
524
+ "ASSIN2 RTE":0.99,
525
+ "ASSIN2 STS":0.9437100876,
526
+ "ENEM":0.9799434077,
527
+ "BLUEX":0.99,
528
+ "CNPU":0.99,
529
+ "ENADE":0.99,
530
+ "BNDES":0.99,
531
+ "CACD (1\u00aa fase)":0.99,
532
+ "CACD (2\u00aa fase)":0.99,
533
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
534
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
535
+ "Datasets Provas Militares":"AFA, ITA, IME",
536
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
537
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
538
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
539
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
540
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
541
+ },
542
+ {
543
+ "T":"FT",
544
+ "Modelo":"pierreguillou\/bert-base-brpt-clinical",
545
+ "Tipo":"FT : fine-tuned",
546
+ "Arquitetura":"",
547
+ "Tipo de Peso":"Original",
548
+ "Precis\u00e3o":"float16",
549
+ "Licen\u00e7a":"MIT",
550
+ "#Params (B)":0.11,
551
+ "Hub Likes":73,
552
+ "Dispon\u00edvel no hub":true,
553
+ "SHA do modelo":"c7bef2a",
554
+ "M\u00e9dia Geral":0.5482271789,
555
+ "\u00c1rea M\u00e9dica":0.4041051953,
556
+ "\u00c1rea do Direito":0.6969643619,
557
+ "Provas Militares":0.6095811316,
558
+ "Computa\u00e7\u00e3o":0.6695079353,
559
+ "Discurso de \u00d3dio":0.6304290307,
560
+ "Economia e Contabilidade":0.4762540627,
561
+ "Sem\u00e2ntica e Infer\u00eancia":0.5236257226,
562
+ "Multidisciplinar":0.4818402185,
563
+ "Revalida":0.3778537492,
564
+ "MREX":0.4303566413,
565
+ "OAB":0.6946451229,
566
+ "ENAM":0.6992836008,
567
+ "AFA":0.6208503603,
568
+ "ITA":0.5800215298,
569
+ "IME":0.6278715046,
570
+ "POSCOMP":0.7053490348,
571
+ "OBI":0.6336668359,
572
+ "HateBR":0.6278205555,
573
+ "PT Hate Speech":0.623751963,
574
+ "tweetSentBR":0.6397145736,
575
+ "BCB":0.4976959616,
576
+ "CFCES":0.4548121638,
577
+ "FAQUAD NLI":0.4985853658,
578
+ "ASSIN2 RTE":0.537775324,
579
+ "ASSIN2 STS":0.5345164779,
580
+ "ENEM":0.4616904996,
581
+ "BLUEX":0.4653500276,
582
+ "CNPU":0.4957192699,
583
+ "ENADE":0.4725184615,
584
+ "BNDES":0.4771470703,
585
+ "CACD (1\u00aa fase)":0.4803828804,
586
+ "CACD (2\u00aa fase)":0.5200733201,
587
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
588
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
589
+ "Datasets Provas Militares":"AFA, ITA, IME",
590
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
591
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
592
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
593
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
594
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
595
+ },
596
+ {
597
+ "T":"PT",
598
+ "Modelo":"nlp-wyldlab\/deberta-v3-base-portuguese",
599
+ "Tipo":"PT : pr\u00e9-treinado",
600
+ "Arquitetura":"",
601
+ "Tipo de Peso":"Original",
602
+ "Precis\u00e3o":"float16",
603
+ "Licen\u00e7a":"MIT",
604
+ "#Params (B)":0.18,
605
+ "Hub Likes":128,
606
+ "Dispon\u00edvel no hub":true,
607
+ "SHA do modelo":"main",
608
+ "M\u00e9dia Geral":0.7465021221,
609
+ "\u00c1rea M\u00e9dica":0.594656463,
610
+ "\u00c1rea do Direito":0.6710791004,
611
+ "Provas Militares":0.8314281273,
612
+ "Computa\u00e7\u00e3o":0.6823194837,
613
+ "Discurso de \u00d3dio":0.775550195,
614
+ "Economia e Contabilidade":0.8795739095,
615
+ "Sem\u00e2ntica e Infer\u00eancia":0.7241149481,
616
+ "Multidisciplinar":0.7525018865,
617
+ "Revalida":0.5792849969,
618
+ "MREX":0.6100279292,
619
+ "OAB":0.6985800324,
620
+ "ENAM":0.6435781684,
621
+ "AFA":0.7914821772,
622
+ "ITA":0.8881410294,
623
+ "IME":0.8146611755,
624
+ "POSCOMP":0.6519856198,
625
+ "OBI":0.7126533476,
626
+ "HateBR":0.8175383934,
627
+ "PT Hate Speech":0.7687241726,
628
+ "tweetSentBR":0.7403880189,
629
+ "BCB":0.8527339923,
630
+ "CFCES":0.9064138266,
631
+ "FAQUAD NLI":0.693047413,
632
+ "ASSIN2 RTE":0.6893290657,
633
+ "ASSIN2 STS":0.7899683657,
634
+ "ENEM":0.7604902451,
635
+ "BLUEX":0.8153250765,
636
+ "CNPU":0.6935039827,
637
+ "ENADE":0.7749131066,
638
+ "BNDES":0.6890146653,
639
+ "CACD (1\u00aa fase)":0.7499769597,
640
+ "CACD (2\u00aa fase)":0.7842891694,
641
+ "Datasets \u00c1rea M\u00e9dica":"Revalida, MREX",
642
+ "Datasets \u00c1rea do Direito":"OAB, ENAM",
643
+ "Datasets Provas Militares":"AFA, ITA, IME",
644
+ "Datasets Computa\u00e7\u00e3o":"POSCOMP, OBI",
645
+ "Datasets Discurso de \u00d3dio":"HateBR, PT Hate Speech, tweetSentBR",
646
+ "Datasets Economia e Contabilidade":"BCB, CFCES",
647
+ "Datasets Sem\u00e2ntica e Infer\u00eancia":"FAQUAD NLI, ASSIN2 RTE, ASSIN2 STS",
648
+ "Datasets Multidisciplinar":"ENEM, BLUEX, CNPU, ENADE, BNDES, CACD (1\u00aa fase), CACD (2\u00aa fase)"
649
+ }
650
+ ]
output/leaderboard_data_20250413_002339.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a27686dc08775fc43f0b92365c1affd11dfc08026a41a926b2a0c3c22739807d
3
+ size 7463
output/leaderboard_data_20250413_002339.xlsx ADDED
Binary file (12.3 kB). View file
 
output/leaderboard_info_20250413_002339.txt ADDED
@@ -0,0 +1,81 @@
1
+ DataFrame Shape: (12, 52)
2
+
3
+ Colunas:
4
+ - T
5
+ - Modelo
6
+ - Tipo
7
+ - Arquitetura
8
+ - Tipo de Peso
9
+ - Precisão
10
+ - Licença
11
+ - #Params (B)
12
+ - Hub Likes
13
+ - Disponível no hub
14
+ - SHA do modelo
15
+ - Média Geral
16
+ - Área Médica
17
+ - Área do Direito
18
+ - Provas Militares
19
+ - Computação
20
+ - Discurso de Ódio
21
+ - Economia e Contabilidade
22
+ - Semântica e Inferência
23
+ - Multidisciplinar
24
+ - Revalida
25
+ - MREX
26
+ - OAB
27
+ - ENAM
28
+ - AFA
29
+ - ITA
30
+ - IME
31
+ - POSCOMP
32
+ - OBI
33
+ - HateBR
34
+ - PT Hate Speech
35
+ - tweetSentBR
36
+ - BCB
37
+ - CFCES
38
+ - FAQUAD NLI
39
+ - ASSIN2 RTE
40
+ - ASSIN2 STS
41
+ - ENEM
42
+ - BLUEX
43
+ - CNPU
44
+ - ENADE
45
+ - BNDES
46
+ - CACD (1ª fase)
47
+ - CACD (2ª fase)
48
+ - Datasets Área Médica
49
+ - Datasets Área do Direito
50
+ - Datasets Provas Militares
51
+ - Datasets Computação
52
+ - Datasets Discurso de Ódio
53
+ - Datasets Economia e Contabilidade
54
+ - Datasets Semântica e Inferência
55
+ - Datasets Multidisciplinar
56
+
57
+ Informações por área:
58
+
59
+ Área Médica:
60
+ - Datasets: ['Revalida', 'MREX']
61
+
62
+ Área do Direito:
63
+ - Datasets: ['OAB', 'ENAM']
64
+
65
+ Provas Militares:
66
+ - Datasets: ['AFA', 'ITA', 'IME']
67
+
68
+ Computação:
69
+ - Datasets: ['POSCOMP', 'OBI']
70
+
71
+ Discurso de Ódio:
72
+ - Datasets: ['HateBR', 'PT Hate Speech', 'tweetSentBR']
73
+
74
+ Economia e Contabilidade:
75
+ - Datasets: ['BCB', 'CFCES']
76
+
77
+ Semântica e Inferência:
78
+ - Datasets: ['FAQUAD NLI', 'ASSIN2 RTE', 'ASSIN2 STS']
79
+
80
+ Multidisciplinar:
81
+ - Datasets: ['ENEM', 'BLUEX', 'CNPU', 'ENADE', 'BNDES', 'CACD (1ª fase)', 'CACD (2ª fase)']
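
The info file above records the exported DataFrame's shape, (12, 52), and the area-to-dataset mapping. As a quick sanity check, the exported pickle could be inspected as follows — a minimal sketch, assuming the pickle stores the DataFrame directly (the file path is the one added in this commit):

```python
# Minimal sketch: load the exported pickle and cross-check one area average.
import pandas as pd

df = pd.read_pickle("output/leaderboard_data_20250413_002339.pkl")
print(df.shape)  # expected: (12, 52)

# Recompute "Área Médica" from its dataset columns (Revalida, MREX).
recomputed = df[["Revalida", "MREX"]].mean(axis=1)
print((recomputed - df["Área Médica"]).abs().max())  # ~0 if the export is consistent
```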
pyproject.toml ADDED
@@ -0,0 +1,13 @@
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt CHANGED
@@ -1,18 +1,16 @@
1
- APScheduler>=3.10.0
2
  black
3
  datasets
4
- gradio>=3.50.0
5
  gradio[oauth]
6
- gradio-leaderboard>=0.0.13
7
  gradio_client
8
- huggingface_hub>=0.19.0
9
  matplotlib
10
- numpy>=1.24.0
11
- pandas>=2.0.0
12
- python-dateutil>=2.8.2
13
- streamlit>=1.31.0
14
  tqdm
15
- transformers>=4.30.0
16
  tokenizers>=0.15.0
17
- sentencepiece
18
- safetensors>=0.4.0
 
1
+ APScheduler
2
  black
3
  datasets
4
+ gradio
5
  gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
  gradio_client
8
+ huggingface-hub>=0.18.0
9
  matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
 
13
  tqdm
14
+ transformers
15
  tokenizers>=0.15.0
16
+ sentencepiece
 
src/about.py ADDED
@@ -0,0 +1,103 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ col_name: str
9
+
10
+
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
+ class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ # Área Médica
16
+ REVALIDA = Task("revalida", "acc", "Revalida")
17
+ MREX = Task("mrex", "acc", "MREX")
18
+ # Área do Direito
19
+ OAB = Task("oab", "acc", "OAB")
20
+ ENAM = Task("enam", "acc", "ENAM")
21
+ # Provas Militares
22
+ AFA = Task("afa", "acc", "AFA")
23
+ ITA = Task("ita", "acc", "ITA")
24
+ IME = Task("ime", "acc", "IME")
25
+ # Computação
26
+ POSCOMP = Task("poscomp", "acc", "POSCOMP")
27
+ OBI = Task("obi", "acc", "OBI")
28
+ # Discurso de Ódio
29
+ HATEBR = Task("hatebr", "acc", "HateBR")
30
+ PT_HATE_SPEECH = Task("pt_hate_speech", "acc", "PT Hate Speech")
31
+ TWEETSENTBR = Task("tweetsentbr", "acc", "tweetSentBR")
32
+ # Economia e Contabilidade
33
+ BCB = Task("bcb", "acc", "BCB")
34
+ CFCES = Task("cfces", "acc", "CFCES")
35
+ # Compreensão de Semântica e Inferência Textual
36
+ FAQUAD_NLI = Task("faquad_nli", "acc", "FAQUAD NLI")
37
+ ASSIN2_RTE = Task("assin2_rte", "acc", "ASSIN2 RTE")
38
+ ASSIN2_STS = Task("assin2_sts", "acc", "ASSIN2 STS")
39
+ # Provas de Conhecimento Multidisciplinar
40
+ ENEM = Task("enem", "acc", "ENEM")
41
+ BLUEX = Task("bluex", "acc", "BLUEX")
42
+ CNPU = Task("cnpu", "acc", "CNPU")
43
+ ENADE = Task("enade", "acc", "ENADE")
44
+ BNDES = Task("bndes", "acc", "BNDES")
45
+ CACD_1 = Task("cacd_1", "acc", "CACD (1ª fase)")
46
+ CACD_2 = Task("cacd_2", "acc", "CACD (2ª fase)")
47
+
48
+
49
+ NUM_FEWSHOT = 0 # Change with your few shot
50
+ # ---------------------------------------------------
51
+
52
+
53
+
54
+ # Your leaderboard name
55
+ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
56
+
57
+ # What does your leaderboard evaluate?
58
+ INTRODUCTION_TEXT = """
59
+ Intro text
60
+ """
61
+
62
+ # Which evaluations are you running? how can people reproduce what you have?
63
+ LLM_BENCHMARKS_TEXT = f"""
64
+ ## How it works
65
+
66
+ ## Reproducibility
67
+ To reproduce our results, here are the commands you can run:
68
+
69
+ """
70
+
71
+ EVALUATION_QUEUE_TEXT = """
72
+ ## Some good practices before submitting a model
73
+
74
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
75
+ ```python
76
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
77
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
78
+ model = AutoModel.from_pretrained("your model name", revision=revision)
79
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
80
+ ```
81
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
82
+
83
+ Note: make sure your model is public!
84
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it; stay posted!
85
+
86
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
87
+ It's a newer format for storing weights that is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
88
+
89
+ ### 3) Make sure your model has an open license!
90
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
91
+
92
+ ### 4) Fill up your model card
93
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
94
+
95
+ ## In case of model failure
96
+ If your model is displayed in the `FAILED` category, its execution stopped.
97
+ Make sure you have followed the above steps first.
98
+ If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
99
+ """
100
+
101
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
102
+ CITATION_BUTTON_TEXT = r"""
103
+ """
src/about.pyZone.Identifier ADDED
File without changes
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
1
+ custom_css = """
2
+
3
+ .markdown-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ #citation-button span {
12
+ font-size: 16px !important;
13
+ }
14
+
15
+ #citation-button textarea {
16
+ font-size: 16px !important;
17
+ }
18
+
19
+ #citation-button > label > button {
20
+ margin: 6px;
21
+ transform: scale(1.3);
22
+ }
23
+
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
+ }
31
+
32
+ #search-bar-table-box > div:first-child {
33
+ background: none;
34
+ border: none;
35
+ }
36
+
37
+ #search-bar {
38
+ padding: 0px;
39
+ }
40
+
41
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ #leaderboard-table td:nth-child(2),
43
+ #leaderboard-table th:nth-child(2) {
44
+ max-width: 400px;
45
+ overflow: auto;
46
+ white-space: nowrap;
47
+ }
48
+
49
+ .tab-buttons button {
50
+ font-size: 20px;
51
+ }
52
+
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
+ }
61
+
62
+ #scale-logo .download {
63
+ display: none;
64
+ }
65
+ #filter_type{
66
+ border: 0;
67
+ padding-left: 0;
68
+ padding-top: 0;
69
+ }
70
+ #filter_type label {
71
+ display: flex;
72
+ }
73
+ #filter_type label > span{
74
+ margin-top: var(--spacing-lg);
75
+ margin-right: 0.5em;
76
+ }
77
+ #filter_type label > .wrap{
78
+ width: 103px;
79
+ }
80
+ #filter_type label > .wrap .wrap-inner{
81
+ padding: 2px;
82
+ }
83
+ #filter_type label > .wrap .wrap-inner input{
84
+ width: 1px
85
+ }
86
+ #filter-columns-type{
87
+ border:0;
88
+ padding:0.5;
89
+ }
90
+ #filter-columns-size{
91
+ border:0;
92
+ padding:0.5;
93
+ }
94
+ #box-filter > .form{
95
+ border: 0
96
+ }
97
+ """
98
+
99
+ get_window_url_params = """
100
+ function(url_params) {
101
+ const params = new URLSearchParams(window.location.search);
102
+ url_params = Object.fromEntries(params);
103
+ return url_params;
104
+ }
105
+ """
src/display/css_html_js.pyZone.Identifier ADDED
File without changes
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
1
+ def model_hyperlink(link, model_name):
2
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
+
4
+
5
+ def make_clickable_model(model_name):
6
+ link = f"https://huggingface.co/{model_name}"
7
+ return model_hyperlink(link, model_name)
8
+
9
+
10
+ def styled_error(error):
11
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
+
13
+
14
+ def styled_warning(warn):
15
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
+
17
+
18
+ def styled_message(message):
19
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
+
21
+
22
+ def has_no_nan_values(df, columns):
23
+ return df[columns].notna().all(axis=1)
24
+
25
+
26
+ def has_nan_values(df, columns):
27
+ return df[columns].isna().any(axis=1)
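
These helpers wrap leaderboard values in HTML for Gradio's markdown columns. A short usage sketch, illustrative and not part of the module itself:

```python
# Illustrative usage of the formatting helpers defined above.
from src.display.formatting import make_clickable_model, styled_error

html = make_clickable_model("neuralmind/bert-base-portuguese-cased")
# html is an <a> tag pointing at https://huggingface.co/neuralmind/bert-base-portuguese-cased
print(styled_error("Model not found"))  # red, centered error paragraph
```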
src/display/formatting.pyZone.Identifier ADDED
File without changes
src/display/utils.py ADDED
@@ -0,0 +1,145 @@
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
+ def fields(raw_class):
9
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
+ @dataclass
16
+ class ColumnContent:
17
+ name: str
18
+ type: str
19
+ displayed_by_default: bool
20
+ hidden: bool = False
21
+ never_hidden: bool = False
22
+
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
26
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
+ #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Média Geral ⬆️", "number", True)])
30
+
31
+ # Add columns for the per-area averages
32
+ auto_eval_column_dict.append(["area_medica_avg", ColumnContent, ColumnContent("Área Médica", "number", True)])
33
+ auto_eval_column_dict.append(["area_direito_avg", ColumnContent, ColumnContent("Área do Direito", "number", True)])
34
+ auto_eval_column_dict.append(["provas_militares_avg", ColumnContent, ColumnContent("Provas Militares", "number", True)])
35
+ auto_eval_column_dict.append(["computacao_avg", ColumnContent, ColumnContent("Computação", "number", True)])
36
+ auto_eval_column_dict.append(["discurso_odio_avg", ColumnContent, ColumnContent("Discurso de Ódio", "number", True)])
37
+ auto_eval_column_dict.append(["economia_contabilidade_avg", ColumnContent, ColumnContent("Economia e Contabilidade", "number", True)])
38
+ auto_eval_column_dict.append(["semantica_inferencia_avg", ColumnContent, ColumnContent("Semântica e Inferência", "number", True)])
39
+ auto_eval_column_dict.append(["multidisciplinar_avg", ColumnContent, ColumnContent("Multidisciplinar", "number", True)])
40
+
41
+ for task in Tasks:
42
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False)]) # Set to False so individual tasks are hidden by default on the main tab
43
+ # Model information
44
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
45
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
46
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
47
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
48
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
49
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
50
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
51
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
52
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
53
+
54
+ # We use make dataclass to dynamically fill the scores from Tasks
55
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
56
+
57
+ # Mapping from knowledge areas to their corresponding Tasks
58
+ AREA_DEFINITIONS = {
59
+ "Área Médica": [Tasks.REVALIDA, Tasks.MREX],
60
+ "Área do Direito": [Tasks.OAB, Tasks.ENAM],
61
+ "Provas Militares": [Tasks.AFA, Tasks.ITA, Tasks.IME],
62
+ "Computação": [Tasks.POSCOMP, Tasks.OBI],
63
+ "Discurso de Ódio": [Tasks.HATEBR, Tasks.PT_HATE_SPEECH, Tasks.TWEETSENTBR],
64
+ "Economia e Contabilidade": [Tasks.BCB, Tasks.CFCES],
65
+ "Semântica e Inferência": [Tasks.FAQUAD_NLI, Tasks.ASSIN2_RTE, Tasks.ASSIN2_STS],
66
+ "Multidisciplinar": [Tasks.ENEM, Tasks.BLUEX, Tasks.CNPU, Tasks.ENADE, Tasks.BNDES, Tasks.CACD_1, Tasks.CACD_2],
67
+ }
68
+
69
+ # Mapping from area names to their corresponding average columns
70
+ AREA_AVG_COLUMN_MAP = {
71
+ "Área Médica": AutoEvalColumn.area_medica_avg.name,
72
+ "Área do Direito": AutoEvalColumn.area_direito_avg.name,
73
+ "Provas Militares": AutoEvalColumn.provas_militares_avg.name,
74
+ "Computação": AutoEvalColumn.computacao_avg.name,
75
+ "Discurso de Ódio": AutoEvalColumn.discurso_odio_avg.name,
76
+ "Economia e Contabilidade": AutoEvalColumn.economia_contabilidade_avg.name,
77
+ "Semântica e Inferência": AutoEvalColumn.semantica_inferencia_avg.name,
78
+ "Multidisciplinar": AutoEvalColumn.multidisciplinar_avg.name,
79
+ }
80
+
81
+ ## For the queue columns in the submission tab
82
+ @dataclass(frozen=True)
83
+ class EvalQueueColumn: # Queue column
84
+ model = ColumnContent("model", "markdown", True)
85
+ revision = ColumnContent("revision", "str", True)
86
+ private = ColumnContent("private", "bool", True)
87
+ precision = ColumnContent("precision", "str", True)
88
+ weight_type = ColumnContent("weight_type", "str", True) # displayed_by_default expects a bool, not the string "Original"
89
+ status = ColumnContent("status", "str", True)
90
+
91
+ ## All the model information that we might need
92
+ @dataclass
93
+ class ModelDetails:
94
+ name: str
95
+ display_name: str = ""
96
+ symbol: str = "" # emoji
97
+
98
+
99
+ class ModelType(Enum):
100
+ PT = ModelDetails(name="pretrained", symbol="🟢")
101
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
102
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
103
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
104
+ Unknown = ModelDetails(name="", symbol="?")
105
+
106
+ def to_str(self, separator=" "):
107
+ return f"{self.value.symbol}{separator}{self.value.name}"
108
+
109
+ @staticmethod
110
+ def from_str(type):
111
+ if "fine-tuned" in type or "🔶" in type:
112
+ return ModelType.FT
113
+ if "pretrained" in type or "🟢" in type:
114
+ return ModelType.PT
115
+ if "RL-tuned" in type or "🟦" in type:
116
+ return ModelType.RL
117
+ if "instruction-tuned" in type or "⭕" in type:
118
+ return ModelType.IFT
119
+ return ModelType.Unknown
120
+
121
+ class WeightType(Enum):
122
+ Adapter = ModelDetails("Adapter")
123
+ Original = ModelDetails("Original")
124
+ Delta = ModelDetails("Delta")
125
+
126
+ class Precision(Enum):
127
+ float16 = ModelDetails("float16")
128
+ bfloat16 = ModelDetails("bfloat16")
129
+ Unknown = ModelDetails("?")
130
+
131
+ @staticmethod
+ def from_str(precision):
132
+ if precision in ["torch.float16", "float16"]:
133
+ return Precision.float16
134
+ if precision in ["torch.bfloat16", "bfloat16"]:
135
+ return Precision.bfloat16
136
+ return Precision.Unknown
137
+
138
+ # Column selection
139
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
140
+
141
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
142
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
143
+
144
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
145
+
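
The two mappings above are what tie individual tasks to their per-area average columns. A small sketch of how they line up, purely illustrative:

```python
# Print which task columns feed each area-average column; illustrative only.
from src.display.utils import AREA_DEFINITIONS, AREA_AVG_COLUMN_MAP

for area, tasks in AREA_DEFINITIONS.items():
    task_cols = [t.value.col_name for t in tasks]  # display names of the task columns
    avg_col = AREA_AVG_COLUMN_MAP[area]            # column that stores the area mean
    print(f"{avg_col} = mean({', '.join(task_cols)})")
```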
src/display/utils.pyZone.Identifier ADDED
File without changes
src/envs.py ADDED
@@ -0,0 +1,25 @@
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
+
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ # ----------------------------------
11
+
12
+ REPO_ID = f"{OWNER}/leaderboard"
13
+ QUEUE_REPO = f"{OWNER}/requests"
14
+ RESULTS_REPO = f"{OWNER}/results"
15
+
16
+ # If you setup a cache later, just change HF_HOME
17
+ CACHE_PATH = os.getenv("HF_HOME", ".")
18
+
19
+ # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
+
25
+ API = HfApi(token=TOKEN)
src/envs.pyZone.Identifier ADDED
File without changes
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,196 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
+ from src.submission.check_validity import is_model_on_hub
13
+
14
+
15
+ @dataclass
16
+ class EvalResult:
17
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
+ """
19
+ eval_name: str # org_model_precision (uid)
20
+ full_model: str # org/model (path on hub)
21
+ org: str
22
+ model: str
23
+ revision: str # commit hash, "" if main
24
+ results: dict
25
+ precision: Precision = Precision.Unknown
26
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
+ weight_type: WeightType = WeightType.Original # Original or Adapter
28
+ architecture: str = "Unknown"
29
+ license: str = "?"
30
+ likes: int = 0
31
+ num_params: int = 0
32
+ date: str = "" # submission date of request file
33
+ still_on_hub: bool = False
34
+
35
+ @classmethod
36
+ def init_from_json_file(cls, json_filepath):
37
+ """Inits the result from the specific model result file"""
38
+ with open(json_filepath) as fp:
39
+ data = json.load(fp)
40
+
41
+ config = data.get("config")
42
+
43
+ # Precision
44
+ precision = Precision.from_str(config.get("model_dtype"))
45
+
46
+ # Get model and org
47
+ org_and_model = config.get("model_name", config.get("model_args", None))
48
+ org_and_model = org_and_model.split("/", 1)
49
+
50
+ if len(org_and_model) == 1:
51
+ org = None
52
+ model = org_and_model[0]
53
+ result_key = f"{model}_{precision.value.name}"
54
+ else:
55
+ org = org_and_model[0]
56
+ model = org_and_model[1]
57
+ result_key = f"{org}_{model}_{precision.value.name}"
58
+ full_model = "/".join(org_and_model)
59
+
60
+ still_on_hub, _, model_config = is_model_on_hub(
61
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
+ )
63
+ architecture = "?"
64
+ if model_config is not None:
65
+ architectures = getattr(model_config, "architectures", None)
66
+ if architectures:
67
+ architecture = ";".join(architectures)
68
+
69
+ # Extract results available in this file (some results are split in several files)
70
+ results = {}
71
+ for task in Tasks:
72
+ task = task.value
73
+
74
+ # We average all scores of a given metric (not all metrics are present in all files)
75
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
+ if accs.size == 0 or any([acc is None for acc in accs]):
77
+ continue
78
+
79
+ mean_acc = np.mean(accs) * 100.0
80
+ results[task.benchmark] = mean_acc
81
+
82
+ return cls(
83
+ eval_name=result_key,
84
+ full_model=full_model,
85
+ org=org,
86
+ model=model,
87
+ results=results,
88
+ precision=precision,
89
+ revision= config.get("model_sha", ""),
90
+ still_on_hub=still_on_hub,
91
+ architecture=architecture
92
+ )
93
+
94
+ def update_with_request_file(self, requests_path):
95
+ """Finds the relevant request file for the current model and updates info with it"""
96
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
+
98
+ try:
99
+ with open(request_file, "r") as f:
100
+ request = json.load(f)
101
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
102
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
103
+ self.license = request.get("license", "?")
104
+ self.likes = request.get("likes", 0)
105
+ self.num_params = request.get("params", 0)
106
+ self.date = request.get("submitted_time", "")
107
+ except Exception:
108
+ print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
+
110
+ def to_dict(self):
111
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
112
+ average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
+ data_dict = {
114
+ "eval_name": self.eval_name, # not a column, just a save name,
115
+ AutoEvalColumn.precision.name: self.precision.value.name,
116
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
117
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
+ AutoEvalColumn.architecture.name: self.architecture,
120
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
+ AutoEvalColumn.revision.name: self.revision,
122
+ AutoEvalColumn.average.name: average,
123
+ AutoEvalColumn.license.name: self.license,
124
+ AutoEvalColumn.likes.name: self.likes,
125
+ AutoEvalColumn.params.name: self.num_params,
126
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
+ }
128
+
129
+ for task in Tasks:
130
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
+
132
+ return data_dict
133
+
134
+
135
+ def get_request_file_for_model(requests_path, model_name, precision):
136
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
+ request_files = os.path.join(
138
+ requests_path,
139
+ f"{model_name}_eval_request_*.json",
140
+ )
141
+ request_files = glob.glob(request_files)
142
+
143
+ # Select correct request file (precision)
144
+ request_file = ""
145
+ request_files = sorted(request_files, reverse=True)
146
+ for tmp_request_file in request_files:
147
+ with open(tmp_request_file, "r") as f:
148
+ req_content = json.load(f)
149
+ if (
150
+ req_content["status"] in ["FINISHED"]
151
+ and req_content["precision"] == precision.split(".")[-1]
152
+ ):
153
+ request_file = tmp_request_file
154
+ return request_file
155
+
156
+
157
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
+ """From the path of the results folder root, extract all needed info for results"""
159
+ model_result_filepaths = []
160
+
161
+ for root, _, files in os.walk(results_path):
162
+ # We should only have json files in model results
163
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
+ continue
165
+
166
+ # Sort the files by date
167
+ try:
168
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
+ except dateutil.parser._parser.ParserError:
170
+ files = [files[-1]]
171
+
172
+ for file in files:
173
+ model_result_filepaths.append(os.path.join(root, file))
174
+
175
+ eval_results = {}
176
+ for model_result_filepath in model_result_filepaths:
177
+ # Creation of result
178
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
+ eval_result.update_with_request_file(requests_path)
180
+
181
+ # Store results of same eval together
182
+ eval_name = eval_result.eval_name
183
+ if eval_name in eval_results.keys():
184
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
+ else:
186
+ eval_results[eval_name] = eval_result
187
+
188
+ results = []
189
+ for v in eval_results.values():
190
+ try:
191
+ v.to_dict() # we test if the dict version is complete
192
+ results.append(v)
193
+ except KeyError: # not all eval values present
194
+ continue
195
+
196
+ return results
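
`init_from_json_file` reads the model identity from `config` and the per-task scores from `results`. A minimal result file it would accept looks like this — the scores are invented, but the field names are taken from the parser above:

```python
# Sketch of the smallest results file the parser above accepts; scores are made up.
import json

example = {
    "config": {
        "model_name": "neuralmind/bert-base-portuguese-cased",
        "model_dtype": "float16",
        "model_sha": "main",
    },
    "results": {
        # keyed by task.benchmark, each holding {task.metric: score}
        "revalida": {"acc": 0.73},
        "oab": {"acc": 0.72},
    },
}
with open("results_example.json", "w") as fp:
    json.dump(example, fp)
# EvalResult.init_from_json_file("results_example.json") would then parse it
# (note: the parser also calls is_model_on_hub, so it needs network access).
```

Note that the parser multiplies each score by 100, so stored accuracies are expected in [0, 1].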
src/leaderboard/read_evals.pyZone.Identifier ADDED
File without changes
src/populate.py ADDED
@@ -0,0 +1,79 @@
1
+ import json
2
+ import os
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, AREA_DEFINITIONS, AREA_AVG_COLUMN_MAP, fields
8
+ from src.leaderboard.read_evals import get_raw_eval_results
9
+ from src.about import Tasks
10
+
11
+
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
13
+ """Creates a dataframe from all the individual experiment results"""
14
+ raw_data = get_raw_eval_results(results_path, requests_path)
15
+ all_data_json = [v.to_dict() for v in raw_data]
16
+
17
+ df = pd.DataFrame.from_records(all_data_json)
18
+
19
+ # Compute the per-area averages
20
+ for area_name, tasks_in_area in AREA_DEFINITIONS.items():
21
+ area_cols = [task.value.col_name for task in tasks_in_area if task.value.col_name in df.columns] # the dataframe columns use the display names (col_name), not the enum member names
22
+ avg_col_name = AREA_AVG_COLUMN_MAP[area_name]
23
+ if area_cols: # Only compute the mean if the area's columns are present in the DataFrame
24
+ df[avg_col_name] = df[area_cols].mean(axis=1)
25
+ else:
26
+ df[avg_col_name] = np.nan # Set to NaN when none of the area's columns are present
27
+
28
+ # Compute the overall average (now based on the per-area means)
29
+ avg_area_cols = list(AREA_AVG_COLUMN_MAP.values())
30
+ df[AutoEvalColumn.average.name] = df[avg_area_cols].mean(axis=1)
31
+
32
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
33
+
34
+ # Select and round the columns
35
+ all_display_cols = [c.name for c in fields(AutoEvalColumn)] # Collect every defined column
36
+ df = df[[col for col in all_display_cols if col in df.columns]] # Keep only the columns present in the df
37
+ df = df.round(decimals=2)
38
+
39
+ # Filter out rows with NaN values in the original benchmark columns (if needed)
40
+ # benchmark_cols = [t.name for t in Tasks] # Uncomment to restore the original filter
41
+ # df = df[has_no_nan_values(df, benchmark_cols)]
42
+
43
+ return df
44
+
45
+
46
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
47
+ """Creates the different dataframes for the evaluation queues requestes"""
48
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
49
+ all_evals = []
50
+
51
+ for entry in entries:
52
+ if ".json" in entry:
53
+ file_path = os.path.join(save_path, entry)
54
+ with open(file_path) as fp:
55
+ data = json.load(fp)
56
+
57
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
58
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
59
+
60
+ all_evals.append(data)
61
+ elif ".md" not in entry:
62
+ # this is a folder
63
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")] # check the full path, not just the bare filename
64
+ for sub_entry in sub_entries:
65
+ file_path = os.path.join(save_path, entry, sub_entry)
66
+ with open(file_path) as fp:
67
+ data = json.load(fp)
68
+
69
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
70
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
71
+ all_evals.append(data)
72
+
73
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
74
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
75
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
76
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
77
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
78
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
79
+ return df_finished[cols], df_running[cols], df_pending[cols]
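
The averaging scheme in `get_leaderboard_df` computes per-area means first and only then averages those means, so every area weighs equally regardless of how many datasets it contains. A toy demonstration with arbitrary numbers:

```python
# Toy demo of the two-level averaging used above; the scores are arbitrary.
import pandas as pd

df = pd.DataFrame({"Revalida": [0.73], "MREX": [0.77], "OAB": [0.72], "ENAM": [0.85]})
df["Área Médica"] = df[["Revalida", "MREX"]].mean(axis=1)                 # 0.7500
df["Área do Direito"] = df[["OAB", "ENAM"]].mean(axis=1)                  # 0.7850
df["Média Geral"] = df[["Área Médica", "Área do Direito"]].mean(axis=1)  # 0.7675
print(df.round(4))
```

A flat mean over the four datasets happens to give 0.7675 here too, but only because both areas have two datasets each; with unequal area sizes the two schemes diverge.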
src/populate.pyZone.Identifier ADDED
File without changes
src/submission/check_validity.py ADDED
@@ -0,0 +1,99 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
14
+ """Checks if the model card and license exist and have been filled"""
15
+ try:
16
+ card = ModelCard.load(repo_id)
17
+ except huggingface_hub.utils.EntryNotFoundError:
18
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
+
20
+ # Enforce license metadata
21
+ if card.data.license is None:
22
+ if not ("license_name" in card.data and "license_link" in card.data):
23
+ return False, (
24
+ "License not found. Please add a license to your model card using the `license` metadata or a"
25
+ " `license_name`/`license_link` pair."
26
+ )
27
+
28
+ # Enforce card content
29
+ if len(card.text) < 200:
30
+ return False, "Please add a description to your model card, it is too short."
31
+
32
+ return True, ""
33
+
34
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, object]:
35
+ """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
+ try:
37
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
+ if test_tokenizer:
39
+ try:
40
+ tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
+ except ValueError as e:
42
+ return (
43
+ False,
44
+ f"uses a tokenizer which is not in a transformers release: {e}",
45
+ None
46
+ )
47
+ except Exception as e:
48
+ return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
+ return True, None, config
50
+
51
+ except ValueError:
52
+ return (
53
+ False,
54
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
+ None
56
+ )
57
+
58
+ except Exception as e:
59
+ return False, "was not found on hub!", None
60
+
61
+
62
+ def get_model_size(model_info: ModelInfo, precision: str):
63
+ """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
+ try:
65
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
+ except (AttributeError, TypeError):
67
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
+
69
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
+ model_size = size_factor * model_size
71
+ return model_size
72
+
73
+ def get_model_arch(model_info: ModelInfo):
74
+ """Gets the model architecture from the configuration"""
75
+ return model_info.config.get("architectures", "Unknown")
76
+
77
+ def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
78
+ """Gather a list of already submitted models to avoid duplicates"""
79
+ depth = 1
80
+ file_names = []
81
+ users_to_submission_dates = defaultdict(list)
82
+
83
+ for root, _, files in os.walk(requested_models_dir):
84
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
+ if current_depth == depth:
86
+ for file in files:
87
+ if not file.endswith(".json"):
88
+ continue
89
+ with open(os.path.join(root, file), "r") as f:
90
+ info = json.load(f)
91
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
+
93
+ # Select organisation
94
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
95
+ continue
96
+ organisation, _ = info["model"].split("/")
97
+ users_to_submission_dates[organisation].append(info["submitted_time"])
98
+
99
+ return set(file_names), users_to_submission_dates
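
Together these helpers gate submissions on Hub presence, loadability, model card content, and size. A hedged usage sketch — the repo id is just an example taken from the leaderboard data above, and both calls hit the Hub:

```python
# Illustrative pre-submission checks using the helpers defined above.
from src.submission.check_validity import check_model_card, is_model_on_hub

on_hub, error, config = is_model_on_hub(
    "neuralmind/bert-base-portuguese-cased", revision="main", test_tokenizer=True
)
if not on_hub:
    print(f"Rejected: model {error}")

card_ok, message = check_model_card("neuralmind/bert-base-portuguese-cased")
if not card_ok:
    print(f"Rejected: {message}")
```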
src/submission/check_validity.pyZone.Identifier ADDED
File without changes
src/submission/submit.py ADDED
@@ -0,0 +1,119 @@
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ from src.submission.check_validity import (
8
+ already_submitted_models,
9
+ check_model_card,
10
+ get_model_size,
11
+ is_model_on_hub,
12
+ )
13
+
14
+ REQUESTED_MODELS = None
15
+ USERS_TO_SUBMISSION_DATES = None
16
+
17
+ def add_new_eval(
18
+ model: str,
19
+ base_model: str,
20
+ revision: str,
21
+ precision: str,
22
+ weight_type: str,
23
+ model_type: str,
24
+ ):
25
+ global REQUESTED_MODELS
26
+ global USERS_TO_SUBMISSION_DATES
27
+ if not REQUESTED_MODELS:
28
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
+
30
+ user_name = ""
31
+ model_path = model
32
+ if "/" in model:
33
+ user_name = model.split("/")[0]
34
+ model_path = model.split("/")[1]
35
+
36
+ precision = precision.split(" ")[0]
37
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
+
39
+ if model_type is None or model_type == "":
40
+ return styled_error("Please select a model type.")
41
+
42
+ # Does the model actually exist?
43
+ if revision == "":
44
+ revision = "main"
45
+
46
+ # Is the model on the hub?
47
+ if weight_type in ["Delta", "Adapter"]:
48
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
+ if not base_model_on_hub:
50
+ return styled_error(f'Base model "{base_model}" {error}')
51
+
52
+ if not weight_type == "Adapter":
53
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
+ if not model_on_hub:
55
+ return styled_error(f'Model "{model}" {error}')
56
+
57
+ # Is the model info correctly filled?
58
+ try:
59
+ model_info = API.model_info(repo_id=model, revision=revision)
60
+ except Exception:
61
+ return styled_error("Could not get your model information. Please fill it up properly.")
62
+
63
+ model_size = get_model_size(model_info=model_info, precision=precision)
64
+
65
+ # Were the model card and license filled?
66
+ try:
67
+ license = model_info.cardData["license"]
68
+ except Exception:
69
+ return styled_error("Please select a license for your model")
70
+
71
+ modelcard_OK, error_msg = check_model_card(model)
72
+ if not modelcard_OK:
73
+ return styled_error(error_msg)
74
+
75
+ # Seems good, creating the eval
76
+ print("Adding new eval")
77
+
78
+ eval_entry = {
79
+ "model": model,
80
+ "base_model": base_model,
81
+ "revision": revision,
82
+ "precision": precision,
83
+ "weight_type": weight_type,
84
+ "status": "PENDING",
85
+ "submitted_time": current_time,
86
+ "model_type": model_type,
87
+ "likes": model_info.likes,
88
+ "params": model_size,
89
+ "license": license,
90
+ "private": False,
91
+ }
92
+
93
+ # Check for duplicate submission
94
+ if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
+ return styled_warning("This model has been already submitted.")
96
+
97
+ print("Creating eval file")
98
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
+ os.makedirs(OUT_DIR, exist_ok=True)
100
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
+
102
+ with open(out_path, "w") as f:
103
+ f.write(json.dumps(eval_entry))
104
+
105
+ print("Uploading eval file")
106
+ API.upload_file(
107
+ path_or_fileobj=out_path,
108
+ path_in_repo=out_path.split("eval-queue/")[1],
109
+ repo_id=QUEUE_REPO,
110
+ repo_type="dataset",
111
+ commit_message=f"Add {model} to eval queue",
112
+ )
113
+
114
+ # Remove the local file
115
+ os.remove(out_path)
116
+
117
+ return styled_message(
118
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
+ )
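
`add_new_eval` validates the submission, writes a `*_eval_request_*.json` file locally, uploads it to the queue dataset repo, and deletes the local copy. A sketch of calling it directly — it requires `HF_TOKEN` and the queue repo from `src/envs.py` to exist, and `"your-org/your-model"` is a placeholder, so treat it as illustrative:

```python
# Illustrative direct call to the submission flow defined above.
from src.submission.submit import add_new_eval

html_status = add_new_eval(
    model="your-org/your-model",   # placeholder repo id
    base_model="",                 # only needed for Delta/Adapter weights
    revision="main",
    precision="float16",
    weight_type="Original",
    model_type="FT : fine-tuned",  # parsed by ModelType.from_str
)
print(html_status)  # styled HTML message (success, warning, or error)
```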
src/submission/submit.pyZone.Identifier ADDED
File without changes