arthrod
/

qwen_cnmoro_baseline

@@ -1,385 +1,397 @@
----
-license: mit
-language:
-- pt
-base_model:
-- Qwen/Qwen2.5-0.5B-Instruct
-pipeline_tag: text-generation
-datasets:
-- adalbertojunior/openHermes_portuguese
-- cnmoro/smoltalk-555k-ptbr
-- cnmoro/RagMixPTBR-Legal-Alpaca-2M
-- adalbertojunior/dolphin-2.9-portuguese
-model-index:
-- name: Qwen2.5-0.5B-Portuguese-v2
-  results:
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: ENEM Challenge (No Images)
-      type: eduagarcia/enem_challenge
-      split: train
-      args:
-        num_few_shot: 3
-    metrics:
-    - type: acc
-      value: 36.81
-      name: accuracy
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: BLUEX (No Images)
-      type: eduagarcia-temp/BLUEX_without_images
-      split: train
-      args:
-        num_few_shot: 3
-    metrics:
-    - type: acc
-      value: 26.84
-      name: accuracy
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: OAB Exams
-      type: eduagarcia/oab_exams
-      split: train
-      args:
-        num_few_shot: 3
-    metrics:
-    - type: acc
-      value: 30.62
-      name: accuracy
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: Assin2 RTE
-      type: assin2
-      split: test
-      args:
-        num_few_shot: 15
-    metrics:
-    - type: f1_macro
-      value: 87.91
-      name: f1-macro
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: Assin2 STS
-      type: eduagarcia/portuguese_benchmark
-      split: test
-      args:
-        num_few_shot: 15
-    metrics:
-    - type: pearson
-      value: 59.01
-      name: pearson
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: FaQuAD NLI
-      type: ruanchaves/faquad-nli
-      split: test
-      args:
-        num_few_shot: 15
-    metrics:
-    - type: f1_macro
-      value: 43.97
-      name: f1-macro
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: HateBR Binary
-      type: ruanchaves/hatebr
-      split: test
-      args:
-        num_few_shot: 25
-    metrics:
-    - type: f1_macro
-      value: 33.62
-      name: f1-macro
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: PT Hate Speech Binary
-      type: hate_speech_portuguese
-      split: test
-      args:
-        num_few_shot: 25
-    metrics:
-    - type: f1_macro
-      value: 41.23
-      name: f1-macro
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
-  - task:
-      type: text-generation
-      name: Text Generation
-    dataset:
-      name: tweetSentBR
-      type: eduagarcia/tweetsentbr_fewshot
-      split: test
-      args:
-        num_few_shot: 25
-    metrics:
-    - type: f1_macro
-      value: 52.33
-      name: f1-macro
-    source:
-      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
-      name: Open Portuguese LLM Leaderboard
----
-Qwen2.5-0.5B fine-tuned for proficiency in the Portuguese language and increased intelligence.
-```text
-https://ollama.com/cnmoro/Qwen2.5-0.5B-Portuguese-v2
-```
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-model_name = "cnmoro/Qwen2.5-0.5B-Portuguese-v2"
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="auto",
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-prompt = "Escreva uma breve introdução sobre LLMs (Large Language Models) e suas aplicações."
-# A hardcoded system prompt is always injected automatically
-# for ideal performance in the Portuguese language,
-# so there is no need to add one to `messages`.
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=512
-)
-generated_ids = [
-    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-]
-response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-response
-# As Large Language Models (LLMs) são sistemas computacionais projetados para produzir
-# linguagem natural com alta precisão e fluência. Eles usam algoritmos avançados para compreender
-# e gerar texto, permitindo-lhes realizar tarefas como tradução de idiomas, geração de conteúdo
-# e processamento de linguagem natural.
-#
-# Os LLMs têm sido amplamente utilizados na área da inteligência artificial e do aprendizado
-# de máquina há vários anos. Alguns dos principais usos de LLMs incluem:
-#
-# 1. Tradução automática: Os LLMs podem traduzir textos entre diferentes idiomas, tornando-os
-# úteis em setores onde a comunicação internacional é crítica, como negócios internacionais,
-# diplomacia ou relações públicas.
-#
-# 2. Geração de conteúdo: os LLMs podem criar conteúdo altamente personalizado e adaptado às
-# necessidades específicas de seus usuários, tornando-os ideais para criação de sites, aplicativos
-# móveis ou plataformas de mídia social.
-#
-# 3. Processamento de Linguagem Natural: Os LLMs podem ser treinados para reconhecer e compreender
-# padrões de linguagem, permitindo-lhes compreender melhor as intenções humanas e responder adequadamente.
-#
-# 4. Análise de sentimento: Os LLMs podem analisar dados de texto e identificar sentimentos, ajudando
-# a entender como as pessoas se sentem em relação a determinadas questões ou questões sociais.
-#
-# No geral, os LLMs estão se tornando cada vez mais importantes à medida que a tecnologia continua a
-# avançar. À medida que continuamos a usar LLMs em nossas vidas diárias, podemos esperar ver ainda
-# mais desenvolvimentos interessantes no futuro.
-```
-## Overall Results
-| Task                      | Metric        | Value   | StdErr  |
-|---------------------------|---------------|---------|---------|
-| ASSIN2 RTE                | F1 Macro      | 0.4486  | 0.0067  |
-| ASSIN2 RTE                | Accuracy      | 0.5560  | 0.0071  |
-| ASSIN2 STS                | Pearson       | 0.4091  | 0.0104  |
-| ASSIN2 STS                | MSE           | 5.6395  | N/A     |
-| BluEX                     | Accuracy      | 0.2503  | 0.0094  |
-| ENEM Challenge            | Accuracy      | 0.3128  | 0.0071  |
-| FAQUAD NLI                | F1 Macro      | 0.4611  | 0.0094  |
-| FAQUAD NLI                | Accuracy      | 0.7877  | 0.0113  |
-| HateBR Offensive (Binary) | F1 Macro      | 0.3439  | 0.0049  |
-| HateBR Offensive (Binary) | Accuracy      | 0.4857  | 0.0095  |
-| OAB Exams                 | Accuracy      | 0.3062  | 0.0057  |
-| Portuguese Hate Speech (Binary) | F1 Macro | 0.4119  | 0.0038  |
-| Portuguese Hate Speech (Binary) | Accuracy | 0.7004  | 0.0111  |
-| TweetSentBR               | F1 Macro      | 0.5055  | 0.0078  |
-| TweetSentBR               | Accuracy      | 0.5697  | 0.0078  |
-## Detailed Results by Task
-### ASSIN2 RTE
-| Metric      | Value   | StdErr  |
-|-------------|---------|---------|
-| F1 Macro    | 0.4486  | 0.0067  |
-| Accuracy    | 0.5560  | 0.0071  |
-### ASSIN2 STS
-| Metric      | Value   | StdErr  |
-|-------------|---------|---------|
-| Pearson     | 0.4091  | 0.0104  |
-| MSE         | 5.6395  | N/A     |
-### BluEX
-| Exam ID           | Metric   | Value   | StdErr  |
-|-------------------|----------|---------|---------|
-| All               | Accuracy | 0.2503  | 0.0094  |
-| USP_2018          | Accuracy | 0.2037  | 0.0315  |
-| UNICAMP_2018      | Accuracy | 0.1852  | 0.0306  |
-| UNICAMP_2021_1    | Accuracy | 0.0870  | 0.0240  |
-| USP_2020          | Accuracy | 0.2143  | 0.0317  |
-| USP_2023          | Accuracy | 0.2045  | 0.0350  |
-| UNICAMP_2019      | Accuracy | 0.2600  | 0.0358  |
-| USP_2019          | Accuracy | 0.1500  | 0.0326  |
-| UNICAMP_2020      | Accuracy | 0.2182  | 0.0321  |
-| UNICAMP_2021_2    | Accuracy | 0.2941  | 0.0367  |
-| UNICAMP_2023      | Accuracy | 0.4186  | 0.0433  |
-| UNICAMP_2024      | Accuracy | 0.3111  | 0.0398  |
-| USP_2024          | Accuracy | 0.2683  | 0.0398  |
-| USP_2021          | Accuracy | 0.3269  | 0.0375  |
-| UNICAMP_2022      | Accuracy | 0.3590  | 0.0444  |
-| USP_2022          | Accuracy | 0.2857  | 0.0370  |
-### ENEM Challenge
-| Exam ID   | Metric   | Value   | StdErr  |
-|-----------|----------|---------|---------|
-| All       | Accuracy | 0.3128  | 0.0071  |
-| 2017      | Accuracy | 0.2845  | 0.0241  |
-| 2016      | Accuracy | 0.2479  | 0.0226  |
-| 2016_2    | Accuracy | 0.2846  | 0.0235  |
-| 2022      | Accuracy | 0.3534  | 0.0240  |
-| 2012      | Accuracy | 0.3362  | 0.0253  |
-| 2011      | Accuracy | 0.3333  | 0.0251  |
-| 2010      | Accuracy | 0.3846  | 0.0260  |
-| 2014      | Accuracy | 0.3211  | 0.0259  |
-| 2009      | Accuracy | 0.2696  | 0.0239  |
-| 2015      | Accuracy | 0.2521  | 0.0229  |
-| 2023      | Accuracy | 0.3481  | 0.0236  |
-| 2013      | Accuracy | 0.3333  | 0.0261  |
-### FAQUAD NLI
-| Metric      | Value   | StdErr  |
-|-------------|---------|---------|
-| F1 Macro    | 0.4611  | 0.0094  |
-| Accuracy    | 0.7877  | 0.0113  |
-### HateBR Offensive (Binary)
-| Metric      | Value   | StdErr  |
-|-------------|---------|---------|
-| F1 Macro    | 0.3439  | 0.0049  |
-| Accuracy    | 0.4857  | 0.0095  |
-### OAB Exams
-| Exam ID     | Metric   | Value   | StdErr  |
-|-------------|----------|---------|---------|
-| All         | Accuracy | 0.3062  | 0.0057  |
-| 2011-05     | Accuracy | 0.3375  | 0.0304  |
-| 2012-06a    | Accuracy | 0.2625  | 0.0285  |
-| 2010-02     | Accuracy | 0.3700  | 0.0279  |
-| 2017-22     | Accuracy | 0.3500  | 0.0309  |
-| 2016-20     | Accuracy | 0.3125  | 0.0300  |
-| 2011-03     | Accuracy | 0.2626  | 0.0255  |
-| 2015-17     | Accuracy | 0.3205  | 0.0304  |
-| 2017-23     | Accuracy | 0.2875  | 0.0292  |
-| 2018-25     | Accuracy | 0.3625  | 0.0311  |
-| 2016-19     | Accuracy | 0.2436  | 0.0281  |
-| 2017-24     | Accuracy | 0.1625  | 0.0238  |
-| 2015-16     | Accuracy | 0.3125  | 0.0300  |
-| 2011-04     | Accuracy | 0.3250  | 0.0301  |
-| 2012-07     | Accuracy | 0.3500  | 0.0307  |
-| 2012-06     | Accuracy | 0.1875  | 0.0253  |
-| 2012-09     | Accuracy | 0.2468  | 0.0284  |
-| 2013-12     | Accuracy | 0.3625  | 0.0311  |
-| 2013-11     | Accuracy | 0.3000  | 0.0295  |
-| 2010-01     | Accuracy | 0.3412  | 0.0296  |
-| 2015-18     | Accuracy | 0.2875  | 0.0292  |
-| 2014-13     | Accuracy | 0.3500  | 0.0308  |
-| 2013-10     | Accuracy | 0.3125  | 0.0300  |
-| 2016-20a    | Accuracy | 0.2500  | 0.0279  |
-| 2014-14     | Accuracy | 0.3125  | 0.0301  |
-| 2012-08     | Accuracy | 0.3000  | 0.0296  |
-| 2016-21     | Accuracy | 0.3375  | 0.0304  |
-| 2014-15     | Accuracy | 0.4103  | 0.0321  |
-### Portuguese Hate Speech (Binary)
-| Metric      | Value   | StdErr  |
-|-------------|---------|---------|
-| F1 Macro    | 0.4119  | 0.0038  |
-| Accuracy    | 0.7004  | 0.0111  |
-### TweetSentBR
-| Metric      | Value   | StdErr  |
-|-------------|---------|---------|
-| F1 Macro    | 0.5055  | 0.0078  |
-| Accuracy    | 0.5697  | 0.0078  |
-# Open Portuguese LLM Leaderboard Evaluation Results
-Detailed results can be found [here](https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/cnmoro/Qwen2.5-0.5B-Portuguese-v2) and on the [🚀 Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
-|          Metric          |  Value  |
-|--------------------------|---------|
-|Average                   |**45.81**|
-|ENEM Challenge (No Images)|    36.81|
-|BLUEX (No Images)         |    26.84|
-|OAB Exams                 |    30.62|
-|Assin2 RTE                |    87.91|
-|Assin2 STS                |    59.01|
-|FaQuAD NLI                |    43.97|
-|HateBR Binary             |    33.62|
-|PT Hate Speech Binary     |    41.23|
-|tweetSentBR               |    52.33|

+---
+license: mit
+language:
+- zho
+- eng
+- fra
+- spa
+- por
+- deu
+- ita
+- rus
+- jpn
+- kor
+- vie
+- tha
+- ara
+base_model:
+- Qwen/Qwen2.5-0.5B-Instruct
+pipeline_tag: text-generation
+datasets:
+- adalbertojunior/openHermes_portuguese
+- cnmoro/smoltalk-555k-ptbr
+- cnmoro/RagMixPTBR-Legal-Alpaca-2M
+- adalbertojunior/dolphin-2.9-portuguese
+model-index:
+- name: Qwen2.5-0.5B-Portuguese-v2
+  results:
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: ENEM Challenge (No Images)
+      type: eduagarcia/enem_challenge
+      split: train
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc
+      value: 36.81
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: BLUEX (No Images)
+      type: eduagarcia-temp/BLUEX_without_images
+      split: train
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc
+      value: 26.84
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: OAB Exams
+      type: eduagarcia/oab_exams
+      split: train
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc
+      value: 30.62
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: Assin2 RTE
+      type: assin2
+      split: test
+      args:
+        num_few_shot: 15
+    metrics:
+    - type: f1_macro
+      value: 87.91
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: Assin2 STS
+      type: eduagarcia/portuguese_benchmark
+      split: test
+      args:
+        num_few_shot: 15
+    metrics:
+    - type: pearson
+      value: 59.01
+      name: pearson
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: FaQuAD NLI
+      type: ruanchaves/faquad-nli
+      split: test
+      args:
+        num_few_shot: 15
+    metrics:
+    - type: f1_macro
+      value: 43.97
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: HateBR Binary
+      type: ruanchaves/hatebr
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: f1_macro
+      value: 33.62
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: PT Hate Speech Binary
+      type: hate_speech_portuguese
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: f1_macro
+      value: 41.23
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: tweetSentBR
+      type: eduagarcia/tweetsentbr_fewshot
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: f1_macro
+      value: 52.33
+      name: f1-macro
+    source:
+      url: https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=cnmoro/Qwen2.5-0.5B-Portuguese-v2
+      name: Open Portuguese LLM Leaderboard
+---
+Qwen2.5-0.5B fine-tuned for proficiency in the Portuguese language and increased intelligence.
+```text
+https://ollama.com/cnmoro/Qwen2.5-0.5B-Portuguese-v2
+```
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "cnmoro/Qwen2.5-0.5B-Portuguese-v2"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+prompt = "Escreva uma breve introdução sobre LLMs (Large Language Models) e suas aplicações."
+# A hardcoded system prompt is always injected automatically
+# for ideal performance in the Portuguese language,
+# so there is no need to add one to `messages`.
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=512
+)
+generated_ids = [
+    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+]
+response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+response
+# As Large Language Models (LLMs) são sistemas computacionais projetados para produzir
+# linguagem natural com alta precisão e fluência. Eles usam algoritmos avançados para compreender
+# e gerar texto, permitindo-lhes realizar tarefas como tradução de idiomas, geração de conteúdo
+# e processamento de linguagem natural.
+#
+# Os LLMs têm sido amplamente utilizados na área da inteligência artificial e do aprendizado
+# de máquina há vários anos. Alguns dos principais usos de LLMs incluem:
+#
+# 1. Tradução automática: Os LLMs podem traduzir textos entre diferentes idiomas, tornando-os
+# úteis em setores onde a comunicação internacional é crítica, como negócios internacionais,
+# diplomacia ou relações públicas.
+#
+# 2. Geração de conteúdo: os LLMs podem criar conteúdo altamente personalizado e adaptado às
+# necessidades específicas de seus usuários, tornando-os ideais para criação de sites, aplicativos
+# móveis ou plataformas de mídia social.
+#
+# 3. Processamento de Linguagem Natural: Os LLMs podem ser treinados para reconhecer e compreender
+# padrões de linguagem, permitindo-lhes compreender melhor as intenções humanas e responder adequadamente.
+#
+# 4. Análise de sentimento: Os LLMs podem analisar dados de texto e identificar sentimentos, ajudando
+# a entender como as pessoas se sentem em relação a determinadas questões ou questões sociais.
+#
+# No geral, os LLMs estão se tornando cada vez mais importantes à medida que a tecnologia continua a
+# avançar. À medida que continuamos a usar LLMs em nossas vidas diárias, podemos esperar ver ainda
+# mais desenvolvimentos interessantes no futuro.
+```
+## Overall Results
+| Task                      | Metric        | Value   | StdErr  |
+|---------------------------|---------------|---------|---------|
+| ASSIN2 RTE                | F1 Macro      | 0.4486  | 0.0067  |
+| ASSIN2 RTE                | Accuracy      | 0.5560  | 0.0071  |
+| ASSIN2 STS                | Pearson       | 0.4091  | 0.0104  |
+| ASSIN2 STS                | MSE           | 5.6395  | N/A     |
+| BluEX                     | Accuracy      | 0.2503  | 0.0094  |
+| ENEM Challenge            | Accuracy      | 0.3128  | 0.0071  |
+| FAQUAD NLI                | F1 Macro      | 0.4611  | 0.0094  |
+| FAQUAD NLI                | Accuracy      | 0.7877  | 0.0113  |
+| HateBR Offensive (Binary) | F1 Macro      | 0.3439  | 0.0049  |
+| HateBR Offensive (Binary) | Accuracy      | 0.4857  | 0.0095  |
+| OAB Exams                 | Accuracy      | 0.3062  | 0.0057  |
+| Portuguese Hate Speech (Binary) | F1 Macro | 0.4119  | 0.0038  |
+| Portuguese Hate Speech (Binary) | Accuracy | 0.7004  | 0.0111  |
+| TweetSentBR               | F1 Macro      | 0.5055  | 0.0078  |
+| TweetSentBR               | Accuracy      | 0.5697  | 0.0078  |
+## Detailed Results by Task
+### ASSIN2 RTE
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.4486  | 0.0067  |
+| Accuracy    | 0.5560  | 0.0071  |
+### ASSIN2 STS
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| Pearson     | 0.4091  | 0.0104  |
+| MSE         | 5.6395  | N/A     |
+### BluEX
+| Exam ID           | Metric   | Value   | StdErr  |
+|-------------------|----------|---------|---------|
+| All               | Accuracy | 0.2503  | 0.0094  |
+| USP_2018          | Accuracy | 0.2037  | 0.0315  |
+| UNICAMP_2018      | Accuracy | 0.1852  | 0.0306  |
+| UNICAMP_2021_1    | Accuracy | 0.0870  | 0.0240  |
+| USP_2020          | Accuracy | 0.2143  | 0.0317  |
+| USP_2023          | Accuracy | 0.2045  | 0.0350  |
+| UNICAMP_2019      | Accuracy | 0.2600  | 0.0358  |
+| USP_2019          | Accuracy | 0.1500  | 0.0326  |
+| UNICAMP_2020      | Accuracy | 0.2182  | 0.0321  |
+| UNICAMP_2021_2    | Accuracy | 0.2941  | 0.0367  |
+| UNICAMP_2023      | Accuracy | 0.4186  | 0.0433  |
+| UNICAMP_2024      | Accuracy | 0.3111  | 0.0398  |
+| USP_2024          | Accuracy | 0.2683  | 0.0398  |
+| USP_2021          | Accuracy | 0.3269  | 0.0375  |
+| UNICAMP_2022      | Accuracy | 0.3590  | 0.0444  |
+| USP_2022          | Accuracy | 0.2857  | 0.0370  |
+### ENEM Challenge
+| Exam ID   | Metric   | Value   | StdErr  |
+|-----------|----------|---------|---------|
+| All       | Accuracy | 0.3128  | 0.0071  |
+| 2017      | Accuracy | 0.2845  | 0.0241  |
+| 2016      | Accuracy | 0.2479  | 0.0226  |
+| 2016_2    | Accuracy | 0.2846  | 0.0235  |
+| 2022      | Accuracy | 0.3534  | 0.0240  |
+| 2012      | Accuracy | 0.3362  | 0.0253  |
+| 2011      | Accuracy | 0.3333  | 0.0251  |
+| 2010      | Accuracy | 0.3846  | 0.0260  |
+| 2014      | Accuracy | 0.3211  | 0.0259  |
+| 2009      | Accuracy | 0.2696  | 0.0239  |
+| 2015      | Accuracy | 0.2521  | 0.0229  |
+| 2023      | Accuracy | 0.3481  | 0.0236  |
+| 2013      | Accuracy | 0.3333  | 0.0261  |
+### FAQUAD NLI
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.4611  | 0.0094  |
+| Accuracy    | 0.7877  | 0.0113  |
+### HateBR Offensive (Binary)
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.3439  | 0.0049  |
+| Accuracy    | 0.4857  | 0.0095  |
+### OAB Exams
+| Exam ID     | Metric   | Value   | StdErr  |
+|-------------|----------|---------|---------|
+| All         | Accuracy | 0.3062  | 0.0057  |
+| 2011-05     | Accuracy | 0.3375  | 0.0304  |
+| 2012-06a    | Accuracy | 0.2625  | 0.0285  |
+| 2010-02     | Accuracy | 0.3700  | 0.0279  |
+| 2017-22     | Accuracy | 0.3500  | 0.0309  |
+| 2016-20     | Accuracy | 0.3125  | 0.0300  |
+| 2011-03     | Accuracy | 0.2626  | 0.0255  |
+| 2015-17     | Accuracy | 0.3205  | 0.0304  |
+| 2017-23     | Accuracy | 0.2875  | 0.0292  |
+| 2018-25     | Accuracy | 0.3625  | 0.0311  |
+| 2016-19     | Accuracy | 0.2436  | 0.0281  |
+| 2017-24     | Accuracy | 0.1625  | 0.0238  |
+| 2015-16     | Accuracy | 0.3125  | 0.0300  |
+| 2011-04     | Accuracy | 0.3250  | 0.0301  |
+| 2012-07     | Accuracy | 0.3500  | 0.0307  |
+| 2012-06     | Accuracy | 0.1875  | 0.0253  |
+| 2012-09     | Accuracy | 0.2468  | 0.0284  |
+| 2013-12     | Accuracy | 0.3625  | 0.0311  |
+| 2013-11     | Accuracy | 0.3000  | 0.0295  |
+| 2010-01     | Accuracy | 0.3412  | 0.0296  |
+| 2015-18     | Accuracy | 0.2875  | 0.0292  |
+| 2014-13     | Accuracy | 0.3500  | 0.0308  |
+| 2013-10     | Accuracy | 0.3125  | 0.0300  |
+| 2016-20a    | Accuracy | 0.2500  | 0.0279  |
+| 2014-14     | Accuracy | 0.3125  | 0.0301  |
+| 2012-08     | Accuracy | 0.3000  | 0.0296  |
+| 2016-21     | Accuracy | 0.3375  | 0.0304  |
+| 2014-15     | Accuracy | 0.4103  | 0.0321  |
+### Portuguese Hate Speech (Binary)
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.4119  | 0.0038  |
+| Accuracy    | 0.7004  | 0.0111  |
+### TweetSentBR
+| Metric      | Value   | StdErr  |
+|-------------|---------|---------|
+| F1 Macro    | 0.5055  | 0.0078  |
+| Accuracy    | 0.5697  | 0.0078  |
+# Open Portuguese LLM Leaderboard Evaluation Results
+Detailed results can be found [here](https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/cnmoro/Qwen2.5-0.5B-Portuguese-v2) and on the [🚀 Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
+|          Metric          |  Value  |
+|--------------------------|---------|
+|Average                   |**45.81**|
+|ENEM Challenge (No Images)|    36.81|
+|BLUEX (No Images)         |    26.84|
+|OAB Exams                 |    30.62|
+|Assin2 RTE                |    87.91|
+|Assin2 STS                |    59.01|
+|FaQuAD NLI                |    43.97|
+|HateBR Binary             |    33.62|
+|PT Hate Speech Binary     |    41.23|
+|tweetSentBR               |    52.33|