mirageco committed
Commit 15fd462 · 1 Parent(s): fe6d8a4

Remove old leaderboard files

.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,13 +1,45 @@
- auto_evals/
- venv/
- __pycache__/
  .env
- .ipynb_checkpoints
- *ipynb
- .vscode/
-
- eval-queue/
- eval-results/
- eval-queue-bk/
- eval-results-bk/
- logs/
+ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+ __pycache__
+ .cache/
+
+ # dependencies
+
+ frontend/node_modules
+ /.pnp
+ .pnp.js
+
+ # testing
+
+ /coverage
+
+ # production
+
+ /build
+
+ # misc
+
+ .DS_Store
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+
+ npm-debug.log*
+ yarn-debug.log*
+ yarn-error.log*
+
+ src/dataframe.json
+
+ yarn.lock
+ package-lock.json
+
+ /public
+
+ .claudesync/
+
+ # Environment variables
  .env
+ .env.*
+ !.env.example
+
.pre-commit-config.yaml DELETED
@@ -1,53 +0,0 @@
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- default_language_version:
- python: python3
-
- ci:
- autofix_prs: true
- autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
- autoupdate_schedule: quarterly
-
- repos:
- - repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.3.0
- hooks:
- - id: check-yaml
- - id: check-case-conflict
- - id: detect-private-key
- - id: check-added-large-files
- args: ['--maxkb=1000']
- - id: requirements-txt-fixer
- - id: end-of-file-fixer
- - id: trailing-whitespace
-
- - repo: https://github.com/PyCQA/isort
- rev: 5.12.0
- hooks:
- - id: isort
- name: Format imports
-
- - repo: https://github.com/psf/black
- rev: 22.12.0
- hooks:
- - id: black
- name: Format code
- additional_dependencies: ['click==8.0.2']
-
- - repo: https://github.com/charliermarsh/ruff-pre-commit
- # Ruff version.
- rev: 'v0.0.267'
- hooks:
- - id: ruff
LICENSE DELETED
@@ -1,201 +0,0 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "{}"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright 2024 The Fintech Open Source Foundation
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
LICENSE.spdx DELETED
@@ -1,7 +0,0 @@
- SPDXVersion: SPDX-2.0
- DataLicense: CC0-1.0
- Creator: The Fintech Open Source Foundation
- PackageName: Open Financial LLMs Leaderboard
- PackageOriginator: The Fintech Open Source Foundation
- PackageHomePage: https://github.com/finos/open-financial-llms-leaderboard
- PackageLicenseDeclared: Apache-2.0
Makefile DELETED
@@ -1,13 +0,0 @@
- .PHONY: style format
-
-
- style:
- python -m black --line-length 119 .
- python -m isort .
- ruff check --fix .
-
-
- quality:
- python -m black --check --line-length 119 .
- python -m isort --check-only .
- ruff check .
NOTICE DELETED
@@ -1,4 +0,0 @@
- Open Financial LLMs Leaderboard - FINOS
- Copyright 2024 The Fintech Open Source Foundation [email protected]
-
- This product includes software developed at the Fintech Open Source Foundation (https://www.finos.org/).
README.md CHANGED
@@ -1,111 +1,91 @@
  ---
- title: Open FinLLM Leaderboard
- emoji: 🥇
- colorFrom: green
- colorTo: indigo
- sdk: gradio
- sdk_version: 4.42.0
- app_file: app.py
  pinned: true
  license: apache-2.0
  ---

- ![badge-labs](https://user-images.githubusercontent.com/327285/230928932-7c75f8ed-e57b-41db-9fb7-a292a13a1e58.svg)
-
- # Open Financial LLM Leaderboard (OFLL)
-
- The growing complexity of financial large language models (LLMs) demands evaluations that go beyond general NLP benchmarks. Traditional leaderboards often focus on broader tasks like translation or summarization, but they fall short of addressing the specific needs of the finance industry. Financial tasks such as predicting stock movements, assessing credit risks, and extracting information from financial reports present unique challenges, requiring models with specialized capabilities. This is why we created the **Open Financial LLM Leaderboard (OFLL)**.
-
- ## Why OFLL?
-
- OFLL provides a specialized evaluation framework tailored specifically to the financial sector. It fills a critical gap by offering a transparent, one-stop solution to assess model readiness for real-world financial applications. The leaderboard focuses on tasks that matter most to finance professionals—information extraction from financial documents, market sentiment analysis, and financial trend forecasting.
-
- ## Key Differentiators
-
- **Comprehensive Financial Task Coverage**: Unlike general LLM leaderboards that evaluate broad NLP capabilities, OFLL focuses exclusively on tasks directly relevant to finance. These include information extraction, sentiment analysis, credit risk scoring, and stock movement forecasting—tasks crucial for real-world financial decision-making.

- **Real-World Financial Relevance**: OFLL uses datasets that represent real-world challenges in the finance industry. This ensures models are not only tested on general NLP tasks but are also evaluated on their ability to handle complex financial data, making them suitable for industry applications.

- **Focused Zero-Shot Evaluation**: OFLL employs a zero-shot evaluation method, testing models on unseen financial tasks without prior fine-tuning. This highlights a model’s ability to generalize and perform well in financial contexts, such as predicting stock price movements or extracting entities from regulatory filings, without being explicitly trained on these tasks.

- ## Key Features of OFLL

- **Diverse Task Categories**: OFLL covers tasks across seven categories: Information Extraction (IE), Textual Analysis (TA), Question Answering (QA), Text Generation (TG), Risk Management (RM), Forecasting (FO), and Decision-Making (DM).

- **Robust Evaluation Metrics**: Models are assessed using various metrics, including Accuracy, F1 Score, ROUGE Score, and Matthews Correlation Coefficient (MCC). These metrics provide a multidimensional view of model performance, helping users identify the strengths and weaknesses of each model.

- The Open Financial LLM Leaderboard aims to set a new standard in evaluating the capabilities of language models in the financial domain, offering a specialized, real-world-focused benchmarking solution.


- # Contribute to OFLL

- To make the leaderboard more accessible for external contributors, we offer clear guidelines for adding tasks, updating result files, and other maintenance activities.

- 1. **Primary Files**:
- - `src/env.py`: Modify variables like repository paths for customization.
- - `src/about.py`: Update task configurations here to add new datasets.

- 2. **Adding New Tasks**:
- - Navigate to `src/about.py` and specify new tasks in the `Tasks` enum section.
- - Each task requires details such as `benchmark`, `metric`, `col_name`, and `category`. For example:
- ```python
- taskX = Task("DatasetName", "MetricType", "ColumnName", category="Category")
- ```

- 3. **Updating Results Files**:
- - Results files should be in JSON format and structured as follows:
- ```json
- {
- "config": {
- "model_dtype": "torch.float16",
- "model_name": "path of the model on the hub: org/model",
- "model_sha": "revision on the hub"
- },
- "results": {
- "task_name": {
- "metric_name": score
- },
- "task_name2": {
- "metric_name": score
- }
- }
- }
- ```
-
- 4. **Updating Leaderboard Data**:
- - When a new task is added, ensure that the results JSON files reflect this update. This process will be automated in future releases.
- - Access the current results at [Hugging Face Datasets](https://huggingface.co/datasets/TheFinAI/results/tree/main/demo-leaderboard).

- 5. **Useful Links**:
- - [Hugging Face Leaderboard Documentation](https://huggingface.co/docs/leaderboards/en/leaderboards/building_page)
- - [OFLL Demo on Hugging Face](https://huggingface.co/spaces/finosfoundation/Open-Financial-LLM-Leaderboard)

-
- If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.

- # Code logic for more complex edits

- You'll find
- - the main table's column names and properties in `src/display/utils.py`
- - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
-
- ## License
-
- Copyright 2024 Fintech Open Source Foundation
-
- Distributed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
-
- SPDX-License-Identifier: [Apache-2.0](https://spdx.org/licenses/Apache-2.0)
-
-
- ### Current submissions are manually evaluated. We will open an automatic evaluation pipeline in a future update.
- tags:
- - leaderboard
- - modality:text
- - submission:manual
- - test:public
- - judge:humans
- - eval:generation
- - language:English
  ---
+ title: Open Financial LLM Leaderboard
+ emoji: 🏆
+ colorFrom: blue
+ colorTo: red
+ sdk: docker
+ hf_oauth: true
  pinned: true
  license: apache-2.0
+ duplicated_from: open-llm-leaderboard/open_llm_leaderboard
+ short_description: Evaluating LLMs on Multilingual Multimodal Financial Tasks
+ tags:
+ - leaderboard
+ - modality:text
+ - submission:manual
+ - test:public
+ - judge:function
+ - eval:generation
+ - domain:financial
  ---

+ # Open LLM Leaderboard

+ Modern React interface for comparing Large Language Models (LLMs) in an open and reproducible way.

+ ## Features

+ - 📊 Interactive table with advanced sorting and filtering
+ - 🔍 Semantic model search
+ - 📌 Pin models for comparison
+ - 📱 Responsive and modern interface
+ - 🎨 Dark/Light mode
+ - ⚡️ Optimized performance with virtualization

+ ## Architecture

+ The project is split into two main parts:

+ ### Frontend (React)

+ ```
+ frontend/
+ ├── src/
+ │ ├── components/ # Reusable UI components
+ │ ├── pages/ # Application pages
+ │ ├── hooks/ # Custom React hooks
+ │ ├── context/ # React contexts
+ │ └── constants/ # Constants and configurations
+ ├── public/ # Static assets
+ └── server.js # Express server for production
+ ```

+ ### Backend (FastAPI)

+ ```
+ backend/
+ ├── app/
+ │ ├── api/ # API router and endpoints
+ │ │ └── endpoints/ # Specific API endpoints
+ │ ├── core/ # Core functionality
+ │ ├── config/ # Configuration
+ │ └── services/ # Business logic services
+ │ ├── leaderboard.py
+ │ ├── models.py
+ │ ├── votes.py
+ │ └── hf_service.py
+ └── utils/ # Utility functions
+ ```

+ ## Technologies

+ ### Frontend

+ - React
+ - Material-UI
+ - TanStack Table & Virtual
+ - Express.js

+ ### Backend

+ - FastAPI
+ - Hugging Face API
+ - Docker

+ ## Development

+ The application is containerized using Docker and can be run using:

+ ```bash
+ docker-compose up
+ ```
app.py DELETED
@@ -1,508 +0,0 @@
1
- import subprocess
2
- import gradio as gr
3
- import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
- import os
7
-
8
- from src.about import (
9
- CITATION_BUTTON_LABEL,
10
- CITATION_BUTTON_TEXT,
11
- EVALUATION_QUEUE_TEXT,
12
- INTRODUCTION_TEXT,
13
- LLM_BENCHMARKS_TEXT,
14
- TITLE,
15
- )
16
- from src.display.css_html_js import custom_css
17
- from src.display.utils import (
18
- BENCHMARK_COLS,
19
- COLS,
20
- EVAL_COLS,
21
- EVAL_TYPES,
22
- NUMERIC_INTERVALS,
23
- TYPES,
24
- AutoEvalColumn,
25
- ModelType,
26
- fields,
27
- WeightType,
28
- Precision
29
- )
30
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
31
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
32
- from src.submission.submit import add_new_eval
33
-
34
-
35
- def restart_space():
36
- API.restart_space(repo_id=REPO_ID)
37
-
38
- try:
39
- print(EVAL_REQUESTS_PATH)
40
- snapshot_download(
41
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
42
- )
43
- except Exception:
44
- restart_space()
45
- try:
46
- print(EVAL_RESULTS_PATH)
47
- snapshot_download(
48
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
49
- )
50
- except Exception:
51
- restart_space()
52
-
53
-
54
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
55
- leaderboard_df = original_df.copy()
56
-
57
- (
58
- finished_eval_queue_df,
59
- running_eval_queue_df,
60
- pending_eval_queue_df,
61
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
62
-
63
-
64
- # Searching and filtering
65
- def update_table(
66
- hidden_df: pd.DataFrame,
67
- columns_info: list,
68
- columns_IE: list,
69
- columns_TA: list,
70
- columns_QA: list,
71
- columns_TG: list,
72
- columns_RM: list,
73
- columns_FO: list,
74
- columns_DM: list,
75
- columns_spanish: list,
76
- columns_other: list,
77
- type_query: list,
78
- precision_query: list,
79
- size_query: list,
80
- show_deleted: bool,
81
- query: str,
82
- ):
83
- # Combine all column selections
84
- selected_columns = (
85
- columns_info + columns_IE + columns_TA + columns_QA + columns_TG +
86
- columns_RM + columns_FO + columns_DM + columns_spanish + columns_other
87
- )
88
- # Filter models based on queries
89
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
90
- filtered_df = filter_queries(query, filtered_df)
91
- df = select_columns(filtered_df, selected_columns)
92
- return df
93
-
94
-
95
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
96
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
97
-
98
-
99
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
100
- always_here_cols = [
101
- AutoEvalColumn.model_type_symbol.name,
102
- AutoEvalColumn.model.name,
103
- ]
104
-
105
- # Ensure no duplicates and add the new average columns
106
- unique_columns = set(always_here_cols + columns)
107
-
108
- # We use COLS to maintain sorting
109
- filtered_df = df[[c for c in COLS if c in df.columns and c in unique_columns]]
110
-
111
- # Debugging print to see if the new columns are included
112
- print(f"Columns included in DataFrame: {filtered_df.columns.tolist()}")
113
-
114
- return filtered_df
115
-
116
-
117
-
118
-
119
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
120
- final_df = []
121
- if query != "":
122
- queries = [q.strip() for q in query.split(";")]
123
- for _q in queries:
124
- _q = _q.strip()
125
- if _q != "":
126
- temp_filtered_df = search_table(filtered_df, _q)
127
- if len(temp_filtered_df) > 0:
128
- final_df.append(temp_filtered_df)
129
- if len(final_df) > 0:
130
- filtered_df = pd.concat(final_df)
131
- filtered_df = filtered_df.drop_duplicates(
132
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
133
- )
134
-
135
- return filtered_df
136
-
137
-
138
- def filter_models(
139
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
140
- ) -> pd.DataFrame:
141
- # Show all models
142
- if show_deleted:
143
- filtered_df = df
144
- else:
145
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
146
-
147
- if "All" not in type_query:
148
- if "?" in type_query:
149
- filtered_df = filtered_df.loc[~df[AutoEvalColumn.model_type_symbol.name].isin([t for t in ModelType if t != "?"])]
150
- else:
151
- type_emoji = [t[0] for t in type_query]
152
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
153
-
154
- if "All" not in precision_query:
155
- if "?" in precision_query:
156
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isna()]
157
- else:
158
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
159
-
160
- if "All" not in size_query:
161
- if "?" in size_query:
162
- filtered_df = filtered_df.loc[df[AutoEvalColumn.params.name].isna()]
163
- else:
164
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
165
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
166
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
167
- filtered_df = filtered_df.loc[mask]
168
-
169
- return filtered_df
170
-
171
-
172
-
173
- def uncheck_all():
174
- return [], [], [], [], [], [], [], [], [], []
175
-
176
- # Get a list of all logo files in the directory
177
- logos_dir = "logos"
178
- logo_files = sorted([f for f in os.listdir(logos_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])
179
-
180
- demo = gr.Blocks(css=custom_css)
181
- with demo:
182
- gr.HTML(TITLE)
183
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
184
-
185
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
186
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
187
- with gr.Row():
188
- with gr.Column():
189
- with gr.Row():
190
- search_bar = gr.Textbox(
191
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
192
- show_label=False,
193
- elem_id="search-bar",
194
- )
195
- with gr.Row():
196
- with gr.Accordion("Select columns to show"):
197
- with gr.Tab("Model Information"):
198
- shown_columns_info = gr.CheckboxGroup(
199
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Model Information"],
200
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Model Information"],
201
- label="Model Information",
202
- interactive=True,
203
- )
204
- with gr.Tab("Information Extraction (IE)"):
205
- shown_columns_IE = gr.CheckboxGroup(
206
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Information Extraction (IE)"],
207
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Information Extraction (IE)"],
208
- label="Information Extraction (IE)",
209
- interactive=True,
210
- )
211
- with gr.Tab("Textual Analysis (TA)"):
212
- shown_columns_TA = gr.CheckboxGroup(
213
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Textual Analysis (TA)"],
214
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Textual Analysis (TA)"],
215
- label="Textual Analysis (TA)",
216
- interactive=True,
217
- )
218
- with gr.Tab("Question Answering (QA)"):
219
- shown_columns_QA = gr.CheckboxGroup(
220
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Question Answering (QA)"],
221
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Question Answering (QA)"],
222
- label="Question Answering (QA)",
223
- interactive=True,
224
- )
225
- with gr.Tab("Text Generation (TG)"):
226
- shown_columns_TG = gr.CheckboxGroup(
227
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Text Generation (TG)"],
228
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Text Generation (TG)"],
229
- label="Text Generation (TG)",
230
- interactive=True,
231
- )
232
- with gr.Tab("Risk Management (RM)"):
233
- shown_columns_RM = gr.CheckboxGroup(
234
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Risk Management (RM)"],
235
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Risk Management (RM)"],
236
- label="Risk Management (RM)",
237
- interactive=True,
238
- )
239
- with gr.Tab("Forecasting (FO)"):
240
- shown_columns_FO = gr.CheckboxGroup(
241
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Forecasting (FO)"],
242
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Forecasting (FO)"],
243
- label="Forecasting (FO)",
244
- interactive=True,
245
- )
246
- with gr.Tab("Decision-Making (DM)"):
247
- shown_columns_DM = gr.CheckboxGroup(
248
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Decision-Making (DM)"],
249
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Decision-Making (DM)"],
250
- label="Decision-Making (DM)",
251
- interactive=True,
252
- )
253
- with gr.Tab("Spanish"):
254
- shown_columns_spanish = gr.CheckboxGroup(
255
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Spanish"],
256
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Spanish"],
257
- label="Spanish",
258
- interactive=True,
259
- )
260
- with gr.Tab("Other"):
261
- shown_columns_other = gr.CheckboxGroup(
262
- choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Other"],
263
- value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Other"],
264
- label="Other",
265
- interactive=True,
266
- )
267
- with gr.Row():
268
- uncheck_all_button = gr.Button("Uncheck All")
269
- uncheck_all_button.click(
270
- uncheck_all,
271
- inputs=[],
272
- outputs=[
273
- shown_columns_info,
274
- shown_columns_IE,
275
- shown_columns_TA,
276
- shown_columns_QA,
277
- shown_columns_TG,
278
- shown_columns_RM,
279
- shown_columns_FO,
280
- shown_columns_DM,
281
- shown_columns_spanish,
282
- shown_columns_other,
283
-
284
- ],
285
- )
286
- with gr.Row():
287
- deleted_models_visibility = gr.Checkbox(
288
- value=True, label="Show gated/private/deleted models", interactive=True
289
- )
290
- with gr.Column(min_width=320):
291
- #with gr.Box(elem_id="box-filter"):
292
- filter_columns_type = gr.CheckboxGroup(
293
- label="Model types",
294
- choices=["All"] + [t.to_str() for t in ModelType],
295
- value=["All"],
296
- interactive=True,
297
- elem_id="filter-columns-type",
298
- )
299
- filter_columns_precision = gr.CheckboxGroup(
300
- label="Precision",
301
- choices=["All"] + [i.value.name for i in Precision],
302
- value=["All"],
303
- interactive=True,
304
- elem_id="filter-columns-precision",
305
- )
306
- filter_columns_size = gr.CheckboxGroup(
307
- label="Model sizes (in billions of parameters)",
308
- choices=["All"] + list(NUMERIC_INTERVALS.keys()) + ["?"],
309
- value=["All"],
310
- interactive=True,
311
- elem_id="filter-columns-size",
312
- )
313
-
314
-
315
- leaderboard_table = gr.Dataframe(
316
- value=leaderboard_df[
317
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
318
- + [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.never_hidden]
319
- ],
320
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
321
- + [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.never_hidden],
322
- datatype=TYPES,
323
- elem_id="leaderboard-table",
324
- interactive=False,
325
- visible=True,
326
- )
327
-
328
-
329
- # Dummy leaderboard for handling the case when the user uses backspace key
330
- hidden_leaderboard_table_for_search = gr.Dataframe(
331
- value=original_df[COLS],
332
- headers=COLS,
333
- datatype=TYPES,
334
- visible=False,
335
- )
336
- search_bar.submit(
337
- update_table,
338
- inputs=[
339
- hidden_leaderboard_table_for_search,
340
- shown_columns_info,
341
- shown_columns_IE,
342
- shown_columns_TA,
343
- shown_columns_QA,
344
- shown_columns_TG,
345
- shown_columns_RM,
346
- shown_columns_FO,
347
- shown_columns_DM,
348
- shown_columns_spanish,
349
- shown_columns_other,
350
- filter_columns_type,
351
- filter_columns_precision,
352
- filter_columns_size,
353
- deleted_models_visibility,
354
- search_bar,
355
- ],
356
- outputs=leaderboard_table,
357
- )
358
- for selector in [
359
- shown_columns_info,
360
- shown_columns_IE,
361
- shown_columns_TA,
362
- shown_columns_QA,
363
- shown_columns_TG,
364
- shown_columns_RM,
365
- shown_columns_FO,
366
- shown_columns_DM,
367
- shown_columns_spanish,
368
- shown_columns_other,
369
- filter_columns_type, filter_columns_precision,
370
- filter_columns_size, deleted_models_visibility
371
- ]:
372
- selector.change(
373
- update_table,
374
- inputs=[
375
- hidden_leaderboard_table_for_search,
376
- shown_columns_info,
377
- shown_columns_IE,
378
- shown_columns_TA,
379
- shown_columns_QA,
380
- shown_columns_TG,
381
- shown_columns_RM,
382
- shown_columns_FO,
383
- shown_columns_DM,
384
- shown_columns_spanish,
385
- shown_columns_other,
386
- filter_columns_type,
387
- filter_columns_precision,
388
- filter_columns_size,
389
- deleted_models_visibility,
390
- search_bar,
391
- ],
392
- outputs=leaderboard_table,
393
- queue=True,
394
- )
395
-
396
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
397
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
398
-
399
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
400
- with gr.Column():
401
- with gr.Row():
402
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
403
-
404
- with gr.Column():
405
- with gr.Accordion(
406
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
407
- open=False,
408
- ):
409
- with gr.Row():
410
- finished_eval_table = gr.Dataframe(
411
- value=finished_eval_queue_df,
412
- headers=EVAL_COLS,
413
- datatype=EVAL_TYPES,
414
- row_count=5,
415
- )
416
- with gr.Accordion(
417
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
418
- open=False,
419
- ):
420
- with gr.Row():
421
- running_eval_table = gr.Dataframe(
422
- value=running_eval_queue_df,
423
- headers=EVAL_COLS,
424
- datatype=EVAL_TYPES,
425
- row_count=5,
426
- )
427
-
428
- with gr.Accordion(
429
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
430
- open=False,
431
- ):
432
- with gr.Row():
433
- pending_eval_table = gr.Dataframe(
434
- value=pending_eval_queue_df,
435
- headers=EVAL_COLS,
436
- datatype=EVAL_TYPES,
437
- row_count=5,
438
- )
439
- with gr.Row():
440
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
441
-
442
- with gr.Row():
443
- with gr.Column():
444
- model_name_textbox = gr.Textbox(label="Model name")
445
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
446
- model_type = gr.Dropdown(
447
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
448
- label="Model type",
449
- multiselect=False,
450
- value=None,
451
- interactive=True,
452
- )
453
-
454
- with gr.Column():
455
- precision = gr.Dropdown(
456
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
457
- label="Precision",
458
- multiselect=False,
459
- value="float16",
460
- interactive=True,
461
- )
462
- weight_type = gr.Dropdown(
463
- choices=[i.value.name for i in WeightType],
464
- label="Weights type",
465
- multiselect=False,
466
- value="Original",
467
- interactive=True,
468
- )
469
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
470
-
471
- submit_button = gr.Button("Submit Eval")
472
- submission_result = gr.Markdown()
473
- submit_button.click(
474
- add_new_eval,
475
- [
476
- model_name_textbox,
477
- base_model_name_textbox,
478
- revision_name_textbox,
479
- precision,
480
- weight_type,
481
- model_type,
482
- ],
483
- submission_result,
484
- )
485
-
486
- # Footer with logos
487
- with gr.Row(elem_id="footer"):
488
- num_columns = min(5, len(logo_files))
489
- for i in range(0, len(logo_files), num_columns):
490
- with gr.Row():
491
- for logo in logo_files[i:i + num_columns]:
492
- logo_path = os.path.join(logos_dir, logo)
493
- gr.Image(logo_path, show_label=False, elem_id="logo-image", width=100, height=100)
494
-
495
- with gr.Row():
496
- with gr.Accordion("📙 Citation", open=False):
497
- citation_button = gr.Textbox(
498
- value=CITATION_BUTTON_TEXT,
499
- label=CITATION_BUTTON_LABEL,
500
- lines=20,
501
- elem_id="citation-button",
502
- show_copy_button=True,
503
- )
504
-
505
- scheduler = BackgroundScheduler()
506
- scheduler.add_job(restart_space, "interval", seconds=1800)
507
- scheduler.start()
508
- demo.queue(default_concurrency_limit=40).launch()
examples/model_evaluation.ipynb DELETED
The diff for this file is too large to render.
 
frontend/public/index.html ADDED
@@ -0,0 +1,96 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <link rel="icon" href="%PUBLIC_URL%/logo32.png" />
6
+ <meta
7
+ name="viewport"
8
+ content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no, viewport-fit=cover"
9
+ />
10
+ <meta
11
+ name="description"
12
+ content="Interactive leaderboard for comparing LLM performance across financial benchmarks."
13
+ />
14
+
15
+ <!-- Open Graph / Facebook -->
16
+ <meta property="og:type" content="website" />
17
+ <meta
18
+ property="og:url"
19
+ content="https://huggingface.co/spaces/TheFinAI/open_finllm_leaderboard"
20
+ />
21
+ <meta
22
+ property="og:title"
23
+ content="Open Financial LLM Leaderboard - Compare Large Language Models in Financial Domain"
24
+ />
25
+ <meta
26
+ property="og:description"
27
+ content="Interactive leaderboard for comparing LLM performance across financial benchmarks."
28
+ />
29
+ <meta property="og:image" content="%PUBLIC_URL%/og-image.png" />
30
+
31
+ <!-- Twitter -->
32
+ <meta property="twitter:card" content="summary_large_image" />
33
+ <meta
34
+ property="twitter:url"
35
+ content="https://huggingface.co/spaces/TheFinAI/open_finllm_leaderboard"
36
+ />
37
+ <meta
38
+ property="twitter:title"
39
+ content="Open Financial LLM Leaderboard - Compare Large Language Models in Financial Domain"
40
+ />
41
+ <meta
42
+ property="twitter:description"
43
+ content="Interactive leaderboard for comparing LLM performance across financial benchmarks."
44
+ />
45
+ <meta property="twitter:image" content="%PUBLIC_URL%/og-image.png" />
46
+ <!--
47
+ Notice the use of %PUBLIC_URL% in the tags above.
48
+ It will be replaced with the URL of the `public` folder during the build.
49
+ Only files inside the `public` folder can be referenced from the HTML.
50
+
51
+ Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
52
+ work correctly both with client-side routing and a non-root public URL.
53
+ Learn how to configure a non-root public URL by running `npm run build`.
54
+ -->
55
+ <title>
56
+ Open Financial LLM Leaderboard - Compare Large Language Models in Financial Domain
57
+ </title>
58
+ <link
59
+ href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600;700&display=swap"
60
+ rel="stylesheet"
61
+ />
62
+ <style>
63
+ html,
64
+ body {
65
+ position: fixed;
66
+ width: 100%;
67
+ height: 100%;
68
+ overflow: hidden;
69
+ -webkit-overflow-scrolling: touch;
70
+ }
71
+ #root {
72
+ position: absolute;
73
+ top: 0;
74
+ left: 0;
75
+ right: 0;
76
+ bottom: 0;
77
+ overflow-y: auto;
78
+ -webkit-overflow-scrolling: touch;
79
+ }
80
+ </style>
81
+ </head>
82
+ <body>
83
+ <noscript>You need to enable JavaScript to run this app.</noscript>
84
+ <div id="root"></div>
85
+ <!--
86
+ This HTML file is a template.
87
+ If you open it directly in the browser, you will see an empty page.
88
+
89
+ You can add webfonts, meta tags, or analytics to this file.
90
+ The build step will place the bundled scripts into the <body> tag.
91
+
92
+ To begin the development, run `npm start` or `yarn start`.
93
+ To create a production bundle, use `npm run build` or `yarn build`.
94
+ -->
95
+ </body>
96
+ </html>
frontend/public/og-image.jpg ADDED
frontend/public/robots.txt ADDED
@@ -0,0 +1,3 @@
+ # https://www.robotstxt.org/robotstxt.html
+ User-agent: *
+ Disallow:
logos/1_columbia.png DELETED
Binary file (121 kB)
 
logos/2_openfinance.jpg DELETED
Binary file (28.6 kB)
 
logos/3_rpi.png DELETED
Binary file (89.8 kB)
 
logos/4_finai-logo.jpg DELETED
Binary file (65.8 kB)
 
logos/5_huggingface.jpeg DELETED
Binary file (7.12 kB)
 
logos/FinOS.png DELETED
Binary file (35.2 kB)
 
logos/archimedes logo GB copy.jpg DELETED
Binary file (38 kB)
 
logos/manc.png DELETED
Binary file (15 kB)
 
logos/nactemlogo.jpg DELETED
Binary file (4.47 kB)
 
logos/uf.png DELETED
Binary file (15 kB)
 
logos/wuhan.png DELETED
Binary file (64.7 kB)
 
pyproject.toml DELETED
@@ -1,13 +0,0 @@
- [tool.ruff]
- # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
- select = ["E", "F"]
- ignore = ["E501"] # line too long (black is taking care of this)
- line-length = 119
- fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
-
- [tool.isort]
- profile = "black"
- line_length = 119
-
- [tool.black]
- line-length = 119
requirements.txt DELETED
@@ -1,20 +0,0 @@
- APScheduler==3.10.1
- black==23.11.0
- click==8.1.3
- datasets==2.14.5
- gradio==4.42.0
- gradio_client==1.3.0
- huggingface-hub>=0.18.0
- matplotlib==3.7.1
- numpy==1.24.2
- pandas==2.0.0
- python-dateutil==2.8.2
- requests==2.32.3
- tqdm==4.65.0
- transformers==4.35.2
- tokenizers>=0.15.0
- git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
- accelerate==0.24.1
- pydantic==2.9.1
- fastapi==0.112.4
- sentencepiece
src/about.py DELETED
@@ -1,210 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
-
5
- @dataclass
6
- class Task:
7
- benchmark: str
8
- metric: str
9
- col_name: str
10
- category: str
11
-
12
-
13
- # Select your tasks here
14
- # ---------------------------------------------------
15
- class Tasks(Enum):
16
- task0 = Task("FPB", "F1", "FPB", category="Textual Analysis (TA)")
17
- task2 = Task("FiQA-SA", "F1", "FiQA-SA", category="Textual Analysis (TA)")
18
- task3 = Task("TSA", "RMSE", "TSA", category="Textual Analysis (TA)")
19
- task4 = Task("Headlines", "AvgF1", "Headlines", category="Textual Analysis (TA)")
20
- task5 = Task("FOMC", "F1", "FOMC", category="Textual Analysis (TA)")
21
- task7 = Task("FinArg-ACC", "MicroF1", "FinArg-ACC", category="Textual Analysis (TA)")
22
- task8 = Task("FinArg-ARC", "MicroF1", "FinArg-ARC", category="Textual Analysis (TA)")
23
- task9 = Task("MultiFin", "MicroF1", "MultiFin", category="Textual Analysis (TA)")
24
- task10 = Task("MA", "MicroF1", "MA", category="Textual Analysis (TA)")
25
- task11 = Task("MLESG", "MicroF1", "MLESG", category="Textual Analysis (TA)")
26
- task12 = Task("NER", "EntityF1", "NER", category="Information Extraction (IE)")
27
- task13 = Task("FINER-ORD", "EntityF1", "FINER-ORD", category="Information Extraction (IE)")
28
- task14 = Task("FinRED", "F1", "FinRED", category="Information Extraction (IE)")
29
- task15 = Task("SC", "F1", "SC", category="Information Extraction (IE)")
30
- task16 = Task("CD", "F1", "CD", category="Information Extraction (IE)")
31
- task17 = Task("FinQA", "EmAcc", "FinQA", category="Question Answering (QA)")
32
- task18 = Task("TATQA", "EmAcc", "TATQA", category="Question Answering (QA)")
33
- task19 = Task("ConvFinQA", "EmAcc", "ConvFinQA", category="Question Answering (QA)")
34
- task20 = Task("FNXL", "EntityF1", "FNXL", category="Information Extraction (IE)")
35
- task21 = Task("FSRL", "EntityF1", "FSRL", category="Information Extraction (IE)")
36
- task22 = Task("EDTSUM", "Rouge-1", "EDTSUM", category="Text Generation (TG)")
37
- task25 = Task("ECTSUM", "Rouge-1", "ECTSUM", category="Text Generation (TG)")
38
- task28 = Task("BigData22", "Acc", "BigData22", category="Forecasting (FO)")
39
- task30 = Task("ACL18", "Acc", "ACL18", category="Forecasting (FO)")
40
- task32 = Task("CIKM18", "Acc", "CIKM18", category="Forecasting (FO)")
41
- task34 = Task("German", "MCC", "German", category="Risk Management (RM)")
42
- task36 = Task("Australian", "MCC", "Australian", category="Risk Management (RM)")
43
- task38 = Task("LendingClub", "MCC", "LendingClub", category="Risk Management (RM)")
44
- task40 = Task("ccf", "MCC", "ccf", category="Risk Management (RM)")
45
- task42 = Task("ccfraud", "MCC", "ccfraud", category="Risk Management (RM)")
46
- task44 = Task("polish", "MCC", "polish", category="Risk Management (RM)")
47
- task46 = Task("taiwan", "MCC", "taiwan", category="Risk Management (RM)")
48
- task48 = Task("portoseguro", "MCC", "portoseguro", category="Risk Management (RM)")
49
- task50 = Task("travelinsurance", "MCC", "travelinsurance", category="Risk Management (RM)")
50
- task51 = Task("MultiFin-ES", "F1", "MultiFin-ES", category="Spanish")
51
- task52 = Task("EFP", "F1", "EFP", category="Spanish")
52
- task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
53
- task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
54
- task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
55
- task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")
56
-
57
- NUM_FEWSHOT = 0 # Change with your few shot
58
- # ---------------------------------------------------
59
-
60
-
61
- # Your leaderboard name
62
- TITLE = """<h1 align="center" id="space-title">🐲 Open Financial LLM Leaderboard</h1>"""
63
-
64
- # What does your leaderboard evaluate?
65
- INTRODUCTION_TEXT = """
66
- 🌟 The Open Financial LLM Leaderboard: Evaluate and compare the performance of financial Large Language Models (LLMs).
67
-
68
- When you submit a model on the "Submit here!" page, it is automatically evaluated on a set of financial benchmarks.
69
-
70
- The GPU used for evaluation is operated with the support of __[Wuhan University](http://en.whu.edu.cn/)__ and __[University of Florida](https://www.ufl.edu/)__.
71
-
72
- The datasets used for evaluation consist of diverse financial datasets from `FinBen` benchmark to assess tasks such as sentiment analysis, named entity recognition, question answering, and more.
73
-
74
- More details about the benchmarks and the evaluation process are provided on the “About” page.
75
- """
76
-
77
- # Which evaluations are you running? how can people reproduce what you have?
78
- LLM_BENCHMARKS_TEXT = """
79
- ## Introduction
80
-
81
- The **Open Financial LLMs Leaderboard (OFLL)** is meticulously designed to rigorously track, rank, and evaluate state-of-the-art models in financial Natural Language Understanding and Prediction. Our leaderboard not only covers standard NLP tasks but also incorporates financial prediction tasks such as stock movement and credit scoring, offering a comprehensive evaluation for real-world financial applications.
82
-
83
- ## Icons & Model Types
84
-
85
- - 🟢 : pretrained or continuously pretrained
86
- - 🔶 : fine-tuned on domain-specific datasets
87
- - 💬 : chat models (RLHF, DPO, ORPO, ...)
88
- - 🤝 : base merges and moerges
89
-
90
- If the icon is "?", it indicates that there is insufficient information about the model. Please provide information about the model through an issue! 🤩
91
-
92
- **Note 1**: We reserve the right to correct any incorrect tags/icons after manual verification to ensure the accuracy and reliability of the leaderboard.
93
-
94
- **Note 2** ⚠️: Some models might be widely discussed as subjects of caution by the community, implying that users should exercise restraint when using them. Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, may be selected as subjects of caution and might result in their deletion from the leaderboard.
95
-
96
- ## How It Works
97
-
98
- 📈 We evaluate models using Pixiu, a powerful and straightforward framework to test and assess language models on a large number of different evaluation tasks from FinBen, using datasets validated by financial experts.
99
-
100
- ### Evaluation Metrics
101
-
102
- Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUGE score, BERTScore, and Matthews correlation coefficient (MCC), providing a multidimensional assessment of model performance. Metrics for specific tasks are as follows:
103
-
104
- - **FPB**: F1, Accuracy. Financial PhraseBank classification task. This dataset is from the Financial PhraseBank, containing annotated phrases used in financial contexts. The classification task involves determining sentiment (positive, negative, neutral) for each phrase, essential for understanding financial news and reports.
105
- - **FiQA-SA**: F1. Sentiment analysis on FiQA financial domain. Derived from the FiQA dataset, this task focuses on sentiment analysis in the financial domain, particularly within news and social media. The dataset is crucial for gauging market sentiment based on financial communications.
106
- - **TSA**: F1, RMSE. Sentiment analysis on social media. The TSA dataset is utilized to analyze sentiment from tweets related to financial markets. The dataset is essential for real-time sentiment analysis, providing insights into market trends influenced by public opinion.
107
- - **Headlines**: AvgF1. News headline classification. This dataset consists of financial news headlines, with each headline categorized into various financial events or sentiment classes. The task challenges models to understand and classify brief, context-rich text segments that drive market movements.
108
- - **FOMC**: F1, Accuracy. Hawkish-dovish classification. Derived from transcripts of the Federal Open Market Committee (FOMC) meetings, this dataset involves classifying statements as hawkish or dovish, which indicates the stance of monetary policy. Accurate classification helps predict market reactions to central bank communications.
109
- - **FinArg-ACC**: F1, Accuracy. Financial argument unit classification. This dataset involves the classification of argument units in financial documents, such as identifying the main claim, supporting evidence, or counterarguments. The task is crucial for automated financial document analysis, enabling the extraction of structured information from unstructured text.
110
- - **FinArg-ARC**: F1, Accuracy. Financial argument relation classification. This task focuses on classifying relationships between different argument units within financial texts, such as support, opposition, or neutrality. Understanding these relations is critical for constructing coherent financial narratives from fragmented data.
111
- - **MultiFin**: F1, Accuracy. Multi-class financial sentiment analysis. The MultiFin dataset includes diverse financial texts requiring sentiment classification across multiple categories, such as bullish, bearish, or neutral. The task is pivotal for analyzing sentiment in financial markets from varied sources like reports, news articles, and social media.
112
- - **MA**: F1, Accuracy. Deal completeness classification. The dataset revolves around classifying mergers and acquisitions (M&A) reports to determine whether a deal has been completed. The task helps in tracking and analyzing the outcomes of corporate transactions, which is key for investment decisions.
113
- - **MLESG**: F1, Accuracy. ESG issue identification. This dataset focuses on identifying Environmental, Social, and Governance (ESG) issues within financial texts. Models are evaluated on their ability to correctly classify and categorize ESG-related content, which is increasingly important for responsible investing.
114
- - **NER**: EntityF1. Named entity recognition in financial texts. This task involves identifying and classifying named entities (e.g., companies, financial instruments, persons) within financial documents. Accurate NER is crucial for information extraction and financial analysis automation.
115
- - **FINER-ORD**: EntityF1. Ordinal classification in financial NER. This dataset extends standard NER by requiring models to classify entities not just by type but also by their ordinal relevance (e.g., primary, secondary importance) within the text. This is useful for prioritizing information in financial summaries.
116
- - **FinRED**: F1, EntityF1. Financial relation extraction from text. The task involves extracting relationships between financial entities, such as ownership, acquisition, or partnership relations. This is important for building knowledge graphs and conducting in-depth financial analysis.
117
- - **SC**: F1, EntityF1. Causal classification task in the financial domain. The dataset requires models to classify causal relationships in financial texts, such as determining whether one event causes another. Understanding causality is critical for risk assessment and decision-making in finance.
118
- - **CD**: F1, EntityF1. Causal detection. Similar to SC, but focused on detecting causality in a broader range of financial texts, including reports, news, and social media. The task evaluates the model's ability to identify causal links, which are key drivers in financial analysis.
119
- - **FinQA**: EmAcc. Numerical question answering in finance. FinQA involves answering numerical questions based on financial documents, such as balance sheets or income statements. The task tests a model's ability to perform calculations or identify numerical data in a text.
120
- - **TATQA**: F1, EmAcc. Table-based question answering in financial documents. This task is centered around answering questions that require interpreting and extracting information from tables in financial documents. It's crucial for automating the analysis of structured financial data.
121
- - **ConvFinQA**: EmAcc. Multi-turn question answering in finance. ConvFinQA extends standard QA tasks by requiring models to handle multi-turn dialogues, where each question builds on the previous one. This simulates real-world scenarios where financial analysts ask a series of related questions.
122
- - **FNXL**: F1, EmAcc. Numeric labeling in financial texts. This dataset requires models to label numeric values within financial documents, categorizing them by type (e.g., revenue, profit) and relevance. It tests the model's ability to understand the role of numbers in financial contexts.
123
- - **FSRL**: F1, EmAcc. Financial statement relation linking. The task involves linking related information across different financial statements, such as matching revenue figures from income statements with corresponding cash flow data. This is key for comprehensive financial analysis.
124
- - **EDTSUM**: ROUGE, BERTScore, BARTScore. Extractive document summarization in finance. The dataset involves summarizing lengthy financial documents by extracting the most relevant sentences. This task evaluates a model's ability to generate concise summaries that retain critical information.
125
- - **ECTSUM**: ROUGE, BERTScore, BARTScore. Extractive content summarization. Similar to EDTSUM, but with a broader focus on summarizing content from various financial document types, including reports, articles, and regulatory filings.
126
- - **BigData22**: Accuracy, MCC. Stock movement prediction. This dataset is used for predicting stock price movements based on financial news and reports. The task evaluates a model's ability to forecast market trends, which is essential for investment strategies.
127
- - **ACL18**: Accuracy, MCC. Financial news-based stock prediction. The ACL18 dataset focuses on predicting stock movements specifically using news headlines and articles. It's a benchmark for evaluating the impact of news on stock prices.
128
- - **CIKM18**: Accuracy, MCC. Financial market prediction using news. This task involves predicting broader market movements, such as indices, based on financial news. It tests the model's ability to aggregate and interpret multiple sources of financial information.
129
- - **German**: F1, MCC. Credit scoring in the German market. The dataset includes data on loan applicants in Germany, with the task being to predict creditworthiness. This is important for financial institutions in assessing loan risks.
130
- - **Australian**: F1, MCC. Credit scoring in the Australian market. Similar to the German dataset, but tailored for the Australian financial context, this task evaluates the model's ability to predict credit risk in this specific market.
131
- - **LendingClub**: F1, MCC. Peer-to-peer lending risk prediction. This dataset involves predicting the risk of default for loans issued through the LendingClub platform, which is a major peer-to-peer lending service. The task is crucial for risk management in alternative finance.
132
- - **ccf**: F1, MCC. Credit card fraud detection. The dataset is used to identify fraudulent transactions within a large dataset of credit card operations. Accurate detection is critical for financial security and fraud prevention.
133
- - **ccfraud**: F1, MCC. Credit card transaction fraud detection. Similar to the ccf dataset but focusing on transaction-level analysis, this task evaluates the model's ability to detect anomalies that indicate fraud.
134
- - **polish**: F1, MCC. Credit risk prediction in the Polish market. This task involves predicting the likelihood of default for loan applicants in Poland, with the dataset tailored to local economic and financial conditions.
135
- - **taiwan**: F1, MCC. Credit risk prediction in the Taiwanese market. Similar to the Polish dataset but focused on Taiwan, this task evaluates the model's ability to assess credit risk in this market.
136
- - **portoseguro**: F1, MCC. Claim analysis in the Brazilian market. The dataset involves predicting insurance claim risks in Brazil, specifically for auto insurance. The task tests the model's ability to assess and manage insurance risks.
137
- - **travelinsurance**: F1, MCC. Travel insurance claim prediction. This dataset is used for predicting the likelihood of a travel insurance claim being made, which is important for risk pricing and policy management in the travel insurance industry.
138
- - **MultiFin-ES**: F1. Multi-class financial sentiment analysis in Spanish. This dataset is used to analyze sentiment in Spanish-language financial texts. It evaluates the model's ability to handle sentiment classification across multiple categories in a non-English context.
139
- - **EFP**: F1. Financial phrase classification in Spanish. Similar to the FPB dataset but in Spanish, this task involves classifying financial phrases according to sentiment or intent, specifically for Spanish-language content.
140
- - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
141
- - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
142
- - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
143
- - **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
144
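As a rough illustration of two of the metrics listed above (an editor's sketch, not the PIXIU evaluation code; the labels are invented for the example), F1 and MCC can be computed with scikit-learn:

```python
# Illustrative only: toy labels, not FinBen data.
from sklearn.metrics import f1_score, matthews_corrcoef

y_true = ["positive", "negative", "neutral", "positive"]
y_pred = ["positive", "negative", "positive", "positive"]

print("Weighted F1:", f1_score(y_true, y_pred, average="weighted"))
print("MCC:", matthews_corrcoef(y_true, y_pred))
```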
-
145
-
146
-
147
- To ensure a fair and unbiased assessment of the models' true capabilities, all evaluations are conducted in zero-shot settings (0-shot). This approach eliminates any potential advantage from task-specific fine-tuning, providing a clear indication of how well the models can generalize to new tasks.
148
-
149
- Given the nature of the tasks, which include multiple-choice and yes/no questions, we extract options from the generated text to evaluate performance.
150
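As a loose sketch of what this option extraction can look like (an illustration with a hypothetical helper, not the exact extraction logic used by the leaderboard):

```python
import re

def extract_choice(generated_text: str, options: list[str], default: str = "missing") -> str:
    """Return the first answer option mentioned in the model output (illustrative only)."""
    text = generated_text.lower()
    for option in options:
        if re.search(rf"\b{re.escape(option.lower())}\b", text):
            return option
    return default

# e.g. a sentiment-style task with three candidate options
print(extract_choice("The sentiment of this headline is clearly positive.",
                     ["positive", "negative", "neutral"]))  # -> "positive"
```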
-
151
- Please consider reaching out to us through the discussions tab if you are working on benchmarks for financial LLMs and would like to see them on this leaderboard as well. Your benchmark might change the whole game for financial models!
152
-
153
- GPUs are provided by Wuhan University and the University of Florida for the evaluations.
154
-
155
- ## Details and Logs
156
-
157
- - Detailed numerical results in the [results FinBen dataset](https://huggingface.co/datasets/FinBen/results)
158
- - Community queries and running status in the [requests FinBen dataset](https://huggingface.co/datasets/FinBen/requests)
159
-
160
- ## More Resources
161
-
162
- If you still have questions, you can check our GitHub repository [here](https://github.com/The-FinAI/PIXIU).
163
- """
164
-
165
- EVALUATION_QUEUE_TEXT = """
166
- ## Some good practices before submitting a model
167
-
168
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
169
- ```python
170
- from transformers import AutoConfig, AutoModel, AutoTokenizer
171
- config = AutoConfig.from_pretrained("your model name", revision=revision)
172
- model = AutoModel.from_pretrained("your model name", revision=revision)
173
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
174
- ```
175
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
176
-
177
- Note: make sure your model is public!
178
- Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it, so stay posted!
179
-
180
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
181
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
182
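One common way to do the conversion (a minimal sketch, assuming a standard `transformers` checkpoint you can load locally; the model name is a placeholder):

```python
from transformers import AutoModel

# Load the existing checkpoint and re-save it with safetensors serialization.
model = AutoModel.from_pretrained("your-org/your-model")
model.save_pretrained("your-model-safetensors", safe_serialization=True)
```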
-
183
- ### 3) Make sure your model has an open license!
184
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
185
-
186
- ### 4) Fill out your model card
187
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
188
-
189
- ## In case of model failure
190
- If your model is displayed in the `FAILED` category, its execution stopped.
191
- Make sure you have followed the above steps first.
192
- If everything is done, check that you can launch the EleutherAI LM Evaluation Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
193
- """
194
-
195
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
196
- CITATION_BUTTON_TEXT = r"""
197
- @article{Xie2024FinBen,
198
- title={FinBen: A Holistic Financial Benchmark for Large Language Models},
199
- author={Qianqian Xie and Weiguang Han and Zhengyu Chen and Ruoyu Xiang and Xiao Zhang and Yueru He and Mengxi Xiao and Dong Li and Yongfu Dai and Duanyu Feng and Yijing Xu and Haoqiang Kang and Ziyan Kuang and Chenhan Yuan and Kailai Yang and Zheheng Luo and Tianlin Zhang and Zhiwei Liu and Guojun Xiong and Zhiyang Deng and Yuechen Jiang and Zhiyuan Yao and Haohang Li and Yangyang Yu and Gang Hu and Jiajia Huang and Xiao-Yang Liu and Alejandro Lopez-Lira and Benyou Wang and Yanzhao Lai and Hao Wang and Min Peng and Sophia Ananiadou and Jimin Huang},
200
- journal={NeurIPS, Special Track on Datasets and Benchmarks},
201
- year={2024}
202
- }
203
-
204
- @article{Xie2023PIXIU,
205
- title={PIXIU: A comprehensive benchmark, instruction dataset and large language model for finance},
206
- author={Qianqian Xie and Weiguang Han and Xiao Zhang and Yanzhao Lai and Min Peng and Alejandro Lopez-Lira and Jimin Huang},
207
- journal={NeurIPS, Special Track on Datasets and Benchmarks},
208
- year={2023}
209
- }
210
- """
 
src/display/css_html_js.py DELETED
@@ -1,105 +0,0 @@
1
- custom_css = """
2
-
3
- .markdown-text {
4
- font-size: 16px !important;
5
- }
6
-
7
- #models-to-add-text {
8
- font-size: 18px !important;
9
- }
10
-
11
- #citation-button span {
12
- font-size: 16px !important;
13
- }
14
-
15
- #citation-button textarea {
16
- font-size: 16px !important;
17
- }
18
-
19
- #citation-button > label > button {
20
- margin: 6px;
21
- transform: scale(1.3);
22
- }
23
-
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
- #search-bar-table-box > div:first-child {
33
- background: none;
34
- border: none;
35
- }
36
-
37
- #search-bar {
38
- padding: 0px;
39
- }
40
-
41
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
- table td:first-child,
43
- table th:first-child {
44
- max-width: 400px;
45
- overflow: auto;
46
- white-space: nowrap;
47
- }
48
-
49
- .tab-buttons button {
50
- font-size: 20px;
51
- }
52
-
53
- #scale-logo {
54
- border-style: none !important;
55
- box-shadow: none;
56
- display: block;
57
- margin-left: auto;
58
- margin-right: auto;
59
- max-width: 600px;
60
- }
61
-
62
- #scale-logo .download {
63
- display: none;
64
- }
65
- #filter_type{
66
- border: 0;
67
- padding-left: 0;
68
- padding-top: 0;
69
- }
70
- #filter_type label {
71
- display: flex;
72
- }
73
- #filter_type label > span{
74
- margin-top: var(--spacing-lg);
75
- margin-right: 0.5em;
76
- }
77
- #filter_type label > .wrap{
78
- width: 103px;
79
- }
80
- #filter_type label > .wrap .wrap-inner{
81
- padding: 2px;
82
- }
83
- #filter_type label > .wrap .wrap-inner input{
84
- width: 1px
85
- }
86
- #filter-columns-type{
87
- border:0;
88
- padding:0.5;
89
- }
90
- #filter-columns-size{
91
- border:0;
92
- padding:0.5;
93
- }
94
- #box-filter > .form{
95
- border: 0
96
- }
97
- """
98
-
99
- get_window_url_params = """
100
- function(url_params) {
101
- const params = new URLSearchParams(window.location.search);
102
- url_params = Object.fromEntries(params);
103
- return url_params;
104
- }
105
- """
 
src/display/formatting.py DELETED
@@ -1,27 +0,0 @@
1
- def model_hyperlink(link, model_name):
2
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
-
4
-
5
- def make_clickable_model(model_name):
6
- link = f"https://huggingface.co/{model_name}"
7
- return model_hyperlink(link, model_name)
8
-
9
-
10
- def styled_error(error):
11
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
-
13
-
14
- def styled_warning(warn):
15
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
-
17
-
18
- def styled_message(message):
19
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
-
21
-
22
- def has_no_nan_values(df, columns):
23
- return df[columns].notna().all(axis=1)
24
-
25
-
26
- def has_nan_values(df, columns):
27
- return df[columns].isna().any(axis=1)
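For context, a brief usage sketch of the helpers removed above (the import refers to the deleted module; the model name and DataFrame are made up):

```python
import pandas as pd
from src.display.formatting import make_clickable_model, has_no_nan_values

# Renders a Hub link used in the leaderboard's "Model" column.
print(make_clickable_model("your-org/your-model"))

# Keeps only rows that have a score for every listed benchmark column.
df = pd.DataFrame({"Model": ["a", "b"], "FPB": [81.2, None]})
print(df[has_no_nan_values(df, ["FPB"])])
```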
 
src/display/utils.py DELETED
@@ -1,146 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
-
4
- import pandas as pd
5
-
6
- from src.about import Tasks
7
-
8
- def fields(raw_class):
9
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
-
11
-
12
- # These classes are for user facing column names,
13
- # to avoid having to change them all around the code
14
- # when a modif is needed
15
- @dataclass
16
- class ColumnContent:
17
- name: str
18
- type: str
19
- displayed_by_default: bool
20
- category: str = "" # New attribute to hold the category
21
- hidden: bool = False
22
- never_hidden: bool = False
23
-
24
- ## Leaderboard columns
25
- auto_eval_column_dict = []
26
-
27
- # Model Information
28
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, category="Model Information", never_hidden=True)])
29
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, category="Model Information", never_hidden=True)])
30
-
31
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, category="Model Information")])
32
- auto_eval_column_dict.append(["average_IE", ColumnContent, ColumnContent("Average IE ⬆️", "number", True, category="Information Extraction (IE)")])
33
- auto_eval_column_dict.append(["average_TA", ColumnContent, ColumnContent("Average TA ⬆️", "number", True, category="Textual Analysis (TA)")])
34
- auto_eval_column_dict.append(["average_QA", ColumnContent, ColumnContent("Average QA ⬆️", "number", True, category="Question Answering (QA)")])
35
- auto_eval_column_dict.append(["average_TG", ColumnContent, ColumnContent("Average TG ⬆️", "number", True, category="Text Generation (TG)")])
36
- auto_eval_column_dict.append(["average_RM", ColumnContent, ColumnContent("Average RM ⬆️", "number", True, category="Risk Management (RM)")])
37
- auto_eval_column_dict.append(["average_FO", ColumnContent, ColumnContent("Average FO ⬆️", "number", True, category="Forecasting (FO)")])
38
- auto_eval_column_dict.append(["average_DM", ColumnContent, ColumnContent("Average DM ⬆️", "number", True, category="Decision-Making (DM)")])
39
- auto_eval_column_dict.append(["average_Spanish", ColumnContent, ColumnContent("Average Spanish ⬆️", "number", True, category="Spanish")])
40
-
41
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, category="Model Information")])
42
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False, category="Model Information")])
43
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, category="Model Information", hidden=True)])
44
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False, category="Model Information")])
45
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False, category="Model Information")])
46
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False, category="Model Information")])
47
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, category="Model Information")])
48
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, category="Model Information")])
49
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, category="Model Information", hidden=False)])
50
-
51
- for task in Tasks:
52
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False, category=task.value.category)])
53
-
54
- # We use make_dataclass to dynamically fill the scores from Tasks
55
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
56
-
57
- ## For the queue columns in the submission tab
58
- @dataclass(frozen=True)
59
- class EvalQueueColumn: # Queue column
60
- model = ColumnContent("model", "markdown", True)
61
- revision = ColumnContent("revision", "str", True)
62
- private = ColumnContent("private", "bool", True)
63
- precision = ColumnContent("precision", "str", True)
64
- weight_type = ColumnContent("weight_type", "str", "Original")
65
- status = ColumnContent("status", "str", True)
66
-
67
- ## All the model information that we might need
68
- @dataclass
69
- class ModelDetails:
70
- name: str
71
- display_name: str = ""
72
- symbol: str = "" # emoji
73
-
74
-
75
- class ModelType(Enum):
76
- PT = ModelDetails(name="pretrained", symbol="🟢")
77
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
78
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
79
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
80
- Unknown = ModelDetails(name="", symbol="?")
81
-
82
- def to_str(self, separator=" "):
83
- return f"{self.value.symbol}{separator}{self.value.name}"
84
-
85
- @staticmethod
86
- def from_str(type):
87
- if "fine-tuned" in type or "🔶" in type:
88
- return ModelType.FT
89
- if "pretrained" in type or "🟢" in type:
90
- return ModelType.PT
91
- if "RL-tuned" in type or "🟦" in type:
92
- return ModelType.RL
93
- if "instruction-tuned" in type or "⭕" in type:
94
- return ModelType.IFT
95
- return ModelType.Unknown
96
-
97
- class WeightType(Enum):
98
- Adapter = ModelDetails("Adapter")
99
- Original = ModelDetails("Original")
100
- Delta = ModelDetails("Delta")
101
-
102
- class Precision(Enum):
103
- float16 = ModelDetails("float16")
104
- bfloat16 = ModelDetails("bfloat16")
105
- float32 = ModelDetails("float32")
106
- #qt_8bit = ModelDetails("8bit")
107
- #qt_4bit = ModelDetails("4bit")
108
- #qt_GPTQ = ModelDetails("GPTQ")
109
- Unknown = ModelDetails("?")
110
-
111
- def from_str(precision):
112
- if precision in ["torch.float16", "float16"]:
113
- return Precision.float16
114
- if precision in ["torch.bfloat16", "bfloat16"]:
115
- return Precision.bfloat16
116
- if precision in ["float32"]:
117
- return Precision.float32
118
- #if precision in ["8bit"]:
119
- # return Precision.qt_8bit
120
- #if precision in ["4bit"]:
121
- # return Precision.qt_4bit
122
- #if precision in ["GPTQ", "None"]:
123
- # return Precision.qt_GPTQ
124
- return Precision.Unknown
125
-
126
- # Column selection
127
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
128
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
129
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
130
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
131
-
132
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
133
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
134
-
135
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
136
-
137
- NUMERIC_INTERVALS = {
138
- "?": pd.Interval(-1, 0, closed="right"),
139
- "~1.5": pd.Interval(0, 2, closed="right"),
140
- "~3": pd.Interval(2, 4, closed="right"),
141
- "~7": pd.Interval(4, 9, closed="right"),
142
- "~13": pd.Interval(9, 20, closed="right"),
143
- "~35": pd.Interval(20, 45, closed="right"),
144
- "~60": pd.Interval(45, 70, closed="right"),
145
- "70+": pd.Interval(70, 10000, closed="right"),
146
- }
 
src/envs.py DELETED
@@ -1,25 +0,0 @@
1
- import os
2
-
3
- from huggingface_hub import HfApi
4
-
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
-
9
- OWNER = "TheFinAI" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/FinBen-Leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
- API = HfApi(token=TOKEN)
 
src/leaderboard/read_evals.py DELETED
@@ -1,273 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
-
14
- task_benchmarks = {task.value.benchmark for task in Tasks}
15
-
16
- @dataclass
17
- class EvalResult:
18
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
19
- """
20
- eval_name: str # org_model_precision (uid)
21
- full_model: str # org/model (path on hub)
22
- org: str
23
- model: str
24
- revision: str # commit hash, "" if main
25
- results: dict
26
- precision: Precision = Precision.Unknown
27
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
- weight_type: WeightType = WeightType.Original # Original or Adapter
29
- architecture: str = "Unknown"
30
- license: str = "?"
31
- likes: int = 0
32
- num_params: int = 0
33
- date: str = "" # submission date of request file
34
- still_on_hub: bool = False
35
-
36
- @classmethod
37
- def init_from_json_file(self, json_filepath):
38
- """Inits the result from the specific model result file"""
39
- with open(json_filepath) as fp:
40
- print(json_filepath)
41
- data = json.load(fp)
42
-
43
- config = data.get("config")
44
- # Precision
45
- precision = Precision.from_str(config.get("model_dtype"))
46
-
47
- # ModelType
48
- model_type = ModelType.from_str(config.get("model_type"))
49
-
50
- # Get model and org
51
- org_and_model = config.get("model_name", config.get("model_args", None))
52
- org_and_model = org_and_model.split("/", 1)
53
-
54
- if len(org_and_model) == 1:
55
- org = None
56
- model = org_and_model[0]
57
- result_key = f"{model}_{precision.value.name}"
58
- else:
59
- org = org_and_model[0]
60
- model = org_and_model[1]
61
- result_key = f"{org}_{model}_{precision.value.name}"
62
- full_model = "/".join(org_and_model)
63
-
64
- still_on_hub, _, model_config = is_model_on_hub(
65
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
66
- )
67
- architecture = "?"
68
- if model_config is not None:
69
- architectures = getattr(model_config, "architectures", None)
70
- if architectures:
71
- architecture = ";".join(architectures)
72
-
73
- # Extract results available in this file (some results are split in several files)
74
- results = {}
75
- for task in Tasks:
76
- task = task.value
77
-
78
- # We average all scores of a given metric (not all metrics are present in all files)
79
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
80
- if accs.size == 0 or any([acc is None for acc in accs]):
81
- continue
82
-
83
- mean_acc = np.mean(accs) * 100.0
84
- results[task.benchmark] = mean_acc
85
-
86
- # Print missing benchmarks if any
87
- missing_benchmarks = task_benchmarks - results.keys()
88
- if missing_benchmarks:
89
- print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
90
- for benchmark in missing_benchmarks:
91
- results[benchmark] = "missing"
92
-
93
-
94
-
95
- return self(
96
- eval_name=result_key,
97
- full_model=full_model,
98
- org=org,
99
- model=model,
100
- results=results,
101
- precision=precision,
102
- revision= config.get("model_sha", ""),
103
- still_on_hub=still_on_hub,
104
- architecture=architecture,
105
- model_type=model_type
106
- )
107
-
108
-
109
- def update_with_request_file(self, requests_path):
110
- """Finds the relevant request file for the current model and updates info with it"""
111
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
112
- try:
113
- with open(request_file, "r") as f:
114
- request = json.load(f)
115
- self.model_type = ModelType.from_str(request.get("model_type", ""))
116
- self.weight_type = WeightType[request.get("weight_type", "Original")]
117
- self.license = request.get("license", "?")
118
- self.likes = request.get("likes", 0)
119
- self.num_params = request.get("params", 0)
120
- self.date = request.get("submitted_time", "")
121
- except Exception:
122
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
123
-
124
- def to_dict(self):
125
- """Converts the Eval Result to a dict compatible with our dataframe display"""
126
-
127
- # Initialize category averages
128
- category_averages = {
129
- "average_IE": [],
130
- "average_TA": [],
131
- "average_QA": [],
132
- "average_TG": [],
133
- "average_RM": [],
134
- "average_FO": [],
135
- "average_DM": [],
136
- "average_Spanish": []
137
- }
138
-
139
- # Calculate averages for each task
140
- for task in Tasks:
141
- score = self.results.get(task.value.benchmark)
142
- if score is not None:
143
- # Append score to the appropriate category
144
- if task.value.category == "Information Extraction (IE)":
145
- category_averages["average_IE"].append(score)
146
- elif task.value.category == "Textual Analysis (TA)":
147
- category_averages["average_TA"].append(score)
148
- elif task.value.category == "Question Answering (QA)":
149
- category_averages["average_QA"].append(score)
150
- elif task.value.category == "Text Generation (TG)":
151
- category_averages["average_TG"].append(score)
152
- elif task.value.category == "Risk Management (RM)":
153
- if score == "missing":
154
- category_averages["average_RM"].append(score)
155
- else:
156
- category_averages["average_RM"].append((score + 100) / 2)
157
- elif task.value.category == "Forecasting (FO)":
158
- category_averages["average_FO"].append(score)
159
- elif task.value.category == "Decision-Making (DM)":
160
- if task.value.benchmark == "FinTrade" and score != "missing":
161
- category_averages["average_DM"].append((score + 300)/6)
162
- else:
163
- category_averages["average_DM"].append(score)
164
- elif task.value.category == "Spanish":
165
- category_averages["average_Spanish"].append(score)
166
-
167
- # Calculate the mean for each category and add to data_dict
168
- data_dict = {}
169
- for category, scores in category_averages.items():
170
- # Calculate the average if there are valid scores, otherwise set to 0
171
- valid_scores = [score for score in scores if score != "missing"]
172
- if valid_scores:
173
- average = sum(valid_scores) / len(valid_scores)
174
- else:
175
- average = 0
176
- data_dict[category] = average
177
-
178
- # Overall average
179
- total_scores = [v for v in self.results.values() if v != "missing"]
180
- overall_average = sum(total_scores) / len(total_scores) if total_scores else 0
181
-
182
- # Add other columns
183
- data_dict.update({
184
- "eval_name": self.eval_name, # not a column, just a save name,
185
- AutoEvalColumn.precision.name: self.precision.value.name,
186
- AutoEvalColumn.model_type.name: self.model_type.value.name,
187
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
188
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
189
- AutoEvalColumn.architecture.name: self.architecture,
190
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
191
- AutoEvalColumn.revision.name: self.revision,
192
- AutoEvalColumn.average.name: overall_average,
193
- AutoEvalColumn.license.name: self.license,
194
- AutoEvalColumn.likes.name: self.likes,
195
- AutoEvalColumn.params.name: self.num_params,
196
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
197
- })
198
-
199
- # Add task results to the data dictionary
200
- for task in Tasks:
201
- data_dict[task.value.col_name] = self.results.get(task.value.benchmark)
202
-
203
- return data_dict
204
-
205
-
206
-
207
-
208
-
209
- def get_request_file_for_model(requests_path, model_name, precision):
210
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
211
- request_files = os.path.join(
212
- requests_path,
213
- f"{model_name}_eval_request_*.json",
214
- )
215
- request_files = glob.glob(request_files)
216
-
217
- # Select correct request file (precision)
218
- request_file = ""
219
- request_files = sorted(request_files, reverse=True)
220
- for tmp_request_file in request_files:
221
- with open(tmp_request_file, "r") as f:
222
- req_content = json.load(f)
223
- if (
224
- req_content["status"] in ["FINISHED"]
225
- and req_content["precision"] == precision.split(".")[-1]
226
- ):
227
- request_file = tmp_request_file
228
- return request_file
229
-
230
-
231
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
232
- """From the path of the results folder root, extract all needed info for results"""
233
- model_result_filepaths = []
234
-
235
- for root, _, files in os.walk(results_path):
236
- # We should only have json files in model results
237
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
238
- continue
239
-
240
- # Sort the files by date
241
- try:
242
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
243
- except dateutil.parser._parser.ParserError:
244
- files = [files[-1]]
245
-
246
- for file in files:
247
- model_result_filepaths.append(os.path.join(root, file))
248
-
249
- print(f"Found {len(model_result_filepaths)} JSON files to process.")
250
-
251
- eval_results = {}
252
- for model_result_filepath in model_result_filepaths:
253
- # Creation of result
254
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
255
- eval_result.update_with_request_file(requests_path)
256
-
257
- # Store results of same eval together
258
- eval_name = eval_result.eval_name
259
- if eval_name in eval_results.keys():
260
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
261
- else:
262
- eval_results[eval_name] = eval_result
263
-
264
- results = []
265
- for v in eval_results.values():
266
- try:
267
- v.to_dict() # we test if the dict version is complete
268
- results.append(v)
269
- except KeyError: # not all eval values present
270
- continue
271
-
272
- print(f"Successfully loaded {len(results)} models.")
273
- return results
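To make the rescaling inside `to_dict` above concrete, a tiny worked example (the scores are invented; results in this file were already multiplied by 100, so MCC-style scores can be negative):

```python
# Risk Management (RM) scores: (score + 100) / 2 maps [-100, 100] onto [0, 100].
rm_score = -20.0
print((rm_score + 100) / 2)        # -> 40.0

# FinTrade (Decision-Making): (score + 300) / 6 maps [-300, 300] onto [0, 100].
fintrade_score = 60.0
print((fintrade_score + 300) / 6)  # -> 60.0
```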
 
src/populate.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import pandas as pd
4
- import numpy as np
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list, pd.DataFrame]:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
-
18
- # Add category average columns with default values
19
- category_avg_columns = {
20
- "Average IE ⬆️": "average_IE",
21
- "Average TA ⬆️": "average_TA",
22
- "Average QA ⬆️": "average_QA",
23
- "Average TG ⬆️": "average_TG",
24
- "Average RM ⬆️": "average_RM",
25
- "Average FO ⬆️": "average_FO",
26
- "Average DM ⬆️": "average_DM",
27
- "Average Spanish ⬆️": "average_Spanish"
28
- }
29
-
30
- for display_name, internal_name in category_avg_columns.items():
31
- df[display_name] = df[internal_name]
32
-
33
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
34
-
35
- # Apply the transformation for MCC values
36
- mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
37
- for task in mcc_tasks:
38
- if task in df.columns:
39
- df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
40
-
41
- for index, row in df.iterrows():
42
- if "FinTrade" in row and row["FinTrade"] != "missing":
43
- df.loc[index, "FinTrade"] = (row["FinTrade"] + 300) / 6
44
-
45
- # Now, select the columns that were passed to the function
46
- df = df[cols]
47
-
48
- # Function to round numeric values, including those in string format
49
- def round_numeric(x):
50
- try:
51
- return round(float(x), 1)
52
- except ValueError:
53
- return x
54
-
55
- # Apply rounding to all columns except 'T' and 'Model'
56
- for col in df.columns:
57
- if col not in ['T', 'Model']:
58
- df[col] = df[col].apply(round_numeric)
59
-
60
- # Filter out if any of the benchmarks have not been produced
61
- df = df[has_no_nan_values(df, benchmark_cols)]
62
-
63
- return raw_data, df
64
-
65
-
66
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
67
- """Creates the different dataframes for the evaluation queues requests"""
68
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
69
- all_evals = []
70
-
71
- for entry in entries:
72
- if ".json" in entry:
73
- file_path = os.path.join(save_path, entry)
74
- with open(file_path) as fp:
75
- data = json.load(fp)
76
-
77
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
78
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
79
-
80
- all_evals.append(data)
81
- elif ".md" not in entry:
82
- # this is a folder
83
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
84
- for sub_entry in sub_entries:
85
- file_path = os.path.join(save_path, entry, sub_entry)
86
- with open(file_path) as fp:
87
- data = json.load(fp)
88
-
89
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
90
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
91
- all_evals.append(data)
92
-
93
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
94
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
95
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
96
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
97
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
98
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
99
- return df_finished[cols], df_running[cols], df_pending[cols]
 
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, object]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
 
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
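For reference, the eval request file that `add_new_eval` writes and uploads has roughly this shape (all values below are placeholders, matching the keys assembled in `eval_entry` above):

```python
# Illustrative contents of <model>_eval_request_False_<precision>_<weight_type>.json
eval_entry = {
    "model": "your-org/your-model",
    "base_model": "",
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "status": "PENDING",
    "submitted_time": "2024-01-01T00:00:00Z",
    "model_type": "🔶 : fine-tuned",
    "likes": 0,
    "params": 7.24,
    "license": "apache-2.0",
    "private": False,
}
```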