Remove old leaderboard files
- .gitattributes +0 -35
- .gitignore +44 -12
- .pre-commit-config.yaml +0 -53
- LICENSE +0 -201
- LICENSE.spdx +0 -7
- Makefile +0 -13
- NOTICE +0 -4
- README.md +69 -89
- app.py +0 -508
- examples/model_evaluation.ipynb +0 -0
- frontend/public/index.html +96 -0
- frontend/public/og-image.jpg +0 -0
- frontend/public/robots.txt +3 -0
- logos/1_columbia.png +0 -0
- logos/2_openfinance.jpg +0 -0
- logos/3_rpi.png +0 -0
- logos/4_finai-logo.jpg +0 -0
- logos/5_huggingface.jpeg +0 -0
- logos/FinOS.png +0 -0
- logos/archimedes logo GB copy.jpg +0 -0
- logos/manc.png +0 -0
- logos/nactemlogo.jpg +0 -0
- logos/uf.png +0 -0
- logos/wuhan.png +0 -0
- pyproject.toml +0 -13
- requirements.txt +0 -20
- src/about.py +0 -210
- src/display/css_html_js.py +0 -105
- src/display/formatting.py +0 -27
- src/display/utils.py +0 -146
- src/envs.py +0 -25
- src/leaderboard/read_evals.py +0 -273
- src/populate.py +0 -99
- src/submission/check_validity.py +0 -99
- src/submission/submit.py +0 -119
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1,13 +1,45 @@
-
-
-__pycache__
 .env
-.
-
-
-
-eval-queue/
-eval-results/
-eval-queue-bk/
-eval-results-bk/
-logs/
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+__pycache__
+.cache/
+
+# dependencies
+
+frontend/node_modules
+/.pnp
+.pnp.js
+
+# testing
+
+/coverage
+
+# production
+
+/build
+
+# misc
+
+.DS_Store
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+src/dataframe.json
+
+yarn.lock
+package-lock.json
+
+/public
+
+.claudesync/
+
+# Environment variables
 .env
+.env.*
+!.env.example
+
.pre-commit-config.yaml
DELETED
@@ -1,53 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-default_language_version:
-  python: python3
-
-ci:
-  autofix_prs: true
-  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
-  autoupdate_schedule: quarterly
-
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
-    hooks:
-      - id: check-yaml
-      - id: check-case-conflict
-      - id: detect-private-key
-      - id: check-added-large-files
-        args: ['--maxkb=1000']
-      - id: requirements-txt-fixer
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        name: Format imports
-
-  - repo: https://github.com/psf/black
-    rev: 22.12.0
-    hooks:
-      - id: black
-        name: Format code
-        additional_dependencies: ['click==8.0.2']
-
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    # Ruff version.
-    rev: 'v0.0.267'
-    hooks:
-      - id: ruff
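Note: the file removed above was a standard pre-commit configuration; with it in place, the hooks would normally be run through the generic pre-commit CLI. The commands below are ordinary pre-commit usage, not taken from this commit:

```bash
pip install pre-commit          # install the hook runner
pre-commit install              # register the git hook for this clone
pre-commit run --all-files      # run check-yaml, isort, black, ruff, etc. on the whole repo
```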
LICENSE
DELETED
@@ -1,201 +0,0 @@
|
|
1 |
-
Apache License
|
2 |
-
Version 2.0, January 2004
|
3 |
-
http://www.apache.org/licenses/
|
4 |
-
|
5 |
-
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
-
|
7 |
-
1. Definitions.
|
8 |
-
|
9 |
-
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
-
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
-
|
12 |
-
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
-
the copyright owner that is granting the License.
|
14 |
-
|
15 |
-
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
-
other entities that control, are controlled by, or are under common
|
17 |
-
control with that entity. For the purposes of this definition,
|
18 |
-
"control" means (i) the power, direct or indirect, to cause the
|
19 |
-
direction or management of such entity, whether by contract or
|
20 |
-
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
-
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
-
|
23 |
-
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
-
exercising permissions granted by this License.
|
25 |
-
|
26 |
-
"Source" form shall mean the preferred form for making modifications,
|
27 |
-
including but not limited to software source code, documentation
|
28 |
-
source, and configuration files.
|
29 |
-
|
30 |
-
"Object" form shall mean any form resulting from mechanical
|
31 |
-
transformation or translation of a Source form, including but
|
32 |
-
not limited to compiled object code, generated documentation,
|
33 |
-
and conversions to other media types.
|
34 |
-
|
35 |
-
"Work" shall mean the work of authorship, whether in Source or
|
36 |
-
Object form, made available under the License, as indicated by a
|
37 |
-
copyright notice that is included in or attached to the work
|
38 |
-
(an example is provided in the Appendix below).
|
39 |
-
|
40 |
-
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
-
form, that is based on (or derived from) the Work and for which the
|
42 |
-
editorial revisions, annotations, elaborations, or other modifications
|
43 |
-
represent, as a whole, an original work of authorship. For the purposes
|
44 |
-
of this License, Derivative Works shall not include works that remain
|
45 |
-
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
-
the Work and Derivative Works thereof.
|
47 |
-
|
48 |
-
"Contribution" shall mean any work of authorship, including
|
49 |
-
the original version of the Work and any modifications or additions
|
50 |
-
to that Work or Derivative Works thereof, that is intentionally
|
51 |
-
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
-
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
-
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
-
means any form of electronic, verbal, or written communication sent
|
55 |
-
to the Licensor or its representatives, including but not limited to
|
56 |
-
communication on electronic mailing lists, source code control systems,
|
57 |
-
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
-
Licensor for the purpose of discussing and improving the Work, but
|
59 |
-
excluding communication that is conspicuously marked or otherwise
|
60 |
-
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
-
|
62 |
-
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
-
on behalf of whom a Contribution has been received by Licensor and
|
64 |
-
subsequently incorporated within the Work.
|
65 |
-
|
66 |
-
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
-
this License, each Contributor hereby grants to You a perpetual,
|
68 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
-
copyright license to reproduce, prepare Derivative Works of,
|
70 |
-
publicly display, publicly perform, sublicense, and distribute the
|
71 |
-
Work and such Derivative Works in Source or Object form.
|
72 |
-
|
73 |
-
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
-
this License, each Contributor hereby grants to You a perpetual,
|
75 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
-
(except as stated in this section) patent license to make, have made,
|
77 |
-
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
-
where such license applies only to those patent claims licensable
|
79 |
-
by such Contributor that are necessarily infringed by their
|
80 |
-
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
-
with the Work to which such Contribution(s) was submitted. If You
|
82 |
-
institute patent litigation against any entity (including a
|
83 |
-
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
-
or a Contribution incorporated within the Work constitutes direct
|
85 |
-
or contributory patent infringement, then any patent licenses
|
86 |
-
granted to You under this License for that Work shall terminate
|
87 |
-
as of the date such litigation is filed.
|
88 |
-
|
89 |
-
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
-
Work or Derivative Works thereof in any medium, with or without
|
91 |
-
modifications, and in Source or Object form, provided that You
|
92 |
-
meet the following conditions:
|
93 |
-
|
94 |
-
(a) You must give any other recipients of the Work or
|
95 |
-
Derivative Works a copy of this License; and
|
96 |
-
|
97 |
-
(b) You must cause any modified files to carry prominent notices
|
98 |
-
stating that You changed the files; and
|
99 |
-
|
100 |
-
(c) You must retain, in the Source form of any Derivative Works
|
101 |
-
that You distribute, all copyright, patent, trademark, and
|
102 |
-
attribution notices from the Source form of the Work,
|
103 |
-
excluding those notices that do not pertain to any part of
|
104 |
-
the Derivative Works; and
|
105 |
-
|
106 |
-
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
-
distribution, then any Derivative Works that You distribute must
|
108 |
-
include a readable copy of the attribution notices contained
|
109 |
-
within such NOTICE file, excluding those notices that do not
|
110 |
-
pertain to any part of the Derivative Works, in at least one
|
111 |
-
of the following places: within a NOTICE text file distributed
|
112 |
-
as part of the Derivative Works; within the Source form or
|
113 |
-
documentation, if provided along with the Derivative Works; or,
|
114 |
-
within a display generated by the Derivative Works, if and
|
115 |
-
wherever such third-party notices normally appear. The contents
|
116 |
-
of the NOTICE file are for informational purposes only and
|
117 |
-
do not modify the License. You may add Your own attribution
|
118 |
-
notices within Derivative Works that You distribute, alongside
|
119 |
-
or as an addendum to the NOTICE text from the Work, provided
|
120 |
-
that such additional attribution notices cannot be construed
|
121 |
-
as modifying the License.
|
122 |
-
|
123 |
-
You may add Your own copyright statement to Your modifications and
|
124 |
-
may provide additional or different license terms and conditions
|
125 |
-
for use, reproduction, or distribution of Your modifications, or
|
126 |
-
for any such Derivative Works as a whole, provided Your use,
|
127 |
-
reproduction, and distribution of the Work otherwise complies with
|
128 |
-
the conditions stated in this License.
|
129 |
-
|
130 |
-
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
-
any Contribution intentionally submitted for inclusion in the Work
|
132 |
-
by You to the Licensor shall be under the terms and conditions of
|
133 |
-
this License, without any additional terms or conditions.
|
134 |
-
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
-
the terms of any separate license agreement you may have executed
|
136 |
-
with Licensor regarding such Contributions.
|
137 |
-
|
138 |
-
6. Trademarks. This License does not grant permission to use the trade
|
139 |
-
names, trademarks, service marks, or product names of the Licensor,
|
140 |
-
except as required for reasonable and customary use in describing the
|
141 |
-
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
-
|
143 |
-
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
-
agreed to in writing, Licensor provides the Work (and each
|
145 |
-
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
-
implied, including, without limitation, any warranties or conditions
|
148 |
-
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
-
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
-
appropriateness of using or redistributing the Work and assume any
|
151 |
-
risks associated with Your exercise of permissions under this License.
|
152 |
-
|
153 |
-
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
-
whether in tort (including negligence), contract, or otherwise,
|
155 |
-
unless required by applicable law (such as deliberate and grossly
|
156 |
-
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
-
liable to You for damages, including any direct, indirect, special,
|
158 |
-
incidental, or consequential damages of any character arising as a
|
159 |
-
result of this License or out of the use or inability to use the
|
160 |
-
Work (including but not limited to damages for loss of goodwill,
|
161 |
-
work stoppage, computer failure or malfunction, or any and all
|
162 |
-
other commercial damages or losses), even if such Contributor
|
163 |
-
has been advised of the possibility of such damages.
|
164 |
-
|
165 |
-
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
-
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
-
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
-
or other liability obligations and/or rights consistent with this
|
169 |
-
License. However, in accepting such obligations, You may act only
|
170 |
-
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
-
of any other Contributor, and only if You agree to indemnify,
|
172 |
-
defend, and hold each Contributor harmless for any liability
|
173 |
-
incurred by, or claims asserted against, such Contributor by reason
|
174 |
-
of your accepting any such warranty or additional liability.
|
175 |
-
|
176 |
-
END OF TERMS AND CONDITIONS
|
177 |
-
|
178 |
-
APPENDIX: How to apply the Apache License to your work.
|
179 |
-
|
180 |
-
To apply the Apache License to your work, attach the following
|
181 |
-
boilerplate notice, with the fields enclosed by brackets "{}"
|
182 |
-
replaced with your own identifying information. (Don't include
|
183 |
-
the brackets!) The text should be enclosed in the appropriate
|
184 |
-
comment syntax for the file format. We also recommend that a
|
185 |
-
file or class name and description of purpose be included on the
|
186 |
-
same "printed page" as the copyright notice for easier
|
187 |
-
identification within third-party archives.
|
188 |
-
|
189 |
-
Copyright 2024 The Fintech Open Source Foundation
|
190 |
-
|
191 |
-
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
-
you may not use this file except in compliance with the License.
|
193 |
-
You may obtain a copy of the License at
|
194 |
-
|
195 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
-
|
197 |
-
Unless required by applicable law or agreed to in writing, software
|
198 |
-
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
-
See the License for the specific language governing permissions and
|
201 |
-
limitations under the License.
|
LICENSE.spdx
DELETED
@@ -1,7 +0,0 @@
-SPDXVersion: SPDX-2.0
-DataLicense: CC0-1.0
-Creator: The Fintech Open Source Foundation
-PackageName: Open Financial LLMs Leaderboard
-PackageOriginator: The Fintech Open Source Foundation
-PackageHomePage: https://github.com/finos/open-financial-llms-leaderboard
-PackageLicenseDeclared: Apache-2.0
Makefile
DELETED
@@ -1,13 +0,0 @@
-.PHONY: style format
-
-
-style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
-
-
-quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
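For reference, the two targets in the deleted Makefile map onto the usual format/lint workflow. This is standard make usage, not part of the diff:

```bash
make style     # rewrite files with black and isort, and apply ruff autofixes
make quality   # check-only: black --check, isort --check-only, ruff check
```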
NOTICE
DELETED
@@ -1,4 +0,0 @@
-Open Financial LLMs Leaderboard - FINOS
-Copyright 2024 The Fintech Open Source Foundation [email protected]
-
-This product includes software developed at the Fintech Open Source Foundation (https://www.finos.org/).
README.md
CHANGED
@@ -1,111 +1,91 @@
 ---
-title: Open
-emoji:
-colorFrom:
-colorTo:
-sdk:
-
-app_file: app.py
 pinned: true
 license: apache-2.0
 ---

-
-
-# Open Financial LLM Leaderboard (OFLL)
-
-The growing complexity of financial large language models (LLMs) demands evaluations that go beyond general NLP benchmarks. Traditional leaderboards often focus on broader tasks like translation or summarization, but they fall short of addressing the specific needs of the finance industry. Financial tasks such as predicting stock movements, assessing credit risks, and extracting information from financial reports present unique challenges, requiring models with specialized capabilities. This is why we created the **Open Financial LLM Leaderboard (OFLL)**.
-
-## Why OFLL?
-
-OFLL provides a specialized evaluation framework tailored specifically to the financial sector. It fills a critical gap by offering a transparent, one-stop solution to assess model readiness for real-world financial applications. The leaderboard focuses on tasks that matter most to finance professionals—information extraction from financial documents, market sentiment analysis, and financial trend forecasting.
-
-## Key Differentiators
-
-- **Comprehensive Financial Task Coverage**: Unlike general LLM leaderboards that evaluate broad NLP capabilities, OFLL focuses exclusively on tasks directly relevant to finance. These include information extraction, sentiment analysis, credit risk scoring, and stock movement forecasting—tasks crucial for real-world financial decision-making.

-
-- `src/env.py`: Modify variables like repository paths for customization.
-- `src/about.py`: Update task configurations here to add new datasets.
-
-- Navigate to `src/about.py` and specify new tasks in the `Tasks` enum section.
-- Each task requires details such as `benchmark`, `metric`, `col_name`, and `category`. For example:
-```python
-taskX = Task("DatasetName", "MetricType", "ColumnName", category="Category")
-```
-
-"config": {
-    "model_dtype": "torch.float16",
-    "model_name": "path of the model on the hub: org/model",
-    "model_sha": "revision on the hub"
-},
-"results": {
-    "task_name": {
-        "metric_name": score
-    },
-    "task_name2": {
-        "metric_name": score
-    }
-}
-}
-```
-
-4. **Updating Leaderboard Data**:
-- When a new task is added, ensure that the results JSON files reflect this update. This process will be automated in future releases.
-- Access the current results at [Hugging Face Datasets](https://huggingface.co/datasets/TheFinAI/results/tree/main/demo-leaderboard).
-
-- [Hugging Face Leaderboard Documentation](https://huggingface.co/docs/leaderboards/en/leaderboards/building_page)
-- [OFLL Demo on Hugging Face](https://huggingface.co/spaces/finosfoundation/Open-Financial-LLM-Leaderboard)
-
-
-
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
-
-## License
-
-Copyright 2024 Fintech Open Source Foundation
-
-Distributed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
-
-SPDX-License-Identifier: [Apache-2.0](https://spdx.org/licenses/Apache-2.0)
-
-
-### Current submissions are manully evaluated. Will open a automatic evaluation pipeline in the future update
-tags:
-- leaderboard
-- modality:text
-- submission:manual
-- test:public
-- judge:humans
-- eval:generation
-- language:English

 ---
+title: Open Financial LLM Leaderboard
+emoji: 🏆
+colorFrom: blue
+colorTo: red
+sdk: docker
+hf_oauth: true
 pinned: true
 license: apache-2.0
+duplicated_from: open-llm-leaderboard/open_llm_leaderboard
+short_description: Evaluating LLMs on Multilingual Multimodal Financial Tasks
+tags:
+- leaderboard
+- modality:text
+- submission:manual
+- test:public
+- judge:function
+- eval:generation
+- domain:financial
 ---

+# Open LLM Leaderboard

+Modern React interface for comparing Large Language Models (LLMs) in an open and reproducible way.

+## Features

+- 📊 Interactive table with advanced sorting and filtering
+- 🔍 Semantic model search
+- 📌 Pin models for comparison
+- 📱 Responsive and modern interface
+- 🎨 Dark/Light mode
+- ⚡️ Optimized performance with virtualization

+## Architecture

+The project is split into two main parts:

+### Frontend (React)

+```
+frontend/
+├── src/
+│   ├── components/    # Reusable UI components
+│   ├── pages/         # Application pages
+│   ├── hooks/         # Custom React hooks
+│   ├── context/       # React contexts
+│   └── constants/     # Constants and configurations
+├── public/            # Static assets
+└── server.js          # Express server for production
+```

+### Backend (FastAPI)

+```
+backend/
+├── app/
+│   ├── api/           # API router and endpoints
+│   │   └── endpoints/ # Specific API endpoints
+│   ├── core/          # Core functionality
+│   ├── config/        # Configuration
+│   └── services/      # Business logic services
+│       ├── leaderboard.py
+│       ├── models.py
+│       ├── votes.py
+│       └── hf_service.py
+└── utils/             # Utility functions
+```

+## Technologies

+### Frontend

+- React
+- Material-UI
+- TanStack Table & Virtual
+- Express.js

+### Backend

+- FastAPI
+- Hugging Face API
+- Docker

+## Development

+The application is containerized using Docker and can be run using:

+```bash
+docker-compose up
+```
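The new README replaces the Gradio `app.py` entry point with a Dockerized frontend/backend split and states that the stack is started with Compose. A minimal usage sketch; the extra flags are generic docker-compose options, not taken from this commit:

```bash
docker-compose up --build   # build the frontend and backend images, then start both services
docker-compose up -d        # or start them detached once the images exist
docker-compose down         # stop and remove the containers
```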
app.py
DELETED
@@ -1,508 +0,0 @@
|
|
1 |
-
import subprocess
|
2 |
-
import gradio as gr
|
3 |
-
import pandas as pd
|
4 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
-
from huggingface_hub import snapshot_download
|
6 |
-
import os
|
7 |
-
|
8 |
-
from src.about import (
|
9 |
-
CITATION_BUTTON_LABEL,
|
10 |
-
CITATION_BUTTON_TEXT,
|
11 |
-
EVALUATION_QUEUE_TEXT,
|
12 |
-
INTRODUCTION_TEXT,
|
13 |
-
LLM_BENCHMARKS_TEXT,
|
14 |
-
TITLE,
|
15 |
-
)
|
16 |
-
from src.display.css_html_js import custom_css
|
17 |
-
from src.display.utils import (
|
18 |
-
BENCHMARK_COLS,
|
19 |
-
COLS,
|
20 |
-
EVAL_COLS,
|
21 |
-
EVAL_TYPES,
|
22 |
-
NUMERIC_INTERVALS,
|
23 |
-
TYPES,
|
24 |
-
AutoEvalColumn,
|
25 |
-
ModelType,
|
26 |
-
fields,
|
27 |
-
WeightType,
|
28 |
-
Precision
|
29 |
-
)
|
30 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
31 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
32 |
-
from src.submission.submit import add_new_eval
|
33 |
-
|
34 |
-
|
35 |
-
def restart_space():
|
36 |
-
API.restart_space(repo_id=REPO_ID)
|
37 |
-
|
38 |
-
try:
|
39 |
-
print(EVAL_REQUESTS_PATH)
|
40 |
-
snapshot_download(
|
41 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
42 |
-
)
|
43 |
-
except Exception:
|
44 |
-
restart_space()
|
45 |
-
try:
|
46 |
-
print(EVAL_RESULTS_PATH)
|
47 |
-
snapshot_download(
|
48 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
49 |
-
)
|
50 |
-
except Exception:
|
51 |
-
restart_space()
|
52 |
-
|
53 |
-
|
54 |
-
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
55 |
-
leaderboard_df = original_df.copy()
|
56 |
-
|
57 |
-
(
|
58 |
-
finished_eval_queue_df,
|
59 |
-
running_eval_queue_df,
|
60 |
-
pending_eval_queue_df,
|
61 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
62 |
-
|
63 |
-
|
64 |
-
# Searching and filtering
|
65 |
-
def update_table(
|
66 |
-
hidden_df: pd.DataFrame,
|
67 |
-
columns_info: list,
|
68 |
-
columns_IE: list,
|
69 |
-
columns_TA: list,
|
70 |
-
columns_QA: list,
|
71 |
-
columns_TG: list,
|
72 |
-
columns_RM: list,
|
73 |
-
columns_FO: list,
|
74 |
-
columns_DM: list,
|
75 |
-
columns_spanish: list,
|
76 |
-
columns_other: list,
|
77 |
-
type_query: list,
|
78 |
-
precision_query: list,
|
79 |
-
size_query: list,
|
80 |
-
show_deleted: bool,
|
81 |
-
query: str,
|
82 |
-
):
|
83 |
-
# Combine all column selections
|
84 |
-
selected_columns = (
|
85 |
-
columns_info + columns_IE + columns_TA + columns_QA + columns_TG +
|
86 |
-
columns_RM + columns_FO + columns_DM + columns_spanish + columns_other
|
87 |
-
)
|
88 |
-
# Filter models based on queries
|
89 |
-
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
90 |
-
filtered_df = filter_queries(query, filtered_df)
|
91 |
-
df = select_columns(filtered_df, selected_columns)
|
92 |
-
return df
|
93 |
-
|
94 |
-
|
95 |
-
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
96 |
-
return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
|
97 |
-
|
98 |
-
|
99 |
-
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
100 |
-
always_here_cols = [
|
101 |
-
AutoEvalColumn.model_type_symbol.name,
|
102 |
-
AutoEvalColumn.model.name,
|
103 |
-
]
|
104 |
-
|
105 |
-
# Ensure no duplicates and add the new average columns
|
106 |
-
unique_columns = set(always_here_cols + columns)
|
107 |
-
|
108 |
-
# We use COLS to maintain sorting
|
109 |
-
filtered_df = df[[c for c in COLS if c in df.columns and c in unique_columns]]
|
110 |
-
|
111 |
-
# Debugging print to see if the new columns are included
|
112 |
-
print(f"Columns included in DataFrame: {filtered_df.columns.tolist()}")
|
113 |
-
|
114 |
-
return filtered_df
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
|
120 |
-
final_df = []
|
121 |
-
if query != "":
|
122 |
-
queries = [q.strip() for q in query.split(";")]
|
123 |
-
for _q in queries:
|
124 |
-
_q = _q.strip()
|
125 |
-
if _q != "":
|
126 |
-
temp_filtered_df = search_table(filtered_df, _q)
|
127 |
-
if len(temp_filtered_df) > 0:
|
128 |
-
final_df.append(temp_filtered_df)
|
129 |
-
if len(final_df) > 0:
|
130 |
-
filtered_df = pd.concat(final_df)
|
131 |
-
filtered_df = filtered_df.drop_duplicates(
|
132 |
-
subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
|
133 |
-
)
|
134 |
-
|
135 |
-
return filtered_df
|
136 |
-
|
137 |
-
|
138 |
-
def filter_models(
|
139 |
-
df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
|
140 |
-
) -> pd.DataFrame:
|
141 |
-
# Show all models
|
142 |
-
if show_deleted:
|
143 |
-
filtered_df = df
|
144 |
-
else:
|
145 |
-
filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
|
146 |
-
|
147 |
-
if "All" not in type_query:
|
148 |
-
if "?" in type_query:
|
149 |
-
filtered_df = filtered_df.loc[~df[AutoEvalColumn.model_type_symbol.name].isin([t for t in ModelType if t != "?"])]
|
150 |
-
else:
|
151 |
-
type_emoji = [t[0] for t in type_query]
|
152 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
153 |
-
|
154 |
-
if "All" not in precision_query:
|
155 |
-
if "?" in precision_query:
|
156 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isna()]
|
157 |
-
else:
|
158 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
159 |
-
|
160 |
-
if "All" not in size_query:
|
161 |
-
if "?" in size_query:
|
162 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.params.name].isna()]
|
163 |
-
else:
|
164 |
-
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
165 |
-
params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
166 |
-
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
167 |
-
filtered_df = filtered_df.loc[mask]
|
168 |
-
|
169 |
-
return filtered_df
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
def uncheck_all():
|
174 |
-
return [], [], [], [], [], [], [], [], [], []
|
175 |
-
|
176 |
-
# Get a list of all logo files in the directory
|
177 |
-
logos_dir = "logos"
|
178 |
-
logo_files = sorted([f for f in os.listdir(logos_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])
|
179 |
-
|
180 |
-
demo = gr.Blocks(css=custom_css)
|
181 |
-
with demo:
|
182 |
-
gr.HTML(TITLE)
|
183 |
-
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
184 |
-
|
185 |
-
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
186 |
-
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
187 |
-
with gr.Row():
|
188 |
-
with gr.Column():
|
189 |
-
with gr.Row():
|
190 |
-
search_bar = gr.Textbox(
|
191 |
-
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
192 |
-
show_label=False,
|
193 |
-
elem_id="search-bar",
|
194 |
-
)
|
195 |
-
with gr.Row():
|
196 |
-
with gr.Accordion("Select columns to show"):
|
197 |
-
with gr.Tab("Model Information"):
|
198 |
-
shown_columns_info = gr.CheckboxGroup(
|
199 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Model Information"],
|
200 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Model Information"],
|
201 |
-
label="Model Information",
|
202 |
-
interactive=True,
|
203 |
-
)
|
204 |
-
with gr.Tab("Information Extraction (IE)"):
|
205 |
-
shown_columns_IE = gr.CheckboxGroup(
|
206 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Information Extraction (IE)"],
|
207 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Information Extraction (IE)"],
|
208 |
-
label="Information Extraction (IE)",
|
209 |
-
interactive=True,
|
210 |
-
)
|
211 |
-
with gr.Tab("Textual Analysis (TA)"):
|
212 |
-
shown_columns_TA = gr.CheckboxGroup(
|
213 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Textual Analysis (TA)"],
|
214 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Textual Analysis (TA)"],
|
215 |
-
label="Textual Analysis (TA)",
|
216 |
-
interactive=True,
|
217 |
-
)
|
218 |
-
with gr.Tab("Question Answering (QA)"):
|
219 |
-
shown_columns_QA = gr.CheckboxGroup(
|
220 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Question Answering (QA)"],
|
221 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Question Answering (QA)"],
|
222 |
-
label="Question Answering (QA)",
|
223 |
-
interactive=True,
|
224 |
-
)
|
225 |
-
with gr.Tab("Text Generation (TG)"):
|
226 |
-
shown_columns_TG = gr.CheckboxGroup(
|
227 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Text Generation (TG)"],
|
228 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Text Generation (TG)"],
|
229 |
-
label="Text Generation (TG)",
|
230 |
-
interactive=True,
|
231 |
-
)
|
232 |
-
with gr.Tab("Risk Management (RM)"):
|
233 |
-
shown_columns_RM = gr.CheckboxGroup(
|
234 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Risk Management (RM)"],
|
235 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Risk Management (RM)"],
|
236 |
-
label="Risk Management (RM)",
|
237 |
-
interactive=True,
|
238 |
-
)
|
239 |
-
with gr.Tab("Forecasting (FO)"):
|
240 |
-
shown_columns_FO = gr.CheckboxGroup(
|
241 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Forecasting (FO)"],
|
242 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Forecasting (FO)"],
|
243 |
-
label="Forecasting (FO)",
|
244 |
-
interactive=True,
|
245 |
-
)
|
246 |
-
with gr.Tab("Decision-Making (DM)"):
|
247 |
-
shown_columns_DM = gr.CheckboxGroup(
|
248 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Decision-Making (DM)"],
|
249 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Decision-Making (DM)"],
|
250 |
-
label="Decision-Making (DM)",
|
251 |
-
interactive=True,
|
252 |
-
)
|
253 |
-
with gr.Tab("Spanish"):
|
254 |
-
shown_columns_spanish = gr.CheckboxGroup(
|
255 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Spanish"],
|
256 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Spanish"],
|
257 |
-
label="Spanish",
|
258 |
-
interactive=True,
|
259 |
-
)
|
260 |
-
with gr.Tab("Other"):
|
261 |
-
shown_columns_other = gr.CheckboxGroup(
|
262 |
-
choices=[c.name for c in fields(AutoEvalColumn) if c.category == "Other"],
|
263 |
-
value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.category == "Other"],
|
264 |
-
label="Other",
|
265 |
-
interactive=True,
|
266 |
-
)
|
267 |
-
with gr.Row():
|
268 |
-
uncheck_all_button = gr.Button("Uncheck All")
|
269 |
-
uncheck_all_button.click(
|
270 |
-
uncheck_all,
|
271 |
-
inputs=[],
|
272 |
-
outputs=[
|
273 |
-
shown_columns_info,
|
274 |
-
shown_columns_IE,
|
275 |
-
shown_columns_TA,
|
276 |
-
shown_columns_QA,
|
277 |
-
shown_columns_TG,
|
278 |
-
shown_columns_RM,
|
279 |
-
shown_columns_FO,
|
280 |
-
shown_columns_DM,
|
281 |
-
shown_columns_spanish,
|
282 |
-
shown_columns_other,
|
283 |
-
|
284 |
-
],
|
285 |
-
)
|
286 |
-
with gr.Row():
|
287 |
-
deleted_models_visibility = gr.Checkbox(
|
288 |
-
value=True, label="Show gated/private/deleted models", interactive=True
|
289 |
-
)
|
290 |
-
with gr.Column(min_width=320):
|
291 |
-
#with gr.Box(elem_id="box-filter"):
|
292 |
-
filter_columns_type = gr.CheckboxGroup(
|
293 |
-
label="Model types",
|
294 |
-
choices=["All"] + [t.to_str() for t in ModelType],
|
295 |
-
value=["All"],
|
296 |
-
interactive=True,
|
297 |
-
elem_id="filter-columns-type",
|
298 |
-
)
|
299 |
-
filter_columns_precision = gr.CheckboxGroup(
|
300 |
-
label="Precision",
|
301 |
-
choices=["All"] + [i.value.name for i in Precision],
|
302 |
-
value=["All"],
|
303 |
-
interactive=True,
|
304 |
-
elem_id="filter-columns-precision",
|
305 |
-
)
|
306 |
-
filter_columns_size = gr.CheckboxGroup(
|
307 |
-
label="Model sizes (in billions of parameters)",
|
308 |
-
choices=["All"] + list(NUMERIC_INTERVALS.keys()) + ["?"],
|
309 |
-
value=["All"],
|
310 |
-
interactive=True,
|
311 |
-
elem_id="filter-columns-size",
|
312 |
-
)
|
313 |
-
|
314 |
-
|
315 |
-
leaderboard_table = gr.Dataframe(
|
316 |
-
value=leaderboard_df[
|
317 |
-
[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
318 |
-
+ [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.never_hidden]
|
319 |
-
],
|
320 |
-
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
321 |
-
+ [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.never_hidden],
|
322 |
-
datatype=TYPES,
|
323 |
-
elem_id="leaderboard-table",
|
324 |
-
interactive=False,
|
325 |
-
visible=True,
|
326 |
-
)
|
327 |
-
|
328 |
-
|
329 |
-
# Dummy leaderboard for handling the case when the user uses backspace key
|
330 |
-
hidden_leaderboard_table_for_search = gr.Dataframe(
|
331 |
-
value=original_df[COLS],
|
332 |
-
headers=COLS,
|
333 |
-
datatype=TYPES,
|
334 |
-
visible=False,
|
335 |
-
)
|
336 |
-
search_bar.submit(
|
337 |
-
update_table,
|
338 |
-
inputs=[
|
339 |
-
hidden_leaderboard_table_for_search,
|
340 |
-
shown_columns_info,
|
341 |
-
shown_columns_IE,
|
342 |
-
shown_columns_TA,
|
343 |
-
shown_columns_QA,
|
344 |
-
shown_columns_TG,
|
345 |
-
shown_columns_RM,
|
346 |
-
shown_columns_FO,
|
347 |
-
shown_columns_DM,
|
348 |
-
shown_columns_spanish,
|
349 |
-
shown_columns_other,
|
350 |
-
filter_columns_type,
|
351 |
-
filter_columns_precision,
|
352 |
-
filter_columns_size,
|
353 |
-
deleted_models_visibility,
|
354 |
-
search_bar,
|
355 |
-
],
|
356 |
-
outputs=leaderboard_table,
|
357 |
-
)
|
358 |
-
for selector in [
|
359 |
-
shown_columns_info,
|
360 |
-
shown_columns_IE,
|
361 |
-
shown_columns_TA,
|
362 |
-
shown_columns_QA,
|
363 |
-
shown_columns_TG,
|
364 |
-
shown_columns_RM,
|
365 |
-
shown_columns_FO,
|
366 |
-
shown_columns_DM,
|
367 |
-
shown_columns_spanish,
|
368 |
-
shown_columns_other,
|
369 |
-
filter_columns_type, filter_columns_precision,
|
370 |
-
filter_columns_size, deleted_models_visibility
|
371 |
-
]:
|
372 |
-
selector.change(
|
373 |
-
update_table,
|
374 |
-
inputs=[
|
375 |
-
hidden_leaderboard_table_for_search,
|
376 |
-
shown_columns_info,
|
377 |
-
shown_columns_IE,
|
378 |
-
shown_columns_TA,
|
379 |
-
shown_columns_QA,
|
380 |
-
shown_columns_TG,
|
381 |
-
shown_columns_RM,
|
382 |
-
shown_columns_FO,
|
383 |
-
shown_columns_DM,
|
384 |
-
shown_columns_spanish,
|
385 |
-
shown_columns_other,
|
386 |
-
filter_columns_type,
|
387 |
-
filter_columns_precision,
|
388 |
-
filter_columns_size,
|
389 |
-
deleted_models_visibility,
|
390 |
-
search_bar,
|
391 |
-
],
|
392 |
-
outputs=leaderboard_table,
|
393 |
-
queue=True,
|
394 |
-
)
|
395 |
-
|
396 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
397 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
398 |
-
|
399 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
400 |
-
with gr.Column():
|
401 |
-
with gr.Row():
|
402 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
403 |
-
|
404 |
-
with gr.Column():
|
405 |
-
with gr.Accordion(
|
406 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
407 |
-
open=False,
|
408 |
-
):
|
409 |
-
with gr.Row():
|
410 |
-
finished_eval_table = gr.Dataframe(
|
411 |
-
value=finished_eval_queue_df,
|
412 |
-
headers=EVAL_COLS,
|
413 |
-
datatype=EVAL_TYPES,
|
414 |
-
row_count=5,
|
415 |
-
)
|
416 |
-
with gr.Accordion(
|
417 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
418 |
-
open=False,
|
419 |
-
):
|
420 |
-
with gr.Row():
|
421 |
-
running_eval_table = gr.Dataframe(
|
422 |
-
value=running_eval_queue_df,
|
423 |
-
headers=EVAL_COLS,
|
424 |
-
datatype=EVAL_TYPES,
|
425 |
-
row_count=5,
|
426 |
-
)
|
427 |
-
|
428 |
-
with gr.Accordion(
|
429 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
430 |
-
open=False,
|
431 |
-
):
|
432 |
-
with gr.Row():
|
433 |
-
pending_eval_table = gr.Dataframe(
|
434 |
-
value=pending_eval_queue_df,
|
435 |
-
headers=EVAL_COLS,
|
436 |
-
datatype=EVAL_TYPES,
|
437 |
-
row_count=5,
|
438 |
-
)
|
439 |
-
with gr.Row():
|
440 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
441 |
-
|
442 |
-
with gr.Row():
|
443 |
-
with gr.Column():
|
444 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
445 |
-
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
446 |
-
model_type = gr.Dropdown(
|
447 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
448 |
-
label="Model type",
|
449 |
-
multiselect=False,
|
450 |
-
value=None,
|
451 |
-
interactive=True,
|
452 |
-
)
|
453 |
-
|
454 |
-
with gr.Column():
|
455 |
-
precision = gr.Dropdown(
|
456 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
457 |
-
label="Precision",
|
458 |
-
multiselect=False,
|
459 |
-
value="float16",
|
460 |
-
interactive=True,
|
461 |
-
)
|
462 |
-
weight_type = gr.Dropdown(
|
463 |
-
choices=[i.value.name for i in WeightType],
|
464 |
-
label="Weights type",
|
465 |
-
multiselect=False,
|
466 |
-
value="Original",
|
467 |
-
interactive=True,
|
468 |
-
)
|
469 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
470 |
-
|
471 |
-
submit_button = gr.Button("Submit Eval")
|
472 |
-
submission_result = gr.Markdown()
|
473 |
-
submit_button.click(
|
474 |
-
add_new_eval,
|
475 |
-
[
|
476 |
-
model_name_textbox,
|
477 |
-
base_model_name_textbox,
|
478 |
-
revision_name_textbox,
|
479 |
-
precision,
|
480 |
-
weight_type,
|
481 |
-
model_type,
|
482 |
-
],
|
483 |
-
submission_result,
|
484 |
-
)
|
485 |
-
|
486 |
-
# Footer with logos
|
487 |
-
with gr.Row(elem_id="footer"):
|
488 |
-
num_columns = min(5, len(logo_files))
|
489 |
-
for i in range(0, len(logo_files), num_columns):
|
490 |
-
with gr.Row():
|
491 |
-
for logo in logo_files[i:i + num_columns]:
|
492 |
-
logo_path = os.path.join(logos_dir, logo)
|
493 |
-
gr.Image(logo_path, show_label=False, elem_id="logo-image", width=100, height=100)
|
494 |
-
|
495 |
-
with gr.Row():
|
496 |
-
with gr.Accordion("📙 Citation", open=False):
|
497 |
-
citation_button = gr.Textbox(
|
498 |
-
value=CITATION_BUTTON_TEXT,
|
499 |
-
label=CITATION_BUTTON_LABEL,
|
500 |
-
lines=20,
|
501 |
-
elem_id="citation-button",
|
502 |
-
show_copy_button=True,
|
503 |
-
)
|
504 |
-
|
505 |
-
scheduler = BackgroundScheduler()
|
506 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
507 |
-
scheduler.start()
|
508 |
-
demo.queue(default_concurrency_limit=40).launch()
|
examples/model_evaluation.ipynb
DELETED
The diff for this file is too large to render.
frontend/public/index.html
ADDED
@@ -0,0 +1,96 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <link rel="icon" href="%PUBLIC_URL%/logo32.png" />
+    <meta
+      name="viewport"
+      content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no, viewport-fit=cover"
+    />
+    <meta
+      name="description"
+      content="Interactive leaderboard for comparing LLM performance across financial benchmarks."
+    />
+
+    <!-- Open Graph / Facebook -->
+    <meta property="og:type" content="website" />
+    <meta
+      property="og:url"
+      content="https://huggingface.co/spaces/TheFinAI/open_finllm_leaderboard"
+    />
+    <meta
+      property="og:title"
+      content="Open Financial LLM Leaderboard - Compare Large Language Models in Financial Domain"
+    />
+    <meta
+      property="og:description"
+      content="Interactive leaderboard for comparing LLM performance across financial benchmarks."
+    />
+    <meta property="og:image" content="%PUBLIC_URL%/og-image.png" />
+
+    <!-- Twitter -->
+    <meta property="twitter:card" content="summary_large_image" />
+    <meta
+      property="twitter:url"
+      content="https://huggingface.co/spaces/TheFinAI/open_finllm_leaderboard"
+    />
+    <meta
+      property="twitter:title"
+      content="Open Financial LLM Leaderboard - Compare Large Language Models in Financial Domain"
+    />
+    <meta
+      property="twitter:description"
+      content="Interactive leaderboard for comparing LLM performance across financial benchmarks."
+    />
+    <meta property="twitter:image" content="%PUBLIC_URL%/og-image.png" />
+    <!--
+      Notice the use of %PUBLIC_URL% in the tags above.
+      It will be replaced with the URL of the `public` folder during the build.
+      Only files inside the `public` folder can be referenced from the HTML.
+
+      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
+      work correctly both with client-side routing and a non-root public URL.
+      Learn how to configure a non-root public URL by running `npm run build`.
+    -->
+    <title>
+      Open Financial LLM Leaderboard - Compare Large Language Models in Financial Domain
+    </title>
+    <link
+      href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600;700&display=swap"
+      rel="stylesheet"
+    />
+    <style>
+      html,
+      body {
+        position: fixed;
+        width: 100%;
+        height: 100%;
+        overflow: hidden;
+        -webkit-overflow-scrolling: touch;
+      }
+      #root {
+        position: absolute;
+        top: 0;
+        left: 0;
+        right: 0;
+        bottom: 0;
+        overflow-y: auto;
+        -webkit-overflow-scrolling: touch;
+      }
+    </style>
+  </head>
+  <body>
+    <noscript>You need to enable JavaScript to run this app.</noscript>
+    <div id="root"></div>
+    <!--
+      This HTML file is a template.
+      If you open it directly in the browser, you will see an empty page.
+
+      You can add webfonts, meta tags, or analytics to this file.
+      The build step will place the bundled scripts into the <body> tag.
+
+      To begin the development, run `npm start` or `yarn start`.
+      To create a production bundle, use `npm run build` or `yarn build`.
+    -->
+  </body>
+</html>
frontend/public/og-image.jpg
ADDED
Binary file
frontend/public/robots.txt
ADDED
@@ -0,0 +1,3 @@
+# https://www.robotstxt.org/robotstxt.html
+User-agent: *
+Disallow:
logos/1_columbia.png
DELETED
Binary file (121 kB)

logos/2_openfinance.jpg
DELETED
Binary file (28.6 kB)

logos/3_rpi.png
DELETED
Binary file (89.8 kB)

logos/4_finai-logo.jpg
DELETED
Binary file (65.8 kB)

logos/5_huggingface.jpeg
DELETED
Binary file (7.12 kB)

logos/FinOS.png
DELETED
Binary file (35.2 kB)

logos/archimedes logo GB copy.jpg
DELETED
Binary file (38 kB)

logos/manc.png
DELETED
Binary file (15 kB)

logos/nactemlogo.jpg
DELETED
Binary file (4.47 kB)

logos/uf.png
DELETED
Binary file (15 kB)

logos/wuhan.png
DELETED
Binary file (64.7 kB)
pyproject.toml
DELETED
@@ -1,13 +0,0 @@
-[tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
-
-[tool.isort]
-profile = "black"
-line_length = 119
-
-[tool.black]
-line-length = 119
requirements.txt
DELETED
@@ -1,20 +0,0 @@
-APScheduler==3.10.1
-black==23.11.0
-click==8.1.3
-datasets==2.14.5
-gradio==4.42.0
-gradio_client==1.3.0
-huggingface-hub>=0.18.0
-matplotlib==3.7.1
-numpy==1.24.2
-pandas==2.0.0
-python-dateutil==2.8.2
-requests==2.32.3
-tqdm==4.65.0
-transformers==4.35.2
-tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
-accelerate==0.24.1
-pydantic==2.9.1
-fastapi==0.112.4
-sentencepiece
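Taken together with the deleted `app.py` and the old README front matter (`app_file: app.py`), these pinned requirements describe the previous Gradio-based Space. A sketch of that old local workflow, assuming a checkout of the repository; the commands are illustrative, not part of this commit:

```bash
pip install -r requirements.txt   # Gradio, APScheduler, huggingface-hub, lm-eval, ...
python app.py                     # launch the old Gradio leaderboard app locally
```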
src/about.py
DELETED
@@ -1,210 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-    category: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    task0 = Task("FPB", "F1", "FPB", category="Textual Analysis (TA)")
-    task2 = Task("FiQA-SA", "F1", "FiQA-SA", category="Textual Analysis (TA)")
-    task3 = Task("TSA", "RMSE", "TSA", category="Textual Analysis (TA)")
-    task4 = Task("Headlines", "AvgF1", "Headlines", category="Textual Analysis (TA)")
-    task5 = Task("FOMC", "F1", "FOMC", category="Textual Analysis (TA)")
-    task7 = Task("FinArg-ACC", "MicroF1", "FinArg-ACC", category="Textual Analysis (TA)")
-    task8 = Task("FinArg-ARC", "MicroF1", "FinArg-ARC", category="Textual Analysis (TA)")
-    task9 = Task("MultiFin", "MicroF1", "MultiFin", category="Textual Analysis (TA)")
-    task10 = Task("MA", "MicroF1", "MA", category="Textual Analysis (TA)")
-    task11 = Task("MLESG", "MicroF1", "MLESG", category="Textual Analysis (TA)")
-    task12 = Task("NER", "EntityF1", "NER", category="Information Extraction (IE)")
-    task13 = Task("FINER-ORD", "EntityF1", "FINER-ORD", category="Information Extraction (IE)")
-    task14 = Task("FinRED", "F1", "FinRED", category="Information Extraction (IE)")
-    task15 = Task("SC", "F1", "SC", category="Information Extraction (IE)")
-    task16 = Task("CD", "F1", "CD", category="Information Extraction (IE)")
-    task17 = Task("FinQA", "EmAcc", "FinQA", category="Question Answering (QA)")
-    task18 = Task("TATQA", "EmAcc", "TATQA", category="Question Answering (QA)")
-    task19 = Task("ConvFinQA", "EmAcc", "ConvFinQA", category="Question Answering (QA)")
-    task20 = Task("FNXL", "EntityF1", "FNXL", category="Information Extraction (IE)")
-    task21 = Task("FSRL", "EntityF1", "FSRL", category="Information Extraction (IE)")
-    task22 = Task("EDTSUM", "Rouge-1", "EDTSUM", category="Text Generation (TG)")
-    task25 = Task("ECTSUM", "Rouge-1", "ECTSUM", category="Text Generation (TG)")
-    task28 = Task("BigData22", "Acc", "BigData22", category="Forecasting (FO)")
-    task30 = Task("ACL18", "Acc", "ACL18", category="Forecasting (FO)")
-    task32 = Task("CIKM18", "Acc", "CIKM18", category="Forecasting (FO)")
-    task34 = Task("German", "MCC", "German", category="Risk Management (RM)")
-    task36 = Task("Australian", "MCC", "Australian", category="Risk Management (RM)")
-    task38 = Task("LendingClub", "MCC", "LendingClub", category="Risk Management (RM)")
-    task40 = Task("ccf", "MCC", "ccf", category="Risk Management (RM)")
-    task42 = Task("ccfraud", "MCC", "ccfraud", category="Risk Management (RM)")
-    task44 = Task("polish", "MCC", "polish", category="Risk Management (RM)")
-    task46 = Task("taiwan", "MCC", "taiwan", category="Risk Management (RM)")
-    task48 = Task("portoseguro", "MCC", "portoseguro", category="Risk Management (RM)")
-    task50 = Task("travelinsurance", "MCC", "travelinsurance", category="Risk Management (RM)")
-    task51 = Task("MultiFin-ES", "F1", "MultiFin-ES", category="Spanish")
-    task52 = Task("EFP", "F1", "EFP", category="Spanish")
-    task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
-    task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
-    task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
-    task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")
-
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
-# Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">🐲 Open Financial LLM Leaderboard</h1>"""
-
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-🌟 The Open Financial LLM Leaderboard: Evaluate and compare the performance of financial Large Language Models (LLMs).
-
-When you submit a model on the "Submit here!" page, it is automatically evaluated on a set of financial benchmarks.
-
-The GPU used for evaluation is operated with the support of __[Wuhan University](http://en.whu.edu.cn/)__ and __[University of Florida](https://www.ufl.edu/)__.
-
-The datasets used for evaluation consist of diverse financial datasets from the `FinBen` benchmark to assess tasks such as sentiment analysis, named entity recognition, question answering, and more.
-
-More details about the benchmarks and the evaluation process are provided on the "About" page.
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = """
-## Introduction
-
-The **Open Financial LLMs Leaderboard (OFLL)** is meticulously designed to rigorously track, rank, and evaluate state-of-the-art models in financial Natural Language Understanding and Prediction. Our leaderboard not only covers standard NLP tasks but also incorporates financial prediction tasks such as stock movement and credit scoring, offering a comprehensive evaluation for real-world financial applications.
-
-## Icons & Model Types
-
-- 🟢 : pretrained or continuously pretrained
-- 🔶 : fine-tuned on domain-specific datasets
-- 💬 : chat models (RLHF, DPO, ORPO, ...)
-- 🤝 : base merges and moerges
-
-If the icon is "?", it indicates that there is insufficient information about the model. Please provide information about the model through an issue! 🤩
-
-**Note 1**: We reserve the right to correct any incorrect tags/icons after manual verification to ensure the accuracy and reliability of the leaderboard.
-
-**Note 2** ⚠️: Some models might be widely discussed as subjects of caution by the community, implying that users should exercise restraint when using them. Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, may be selected as subjects of caution and might result in their deletion from the leaderboard.
-
-## How It Works
-
-📈 We evaluate models using Pixiu, a powerful and straightforward framework to test and assess language models on a large number of different evaluation tasks from FinBen, using datasets validated by financial experts.
-
-### Evaluation Metrics
-
-Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUGE score, BERTScore, and Matthews correlation coefficient (MCC), providing a multidimensional assessment of model performance. Metrics for specific tasks are as follows:
-
-- **FPB**: F1, Accuracy. Financial PhraseBank classification task. This dataset is from the Financial PhraseBank, containing annotated phrases used in financial contexts. The classification task involves determining sentiment (positive, negative, neutral) for each phrase, essential for understanding financial news and reports.
-- **FiQA-SA**: F1. Sentiment analysis on FiQA financial domain. Derived from the FiQA dataset, this task focuses on sentiment analysis in the financial domain, particularly within news and social media. The dataset is crucial for gauging market sentiment based on financial communications.
-- **TSA**: F1, RMSE. Sentiment analysis on social media. The TSA dataset is utilized to analyze sentiment from tweets related to financial markets. The dataset is essential for real-time sentiment analysis, providing insights into market trends influenced by public opinion.
-- **Headlines**: AvgF1. News headline classification. This dataset consists of financial news headlines, with each headline categorized into various financial events or sentiment classes. The task challenges models to understand and classify brief, context-rich text segments that drive market movements.
-- **FOMC**: F1, Accuracy. Hawkish-dovish classification. Derived from transcripts of the Federal Open Market Committee (FOMC) meetings, this dataset involves classifying statements as hawkish or dovish, which indicates the stance of monetary policy. Accurate classification helps predict market reactions to central bank communications.
-- **FinArg-ACC**: F1, Accuracy. Financial argument unit classification. This dataset involves the classification of argument units in financial documents, such as identifying the main claim, supporting evidence, or counterarguments. The task is crucial for automated financial document analysis, enabling the extraction of structured information from unstructured text.
-- **FinArg-ARC**: F1, Accuracy. Financial argument relation classification. This task focuses on classifying relationships between different argument units within financial texts, such as support, opposition, or neutrality. Understanding these relations is critical for constructing coherent financial narratives from fragmented data.
-- **MultiFin**: F1, Accuracy. Multi-class financial sentiment analysis. The MultiFin dataset includes diverse financial texts requiring sentiment classification across multiple categories, such as bullish, bearish, or neutral. The task is pivotal for analyzing sentiment in financial markets from varied sources like reports, news articles, and social media.
-- **MA**: F1, Accuracy. Deal completeness classification. The dataset revolves around classifying mergers and acquisitions (M&A) reports to determine whether a deal has been completed. The task helps in tracking and analyzing the outcomes of corporate transactions, which is key for investment decisions.
-- **MLESG**: F1, Accuracy. ESG issue identification. This dataset focuses on identifying Environmental, Social, and Governance (ESG) issues within financial texts. Models are evaluated on their ability to correctly classify and categorize ESG-related content, which is increasingly important for responsible investing.
-- **NER**: EntityF1. Named entity recognition in financial texts. This task involves identifying and classifying named entities (e.g., companies, financial instruments, persons) within financial documents. Accurate NER is crucial for information extraction and financial analysis automation.
-- **FINER-ORD**: EntityF1. Ordinal classification in financial NER. This dataset extends standard NER by requiring models to classify entities not just by type but also by their ordinal relevance (e.g., primary, secondary importance) within the text. This is useful for prioritizing information in financial summaries.
-- **FinRED**: F1, EntityF1. Financial relation extraction from text. The task involves extracting relationships between financial entities, such as ownership, acquisition, or partnership relations. This is important for building knowledge graphs and conducting in-depth financial analysis.
-- **SC**: F1, EntityF1. Causal classification task in the financial domain. The dataset requires models to classify causal relationships in financial texts, such as determining whether one event causes another. Understanding causality is critical for risk assessment and decision-making in finance.
-- **CD**: F1, EntityF1. Causal detection. Similar to SC, but focused on detecting causality in a broader range of financial texts, including reports, news, and social media. The task evaluates the model's ability to identify causal links, which are key drivers in financial analysis.
-- **FinQA**: EmAcc. Numerical question answering in finance. FinQA involves answering numerical questions based on financial documents, such as balance sheets or income statements. The task tests a model's ability to perform calculations or identify numerical data in a text.
-- **TATQA**: F1, EmAcc. Table-based question answering in financial documents. This task is centered around answering questions that require interpreting and extracting information from tables in financial documents. It's crucial for automating the analysis of structured financial data.
-- **ConvFinQA**: EmAcc. Multi-turn question answering in finance. ConvFinQA extends standard QA tasks by requiring models to handle multi-turn dialogues, where each question builds on the previous one. This simulates real-world scenarios where financial analysts ask a series of related questions.
-- **FNXL**: F1, EmAcc. Numeric labeling in financial texts. This dataset requires models to label numeric values within financial documents, categorizing them by type (e.g., revenue, profit) and relevance. It tests the model's ability to understand the role of numbers in financial contexts.
-- **FSRL**: F1, EmAcc. Financial statement relation linking. The task involves linking related information across different financial statements, such as matching revenue figures from income statements with corresponding cash flow data. This is key for comprehensive financial analysis.
-- **EDTSUM**: ROUGE, BERTScore, BARTScore. Extractive document summarization in finance. The dataset involves summarizing lengthy financial documents by extracting the most relevant sentences. This task evaluates a model's ability to generate concise summaries that retain critical information.
-- **ECTSUM**: ROUGE, BERTScore, BARTScore. Extractive content summarization. Similar to EDTSUM, but with a broader focus on summarizing content from various financial document types, including reports, articles, and regulatory filings.
-- **BigData22**: Accuracy, MCC. Stock movement prediction. This dataset is used for predicting stock price movements based on financial news and reports. The task evaluates a model's ability to forecast market trends, which is essential for investment strategies.
-- **ACL18**: Accuracy, MCC. Financial news-based stock prediction. The ACL18 dataset focuses on predicting stock movements specifically using news headlines and articles. It's a benchmark for evaluating the impact of news on stock prices.
-- **CIKM18**: Accuracy, MCC. Financial market prediction using news. This task involves predicting broader market movements, such as indices, based on financial news. It tests the model's ability to aggregate and interpret multiple sources of financial information.
-- **German**: F1, MCC. Credit scoring in the German market. The dataset includes data on loan applicants in Germany, with the task being to predict creditworthiness. This is important for financial institutions in assessing loan risks.
-- **Australian**: F1, MCC. Credit scoring in the Australian market. Similar to the German dataset, but tailored for the Australian financial context, this task evaluates the model's ability to predict credit risk in this specific market.
-- **LendingClub**: F1, MCC. Peer-to-peer lending risk prediction. This dataset involves predicting the risk of default for loans issued through the LendingClub platform, which is a major peer-to-peer lending service. The task is crucial for risk management in alternative finance.
-- **ccf**: F1, MCC. Credit card fraud detection. The dataset is used to identify fraudulent transactions within a large dataset of credit card operations. Accurate detection is critical for financial security and fraud prevention.
-- **ccfraud**: F1, MCC. Credit card transaction fraud detection. Similar to the ccf dataset but focusing on transaction-level analysis, this task evaluates the model's ability to detect anomalies that indicate fraud.
-- **polish**: F1, MCC. Credit risk prediction in the Polish market. This task involves predicting the likelihood of default for loan applicants in Poland, with the dataset tailored to local economic and financial conditions.
-- **taiwan**: F1, MCC. Credit risk prediction in the Taiwanese market. Similar to the Polish dataset but focused on Taiwan, this task evaluates the model's ability to assess credit risk in this market.
-- **portoseguro**: F1, MCC. Claim analysis in the Brazilian market. The dataset involves predicting insurance claim risks in Brazil, specifically for auto insurance. The task tests the model's ability to assess and manage insurance risks.
-- **travelinsurance**: F1, MCC. Travel insurance claim prediction. This dataset is used for predicting the likelihood of a travel insurance claim being made, which is important for risk pricing and policy management in the travel insurance industry.
-- **MultiFin-ES**: F1. Multi-class financial sentiment analysis in Spanish. This dataset is used to analyze sentiment in Spanish-language financial texts. It evaluates the model's ability to handle sentiment classification across multiple categories in a non-English context.
-- **EFP**: F1. Financial phrase classification in Spanish. Similar to the FPB dataset but in Spanish, this task involves classifying financial phrases according to sentiment or intent, specifically for Spanish-language content.
-- **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
-- **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
-- **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
-- **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
-
-
-
-To ensure a fair and unbiased assessment of the models' true capabilities, all evaluations are conducted in zero-shot settings (0-shots). This approach eliminates any potential advantage from task-specific fine-tuning, providing a clear indication of how well the models can generalize to new tasks.
-
-Given the nature of the tasks, which include multiple-choice and yes/no questions, we extract options from the generated text to evaluate performance.
-
-Please consider reaching out to us through the discussions tab if you are working on benchmarks for financial LLMs and would like to see them on this leaderboard as well. Your benchmark might change the whole game for financial models!
-
-GPUs are provided by Wuhan University and the University of Florida for the evaluations.
-
-## Details and Logs
-
-- Detailed numerical results in the [results FinBen dataset](https://huggingface.co/datasets/FinBen/results)
-- Community queries and running status in the [requests FinBen dataset](https://huggingface.co/datasets/FinBen/requests)
-
-## More Resources
-
-If you still have questions, you can check our GitHub repository [here](https://github.com/The-FinAI/PIXIU).
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAI Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-@article{Xie2024FinBen,
-  title={FinBen: A Holistic Financial Benchmark for Large Language Models},
-  author={Qianqian Xie and Weiguang Han and Zhengyu Chen and Ruoyu Xiang and Xiao Zhang and Yueru He and Mengxi Xiao and Dong Li and Yongfu Dai and Duanyu Feng and Yijing Xu and Haoqiang Kang and Ziyan Kuang and Chenhan Yuan and Kailai Yang and Zheheng Luo and Tianlin Zhang and Zhiwei Liu and Guojun Xiong and Zhiyang Deng and Yuechen Jiang and Zhiyuan Yao and Haohang Li and Yangyang Yu and Gang Hu and Jiajia Huang and Xiao-Yang Liu and Alejandro Lopez-Lira and Benyou Wang and Yanzhao Lai and Hao Wang and Min Peng and Sophia Ananiadou and Jimin Huang},
-  journal={NeurIPS, Special Track on Datasets and Benchmarks},
-  year={2024}
-}
-
-@article{Xie2023PIXIU,
-  title={PIXIU: A comprehensive benchmark, instruction dataset and large language model for finance},
-  author={Qianqian Xie and Weiguang Han and Xiao Zhang and Yanzhao Lai and Min Peng and Alejandro Lopez-Lira and Jimin Huang},
-  journal={NeurIPS, Special Track on Datasets and Benchmarks},
-  year={2023}
-}
-"""

src/display/css_html_js.py
DELETED
@@ -1,105 +0,0 @@
-custom_css = """
-
-.markdown-text {
-  font-size: 16px !important;
-}
-
-#models-to-add-text {
-  font-size: 18px !important;
-}
-
-#citation-button span {
-  font-size: 16px !important;
-}
-
-#citation-button textarea {
-  font-size: 16px !important;
-}
-
-#citation-button > label > button {
-  margin: 6px;
-  transform: scale(1.3);
-}
-
-#leaderboard-table {
-  margin-top: 15px
-}
-
-#leaderboard-table-lite {
-  margin-top: 15px
-}
-
-#search-bar-table-box > div:first-child {
-  background: none;
-  border: none;
-}
-
-#search-bar {
-  padding: 0px;
-}
-
-/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
-table td:first-child,
-table th:first-child {
-  max-width: 400px;
-  overflow: auto;
-  white-space: nowrap;
-}
-
-.tab-buttons button {
-  font-size: 20px;
-}
-
-#scale-logo {
-  border-style: none !important;
-  box-shadow: none;
-  display: block;
-  margin-left: auto;
-  margin-right: auto;
-  max-width: 600px;
-}
-
-#scale-logo .download {
-  display: none;
-}
-#filter_type{
-  border: 0;
-  padding-left: 0;
-  padding-top: 0;
-}
-#filter_type label {
-  display: flex;
-}
-#filter_type label > span{
-  margin-top: var(--spacing-lg);
-  margin-right: 0.5em;
-}
-#filter_type label > .wrap{
-  width: 103px;
-}
-#filter_type label > .wrap .wrap-inner{
-  padding: 2px;
-}
-#filter_type label > .wrap .wrap-inner input{
-  width: 1px
-}
-#filter-columns-type{
-  border:0;
-  padding:0.5;
-}
-#filter-columns-size{
-  border:0;
-  padding:0.5;
-}
-#box-filter > .form{
-  border: 0
-}
-"""
-
-get_window_url_params = """
-    function(url_params) {
-        const params = new URLSearchParams(window.location.search);
-        url_params = Object.fromEntries(params);
-        return url_params;
-    }
-    """

src/display/formatting.py
DELETED
@@ -1,27 +0,0 @@
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
-
-
-def styled_error(error):
-    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
-
-def styled_warning(warn):
-    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
-
-def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)

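The deleted formatting helpers are pure string builders, so their behaviour is easy to check in isolation. A minimal sketch, assuming the removed `src/` package layout is still importable (the model id below is only illustrative):

```python
# Sketch only: imports assume the removed src/display/formatting.py is on the path.
from src.display.formatting import make_clickable_model, styled_message

# Builds a dotted-underline anchor tag pointing at https://huggingface.co/<model_name>
print(make_clickable_model("TheFinAI/finma-7b-full"))

# Wraps a status string in a green, centered <p> for display in the Gradio UI
print(styled_message("Your request has been submitted to the evaluation queue!"))
```
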
src/display/utils.py
DELETED
@@ -1,146 +0,0 @@
-from dataclasses import dataclass, make_dataclass
-from enum import Enum
-
-import pandas as pd
-
-from src.about import Tasks
-
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    category: str = ""  # New attribute to hold the category
-    hidden: bool = False
-    never_hidden: bool = False
-
-## Leaderboard columns
-auto_eval_column_dict = []
-
-# Model Information
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, category="Model Information", never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, category="Model Information", never_hidden=True)])
-
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, category="Model Information")])
-auto_eval_column_dict.append(["average_IE", ColumnContent, ColumnContent("Average IE ⬆️", "number", True, category="Information Extraction (IE)")])
-auto_eval_column_dict.append(["average_TA", ColumnContent, ColumnContent("Average TA ⬆️", "number", True, category="Textual Analysis (TA)")])
-auto_eval_column_dict.append(["average_QA", ColumnContent, ColumnContent("Average QA ⬆️", "number", True, category="Question Answering (QA)")])
-auto_eval_column_dict.append(["average_TG", ColumnContent, ColumnContent("Average TG ⬆️", "number", True, category="Text Generation (TG)")])
-auto_eval_column_dict.append(["average_RM", ColumnContent, ColumnContent("Average RM ⬆️", "number", True, category="Risk Management (RM)")])
-auto_eval_column_dict.append(["average_FO", ColumnContent, ColumnContent("Average FO ⬆️", "number", True, category="Forecasting (FO)")])
-auto_eval_column_dict.append(["average_DM", ColumnContent, ColumnContent("Average DM ⬆️", "number", True, category="Decision-Making (DM)")])
-auto_eval_column_dict.append(["average_Spanish", ColumnContent, ColumnContent("Average Spanish ⬆️", "number", True, category="Spanish")])
-
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, category="Model Information")])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False, category="Model Information")])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, category="Model Information", hidden=True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False, category="Model Information")])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False, category="Model Information")])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False, category="Model Information")])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, category="Model Information")])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, category="Model Information")])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, category="Model Information", hidden=False)])
-
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False, category=task.value.category)])
-
-# We use make_dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        #if precision in ["8bit"]:
-        #    return Precision.qt_8bit
-        #if precision in ["4bit"]:
-        #    return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
-        #    return Precision.qt_GPTQ
-        return Precision.Unknown
-
-# Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
-NUMERIC_INTERVALS = {
-    "?": pd.Interval(-1, 0, closed="right"),
-    "~1.5": pd.Interval(0, 2, closed="right"),
-    "~3": pd.Interval(2, 4, closed="right"),
-    "~7": pd.Interval(4, 9, closed="right"),
-    "~13": pd.Interval(9, 20, closed="right"),
-    "~35": pd.Interval(20, 45, closed="right"),
-    "~60": pd.Interval(45, 70, closed="right"),
-    "70+": pd.Interval(70, 10000, closed="right"),
-}

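The removed enums normalise free-form metadata coming from request files into fixed model-type and precision values. A small sketch of the intended round-trip, under the same assumption that the old package layout is importable:

```python
# Sketch only: relies on the removed src/display/utils.py.
from src.display.utils import ModelType, Precision

# from_str matches either the name or the emoji tag stored in request files
assert ModelType.from_str("🔶 fine-tuned") is ModelType.FT
assert Precision.from_str("torch.bfloat16") is Precision.bfloat16

print(ModelType.FT.to_str())  # "🔶 fine-tuned"
```
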
src/envs.py
DELETED
@@ -1,25 +0,0 @@
-import os
-
-from huggingface_hub import HfApi
-
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
-
-OWNER = "TheFinAI"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
-# ----------------------------------
-
-REPO_ID = f"{OWNER}/FinBen-Leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
-
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH = os.getenv("HF_HOME", ".")
-
-# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
-API = HfApi(token=TOKEN)

src/leaderboard/read_evals.py
DELETED
@@ -1,273 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-task_benchmarks = {task.value.benchmark for task in Tasks}
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            print(json_filepath)
-            data = json.load(fp)
-
-        config = data.get("config")
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # ModelType
-        model_type = ModelType.from_str(config.get("model_type"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        # Print missing benchmarks if any
-        missing_benchmarks = task_benchmarks - results.keys()
-        if missing_benchmarks:
-            print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
-            for benchmark in missing_benchmarks:
-                results[benchmark] = "missing"
-
-
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-            model_type=model_type
-        )
-
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-
-        # Initialize category averages
-        category_averages = {
-            "average_IE": [],
-            "average_TA": [],
-            "average_QA": [],
-            "average_TG": [],
-            "average_RM": [],
-            "average_FO": [],
-            "average_DM": [],
-            "average_Spanish": []
-        }
-
-        # Calculate averages for each task
-        for task in Tasks:
-            score = self.results.get(task.value.benchmark)
-            if score is not None:
-                # Append score to the appropriate category
-                if task.value.category == "Information Extraction (IE)":
-                    category_averages["average_IE"].append(score)
-                elif task.value.category == "Textual Analysis (TA)":
-                    category_averages["average_TA"].append(score)
-                elif task.value.category == "Question Answering (QA)":
-                    category_averages["average_QA"].append(score)
-                elif task.value.category == "Text Generation (TG)":
-                    category_averages["average_TG"].append(score)
-                elif task.value.category == "Risk Management (RM)":
-                    if score == "missing":
-                        category_averages["average_RM"].append(score)
-                    else:
-                        category_averages["average_RM"].append((score + 100) / 2)
-                elif task.value.category == "Forecasting (FO)":
-                    category_averages["average_FO"].append(score)
-                elif task.value.category == "Decision-Making (DM)":
-                    if task.value.benchmark == "FinTrade" and score != "missing":
-                        category_averages["average_DM"].append((score + 300)/6)
-                    else:
-                        category_averages["average_DM"].append(score)
-                elif task.value.category == "Spanish":
-                    category_averages["average_Spanish"].append(score)
-
-        # Calculate the mean for each category and add to data_dict
-        data_dict = {}
-        for category, scores in category_averages.items():
-            # Calculate the average if there are valid scores, otherwise set to 0
-            valid_scores = [score for score in scores if score != "missing"]
-            if valid_scores:
-                average = sum(valid_scores) / len(valid_scores)
-            else:
-                average = 0
-            data_dict[category] = average
-
-        # Overall average
-        total_scores = [v for v in self.results.values() if v != "missing"]
-        overall_average = sum(total_scores) / len(total_scores) if total_scores else 0
-
-        # Add other columns
-        data_dict.update({
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: overall_average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        })
-
-        # Add task results to the data dictionary
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results.get(task.value.benchmark)
-
-        return data_dict
-
-
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    print(f"Found {len(model_result_filepaths)} JSON files to process.")
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    print(f"Successfully loaded {len(results)} models.")
-    return results

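For reference, `get_raw_eval_results` was the entry point of this removed module: it walks a results folder, merges per-file scores by model/precision pair, and drops entries whose dictionary conversion fails. A minimal usage sketch, with paths matching the local-cache defaults in the removed src/envs.py:

```python
# Sketch only: folder names mirror EVAL_RESULTS_PATH / EVAL_REQUESTS_PATH from the removed src/envs.py.
from src.leaderboard.read_evals import get_raw_eval_results

raw_results = get_raw_eval_results("eval-results", "eval-queue")
rows = [r.to_dict() for r in raw_results]  # one dict per model/precision, keyed by display column names
```
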
src/populate.py
DELETED
@@ -1,99 +0,0 @@
-import json
-import os
-import pandas as pd
-import numpy as np
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-
-    # Add category average columns with default values
-    category_avg_columns = {
-        "Average IE ⬆️": "average_IE",
-        "Average TA ⬆️": "average_TA",
-        "Average QA ⬆️": "average_QA",
-        "Average TG ⬆️": "average_TG",
-        "Average RM ⬆️": "average_RM",
-        "Average FO ⬆️": "average_FO",
-        "Average DM ⬆️": "average_DM",
-        "Average Spanish ⬆️": "average_Spanish"
-    }
-
-    for display_name, internal_name in category_avg_columns.items():
-        df[display_name] = df[internal_name]
-
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-
-    # Apply the transformation for MCC values
-    mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
-    for task in mcc_tasks:
-        if task in df.columns:
-            df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
-
-    for index, row in df.iterrows():
-        if "FinTrade" in row and row["FinTrade"] != "missing":
-            df.loc[index, "FinTrade"] = (row["FinTrade"] + 300) / 6
-
-    # Now, select the columns that were passed to the function
-    df = df[cols]
-
-    # Function to round numeric values, including those in string format
-    def round_numeric(x):
-        try:
-            return round(float(x), 1)
-        except ValueError:
-            return x
-
-    # Apply rounding to all columns except 'T' and 'Model'
-    for col in df.columns:
-        if col not in ['T', 'Model']:
-            df[col] = df[col].apply(round_numeric)
-
-    # Filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-
-    return raw_data, df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requests"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]

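`get_leaderboard_df` glued the removed pieces together into the displayed table. A sketch of how it was typically called, under the same assumption that the old layout is importable (column constants come from the removed src/display/utils.py):

```python
# Sketch only: COLS and BENCHMARK_COLS are defined in the removed src/display/utils.py.
from src.display.utils import COLS, BENCHMARK_COLS
from src.populate import get_leaderboard_df

raw_data, leaderboard_df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
print(leaderboard_df.head())  # rows sorted by the overall "Average ⬆️" column
```
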
src/submission/check_validity.py
DELETED
@@ -1,99 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-
-                    # Select organisation
-                    if info["model"].count("/") == 0 or "submitted_time" not in info:
-                        continue
-                    organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
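For context on what the removed validity helpers enforced, here is a minimal sketch of the two main checks (a filled-in model card with a license, and a config loadable without remote code) using only public `huggingface_hub` / `transformers` APIs. `bert-base-uncased` is just an example repo id; the snippet mirrors the deleted `check_model_card` / `is_model_on_hub` logic but is not the leaderboard's code.

```python
from huggingface_hub import ModelCard
from transformers import AutoConfig

repo_id = "bert-base-uncased"  # example repo id, not tied to this leaderboard

# Card check: license metadata present and description long enough (mirrors check_model_card).
card = ModelCard.load(repo_id)
has_license = card.data.license is not None or (
    card.data.get("license_name") and card.data.get("license_link")
)
long_enough = len(card.text) >= 200

# Hub check: config must load without trust_remote_code (mirrors is_model_on_hub).
config = AutoConfig.from_pretrained(repo_id, revision="main")

print(has_license, long_enough, getattr(config, "architectures", None))
```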
src/submission/submit.py
DELETED
@@ -1,119 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
-
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
-    # Seems good, creating the eval
-    print("Adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-
-    # Remove the local file
-    os.remove(out_path)
-
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )
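For reference, the removed `add_new_eval` wrote one JSON request file per submission and uploaded it to the queue dataset repo. The sketch below shows the shape of that request and the file-naming scheme; all field values are made up, and the path is illustrative rather than a real queue location.

```python
import json
from datetime import datetime, timezone

# Hypothetical values; the field names match the eval_entry dict in the deleted submit.py.
eval_entry = {
    "model": "org/my-model",
    "base_model": "",
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "status": "PENDING",
    "submitted_time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "model_type": "fine-tuned",
    "likes": 0,
    "params": 7.0,
    "license": "apache-2.0",
    "private": False,
}

# Same naming scheme as the removed code:
# <model_path>_eval_request_False_<precision>_<weight_type>.json under the submitting user's folder.
out_path = "eval-queue/org/my-model_eval_request_False_float16_Original.json"
print(out_path)
print(json.dumps(eval_entry, indent=2))
```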