zswzswzsw committed (verified)
Commit ae40651 · 1 Parent(s): 663f446

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .github/workflows/build_documentation.yml +18 -0
  2. .github/workflows/build_pr_documentation.yml +19 -0
  3. .github/workflows/quality.yml +31 -0
  4. .github/workflows/tests.yml +31 -0
  5. .github/workflows/upload_pr_documentation.yml +16 -0
  6. .gitignore +164 -0
  7. CITATION.cff +29 -0
  8. LICENSE +201 -0
  9. Makefile +44 -0
  10. README.md +130 -0
  11. assets/handbook.png +0 -0
  12. chapters/en/_toctree.yml +4 -0
  13. chapters/en/chapter0/introduction.mdx +3 -0
  14. config_dpo_run.yaml +42 -0
  15. config_grpo_offline.yaml +45 -0
  16. config_sft_test_env.yaml +42 -0
  17. grpo_offline_run.py +217 -0
  18. recipes/accelerate_configs/deepspeed_zero3.yaml +22 -0
  19. recipes/accelerate_configs/fsdp.yaml +26 -0
  20. recipes/accelerate_configs/fsdp_qlora.yaml +25 -0
  21. recipes/accelerate_configs/multi_gpu.yaml +16 -0
  22. recipes/constitutional-ai/README.md +24 -0
  23. recipes/constitutional-ai/dpo/config_anthropic.yaml +41 -0
  24. recipes/constitutional-ai/sft/config_anthropic.yaml +48 -0
  25. recipes/constitutional-ai/sft/config_grok.yaml +48 -0
  26. recipes/gpt2-nl/README.md +43 -0
  27. recipes/gpt2-nl/cpt/config_full.yaml +45 -0
  28. recipes/gpt2-nl/dpo/config_full.yaml +44 -0
  29. recipes/gpt2-nl/sft/config_full.yaml +45 -0
  30. recipes/launch.slurm +86 -0
  31. recipes/pref_align_scan/README.md +49 -0
  32. recipes/pref_align_scan/dpo/config_openhermes.yaml +41 -0
  33. recipes/pref_align_scan/dpo/config_zephyr.yaml +39 -0
  34. recipes/pref_align_scan/launch_scan.sh +24 -0
  35. recipes/smollm/README.md +19 -0
  36. recipes/smollm/sft/config.yaml +53 -0
  37. recipes/smollm2/README.md +28 -0
  38. recipes/smollm2/dpo/config.yaml +43 -0
  39. recipes/smollm2/dpo/config_smol.yaml +43 -0
  40. recipes/smollm2/sft/config.yaml +49 -0
  41. recipes/smollm2/sft/config_smol.yaml +46 -0
  42. recipes/starchat2-15b/README.md +21 -0
  43. recipes/starchat2-15b/dpo/config_v0.1.yaml +43 -0
  44. recipes/starchat2-15b/sft/config_v0.1.yaml +49 -0
  45. recipes/zephyr-141b-A35b/README.md +23 -0
  46. recipes/zephyr-141b-A35b/orpo/config_full.yaml +39 -0
  47. recipes/zephyr-7b-beta/README.md +44 -0
  48. recipes/zephyr-7b-beta/dpo/config_full.yaml +41 -0
  49. recipes/zephyr-7b-beta/dpo/config_qlora.yaml +57 -0
  50. recipes/zephyr-7b-beta/sft/config_full.yaml +46 -0
.github/workflows/build_documentation.yml ADDED
@@ -0,0 +1,18 @@
+ name: Build documentation
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   build:
+     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+     with:
+       commit_sha: ${{ github.sha }}
+       package: alignment-handbook
+       path_to_docs: alignment-handbook/chapters/
+       additional_args: --not_python_module
+       languages: en
+     secrets:
+       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
.github/workflows/build_pr_documentation.yml ADDED
@@ -0,0 +1,19 @@
+ name: Build PR Documentation
+
+ on:
+   pull_request:
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+   cancel-in-progress: true
+
+ jobs:
+   build:
+     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+     with:
+       commit_sha: ${{ github.event.pull_request.head.sha }}
+       pr_number: ${{ github.event.number }}
+       package: alignment-handbook
+       path_to_docs: alignment-handbook/chapters/
+       additional_args: --not_python_module
+       languages: en
.github/workflows/quality.yml ADDED
@@ -0,0 +1,31 @@
+ name: Quality
+
+ on:
+   push:
+     branches:
+       - main
+       - v*-release
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+
+   check_code_quality:
+     name: Check code quality
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+       - name: Setup Python environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.10.10
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           python -m pip install ".[quality]"
+       - name: Code quality
+         run: |
+           make quality
+
.github/workflows/tests.yml ADDED
@@ -0,0 +1,31 @@
+ name: Tests
+
+ on:
+   push:
+     branches:
+       - main
+       - v*-release
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+
+   unit-tests:
+     name: Run unit tests
+     env:
+       HF_TOKEN: ${{ secrets.HF_TOKEN }}
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+       - name: Setup Python environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.10.10
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           python -m pip install ".[dev, torch]"
+       - name: Run unit tests
+         run: HF_TOKEN=$HF_TOKEN pytest -sv tests/
.github/workflows/upload_pr_documentation.yml ADDED
@@ -0,0 +1,16 @@
+ name: Upload PR Documentation
+
+ on:
+   workflow_run:
+     workflows: ["Build PR Documentation"]
+     types:
+       - completed
+
+ jobs:
+   build:
+     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
+     with:
+       package_name: alignment-handbook
+     secrets:
+       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,164 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ # Temp folders
163
+ data/
164
+ wandb/
CITATION.cff ADDED
@@ -0,0 +1,29 @@
+ cff-version: 1.2.0
+ title: The Alignment Handbook
+ message: >-
+   Robust recipes to align language models with human and AI
+   preferences.
+ type: software
+ authors:
+   - given-names: Lewis
+     family-names: Tunstall
+   - given-names: Edward
+     family-names: Beeching
+   - given-names: Nathan
+     family-names: Lambert
+   - given-names: Nazneen
+     family-names: Rajani
+   - given-names: Shengyi
+     family-names: Huang
+   - given-names: Kashif
+     family-names: Rasul
+   - given-names: Alvaro
+     family-names: Bartolome
+   - given-names: Alexander
+     name-particle: M.
+     family-names: Rush
+   - given-names: Thomas
+     family-names: Wolf
+ repository-code: 'https://github.com/huggingface/alignment-handbook'
+ license: Apache-2.0
+ version: 0.3.0.dev0
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,44 @@
+ .PHONY: style quality
+
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
+ export PYTHONPATH = src
+
+ check_dirs := src tests scripts
+
+ style:
+ 	black --line-length 119 --target-version py310 $(check_dirs) setup.py
+ 	isort $(check_dirs) setup.py
+
+ quality:
+ 	black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
+ 	isort --check-only $(check_dirs) setup.py
+ 	flake8 --max-line-length 119 $(check_dirs) setup.py
+
+
+ # Release stuff
+
+ pre-release:
+ 	python src/alignment/release.py
+
+ pre-patch:
+ 	python src/alignment/release.py --patch
+
+ post-release:
+ 	python src/alignment/release.py --post_release
+
+ post-patch:
+ 	python src/alignment/release.py --post_release --patch
+
+ wheels:
+ 	python setup.py bdist_wheel && python setup.py sdist
+
+ wheels_clean:
+ 	rm -rf build && rm -rf dist
+
+ pypi_upload:
+ 	python -m pip install twine
+ 	twine upload dist/* -r pypi
+
+ pypi_test_upload:
+ 	python -m pip install twine
+ 	twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
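For reference, the `style` and `quality` targets above run the same checks as the Quality workflow; a typical local invocation might look like this (a sketch, assuming the project's `[quality]` extra from setup.py has been installed):

```shell
# install the formatting/linting tools, auto-format the tree, then run the same checks as CI
python -m pip install ".[quality]"
make style
make quality
```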
README.md ADDED
@@ -0,0 +1,130 @@
+ <p align="center">
+   <img src="https://raw.githubusercontent.com/huggingface/alignment-handbook/main/assets/handbook.png">
+ </p>
+
+ <p align="center">
+   🤗 <a href="https://huggingface.co/collections/alignment-handbook/handbook-v01-models-and-datasets-654e424d22e6880da5ebc015" target="_blank">Models & Datasets</a> | 📃 <a href="https://arxiv.org/abs/2310.16944" target="_blank">Technical Report</a>
+ </p>
+
+ # The Alignment Handbook
+
+ Robust recipes to continue pretraining and to align language models with human and AI preferences.
+
+ ## What is this?
+
+ Just one year ago, chatbots were out of fashion and most people hadn't heard about techniques like Reinforcement Learning from Human Feedback (RLHF) to align language models with human preferences. Then, OpenAI broke the internet with ChatGPT and Meta followed suit by releasing the Llama series of language models which enabled the ML community to build their very own capable chatbots. This has led to a rich ecosystem of datasets and models that have mostly focused on teaching language models to follow instructions through supervised fine-tuning (SFT).
+
+ However, we know from the [InstructGPT](https://huggingface.co/papers/2203.02155) and [Llama2](https://huggingface.co/papers/2307.09288) papers that significant gains in helpfulness and safety can be had by augmenting SFT with human (or AI) preferences. At the same time, aligning language models to a set of preferences is a fairly novel idea and there are few public resources available on how to train these models, what data to collect, and what metrics to measure for best downstream performance.
+
+ The Alignment Handbook aims to fill that gap by providing the community with a series of robust training recipes that span the whole pipeline.
+
+ ## News 🗞️
+ * **November 21, 2024**: We release the [recipe](recipes/smollm2/README.md) for fine-tuning SmolLM2-Instruct.
+ * **August 18, 2024**: We release SmolLM-Instruct v0.2, along with the [recipe](recipes/smollm/README.md) to fine-tune small LLMs 💻
+ * **April 12, 2024**: We release Zephyr 141B (A35B), in collaboration with Argilla and Kaist AI, along with the recipe to fine-tune Mixtral 8x22B with ORPO 🪁
+ * **March 12, 2024:** We release StarChat2 15B, along with the recipe to train capable coding assistants 🌟
+ * **March 1, 2024:** We release Zephyr 7B Gemma, which is a new recipe to align Gemma 7B with RLAIF 🔥
+ * **February 1, 2024:** We release a recipe to align open LLMs with Constitutional AI 📜! See the [recipe](https://github.com/huggingface/alignment-handbook/tree/main/recipes/constitutional-ai) and the [blog post](https://huggingface.co/blog/constitutional_ai) for details.
+ * **January 18, 2024:** We release a suite of evaluations of DPO vs KTO vs IPO, see the [recipe](recipes/pref_align_scan/README.md) and the [blog post](https://huggingface.co/blog/pref-tuning) for details.
+ * **November 10, 2023:** We release all the training code to replicate Zephyr-7b-β 🪁! We also release [No Robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots), a brand new dataset of 10,000 instructions and demonstrations written entirely by skilled human annotators.
+
+ ## Links 🔗
+
+ * [Zephyr 7B models, datasets, and demos](https://huggingface.co/collections/HuggingFaceH4/zephyr-7b-6538c6d6d5ddd1cbb1744a66)
+
+ ## How to navigate this project 🧭
+
+ This project is simple by design and mostly consists of:
+
+ * [`scripts`](./scripts/) to train and evaluate models. Four steps are included: continued pretraining, supervised fine-tuning (SFT) for chat, preference alignment with DPO, and combined SFT and preference alignment with ORPO. Each script supports distributed training of the full model weights with DeepSpeed ZeRO-3, or LoRA/QLoRA for parameter-efficient fine-tuning (see the example launch command below).
+ * [`recipes`](./recipes/) to reproduce models like Zephyr 7B. Each recipe takes the form of a YAML file which contains all the parameters associated with a single training run. A `gpt2-nl` recipe is also given to illustrate how this handbook can be used for language or domain adaptation, e.g. by continuing to pretrain on a different language, and then SFT and DPO tuning the result.
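As a rough sketch of how these pieces fit together, a recipe YAML is passed to one of the training scripts together with an Accelerate config (the files named below are just examples taken from this repo's recipes; adjust them to your setup):

```shell
# illustrative launch: full-weight SFT on one node with DeepSpeed ZeRO-3
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
```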
+
+ We are also working on a series of guides to explain how methods like direct preference optimization (DPO) work, along with lessons learned from gathering human preferences in practice. To get started, we recommend the following:
+
+ 1. Follow the [installation instructions](#installation-instructions) to set up your environment etc.
+ 2. Replicate Zephyr-7b-β by following the [recipe instructions](./recipes/zephyr-7b-beta/README.md).
+
+ If you would like to train chat models on your own datasets, we recommend following the dataset formatting instructions [here](./scripts/README.md#fine-tuning-on-your-datasets).
+
+
+ ## Contents
+
+ The initial release of the handbook will focus on the following techniques:
+
+ * **Continued pretraining:** adapt language models to a new language or domain, or simply improve them by continued pretraining (causal language modeling) on a new dataset.
+ * **Supervised fine-tuning:** teach language models to follow instructions, with tips on how to collect and curate your training dataset.
+ * **Reward modeling:** teach language models to distinguish model responses according to human or AI preferences.
+ * **Rejection sampling:** a simple, but powerful technique to boost the performance of your SFT model.
+ * **Direct preference optimisation (DPO):** a powerful and promising alternative to PPO.
+ * **Odds Ratio Preference Optimisation (ORPO)**: a technique to fine-tune language models with human preferences, combining SFT and DPO in a single stage.
+
+ ## Installation instructions
+
+ To run the code in this project, first create a Python virtual environment using e.g. Conda:
+
+ ```shell
+ conda create -n handbook python=3.10 && conda activate handbook
+ ```
+
+ Next, install PyTorch `v2.1.2` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+ direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
+
+ You can then install the remaining package dependencies as follows:
+
+ ```shell
+ git clone https://github.com/huggingface/alignment-handbook.git
+ cd ./alignment-handbook/
+ python -m pip install .
+ ```
+
+ You will also need Flash Attention 2 installed, which can be done by running:
+
+ ```shell
+ python -m pip install flash-attn --no-build-isolation
+ ```
+
+ > **Note**
+ > If your machine has less than 96GB of RAM and many CPU cores, reduce the `MAX_JOBS` argument, e.g. `MAX_JOBS=4 pip install flash-attn --no-build-isolation`
+
+ Next, log into your Hugging Face account as follows:
+
+ ```shell
+ huggingface-cli login
+ ```
+
+ Finally, install Git LFS so that you can push models to the Hugging Face Hub:
+
+ ```shell
+ sudo apt-get install git-lfs
+ ```
+
+ You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁!
+
+ ## Project structure
+
+ ```
+ ├── LICENSE
+ ├── Makefile    <- Makefile with commands like `make style`
+ ├── README.md   <- The top-level README for developers using this project
+ ├── chapters    <- Educational content to render on hf.co/learn
+ ├── recipes     <- Recipe configs, accelerate configs, slurm scripts
+ ├── scripts     <- Scripts to train and evaluate chat models
+ ├── setup.cfg   <- Installation config (mostly used for configuring code quality & tests)
+ ├── setup.py    <- Makes project pip installable (pip install -e .) so `alignment` can be imported
+ ├── src         <- Source code for use in this project
+ └── tests       <- Unit tests
+ ```
+
+ ## Citation
+
+ If you find the content of this repo useful in your work, please cite it as follows via `\usepackage{biblatex}`:
+
+ ```bibtex
+ @software{Tunstall_The_Alignment_Handbook,
+   author = {Tunstall, Lewis and Beeching, Edward and Lambert, Nathan and Rajani, Nazneen and Huang, Shengyi and Rasul, Kashif and Bartolome, Alvaro and M. Rush, Alexander and Wolf, Thomas},
+   license = {Apache-2.0},
+   title = {{The Alignment Handbook}},
+   url = {https://github.com/huggingface/alignment-handbook},
+   version = {0.3.0.dev0}
+ }
+ ```
assets/handbook.png ADDED
chapters/en/_toctree.yml ADDED
@@ -0,0 +1,4 @@
+ - title: Unit 0. Welcome to the RLHF Handbook!
+   sections:
+   - local: chapter0/introduction
+     title: What is this about?
chapters/en/chapter0/introduction.mdx ADDED
@@ -0,0 +1,3 @@
+ # Welcome to the RLHF Handbook!
+
+ Stay tuned for more details 🤗
config_dpo_run.yaml ADDED
@@ -0,0 +1,42 @@
1
+ # Model arguments
2
+ model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 2
14
+
15
+ # dpo trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_length: 4096
29
+ num_train_epochs: 5
30
+ output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model
31
+ overwrite_output_dir: true
32
+ per_device_eval_batch_size: 1
33
+ per_device_train_batch_size: 1
34
+ push_to_hub: False
35
+ remove_unused_columns: true
36
+ report_to:
37
+ - tensorboard
38
+ save_strategy: "steps"
39
+ save_steps: 51
40
+ save_total_limit: 30
41
+ seed: 42
42
+ warmup_ratio: 0.2
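A config like this is consumed by the handbook's training entry points rather than run directly; assuming the standard `scripts/run_dpo.py` script and the ZeRO-3 Accelerate config from this repo, a launch would look roughly like:

```shell
# sketch: multi-GPU DPO training driven by the YAML above
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    scripts/run_dpo.py config_dpo_run.yaml
```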
config_grpo_offline.yaml ADDED
@@ -0,0 +1,45 @@
1
+ # Model arguments
2
+ model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 32
14
+
15
+ # GRPO trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_prompt_length: 512
29
+ max_completion_length: 512
30
+ num_train_epochs: 5
31
+ output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_grpo
32
+ overwrite_output_dir: true
33
+ # per_device_batch_size = num_generations * per_device_prompt_num (generations sampled per prompt * prompts per device)
34
+ per_device_eval_batch_size: 4
35
+ per_device_train_batch_size: 4
36
+ num_generations: 4
37
+ push_to_hub: False
38
+ remove_unused_columns: false
39
+ report_to:
40
+ - tensorboard
41
+ save_strategy: "steps"
42
+ save_steps: 50
43
+ save_total_limit: 30
44
+ seed: 42
45
+ warmup_ratio: 0.2
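This config is consumed by `grpo_offline_run.py`; the launch command below is the one given in that script's docstring, reformatted over several lines:

```shell
# offline GRPO run with DeepSpeed ZeRO-3 on five GPUs
CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    grpo_offline_run.py config_grpo_offline.yaml
```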
config_sft_test_env.yaml ADDED
@@ -0,0 +1,42 @@
1
+ # Model arguments
2
+ model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 2
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_seq_length: 4096
29
+ num_train_epochs: 5
30
+ output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model
31
+ overwrite_output_dir: true
32
+ per_device_eval_batch_size: 1
33
+ per_device_train_batch_size: 1
34
+ push_to_hub: False
35
+ remove_unused_columns: true
36
+ report_to:
37
+ - tensorboard
38
+ save_strategy: "steps"
39
+ save_steps: 51
40
+ save_total_limit: 30
41
+ seed: 42
42
+ warmup_ratio: 0.2
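As with the DPO config above, this SFT config would typically be passed to the handbook's SFT entry point (a sketch, assuming the standard `scripts/run_sft.py` script):

```shell
# sketch: SFT training driven by the YAML above
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    scripts/run_sft.py config_sft_test_env.yaml
```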
grpo_offline_run.py ADDED
@@ -0,0 +1,217 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Offline GRPO training script for decoder language models.
18
+ CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml grpo_offline_run.py config_grpo_offline.yaml
19
+ """
20
+
21
+ import logging
22
+ import random
23
+ import sys
24
+
25
+ import datasets
26
+ import torch
27
+ import transformers
28
+ from transformers import AutoModelForCausalLM, set_seed
29
+ from trl.data_utils import maybe_apply_chat_template
30
+ from datasets import load_dataset
31
+ from alignment import (
32
+ DataArguments,
33
+ H4ArgumentParser,
34
+ ModelArguments,
35
+ SFTConfig,
36
+ apply_chat_template,
37
+ decontaminate_humaneval,
38
+ get_checkpoint,
39
+ get_datasets,
40
+ get_kbit_device_map,
41
+ get_peft_config,
42
+ get_quantization_config,
43
+ get_tokenizer,
44
+ )
45
+ from trl import SFTTrainer, setup_chat_format
46
+ from trl_012_grpo.grpo_trainer import GRPOTrainer
47
+ from trl_012_grpo.grpo_config import GRPOConfig
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+
52
+ def main():
53
+ parser = H4ArgumentParser((ModelArguments, DataArguments, GRPOConfig))
54
+ model_args, data_args, training_args = parser.parse()
55
+
56
+ # Set seed for reproducibility
57
+ set_seed(training_args.seed)
58
+
59
+ ###############
60
+ # Setup logging
61
+ ###############
62
+ logging.basicConfig(
63
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
64
+ datefmt="%Y-%m-%d %H:%M:%S",
65
+ handlers=[logging.StreamHandler(sys.stdout)],
66
+ )
67
+ log_level = training_args.get_process_log_level()
68
+ logger.setLevel(log_level)
69
+ datasets.utils.logging.set_verbosity(log_level)
70
+ transformers.utils.logging.set_verbosity(log_level)
71
+ transformers.utils.logging.enable_default_handler()
72
+ transformers.utils.logging.enable_explicit_format()
73
+
74
+ # Log on each process a small summary
75
+ logger.warning(
76
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
77
+ + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
78
+ )
79
+ logger.info(f"Model parameters {model_args}")
80
+ logger.info(f"Data parameters {data_args}")
81
+ logger.info(f"Training/evaluation parameters {training_args}")
82
+
83
+ # Check for last checkpoint
84
+ last_checkpoint = get_checkpoint(training_args)
85
+ if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
86
+ logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")
87
+
88
+ ###############
89
+ # Load datasets
90
+ ###############
91
+ raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
92
+ eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
93
+ logger.info(
94
+ f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
95
+ )
96
+ column_names = list(raw_datasets["train"].features)
97
+
98
+ ################
99
+ # Load tokenizer
100
+ ################
101
+ tokenizer = get_tokenizer(model_args, data_args)
102
+
103
+ #######################
104
+ # Load pretrained model
105
+ #######################
106
+ logger.info("*** Load pretrained model ***")
107
+ torch_dtype = (
108
+ model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
109
+ )
110
+ quantization_config = get_quantization_config(model_args)
111
+
112
+ model_kwargs = dict(
113
+ revision=model_args.model_revision,
114
+ trust_remote_code=model_args.trust_remote_code,
115
+ attn_implementation=model_args.attn_implementation,
116
+ torch_dtype=torch_dtype,
117
+ use_cache=False if training_args.gradient_checkpointing else True,
118
+ device_map=get_kbit_device_map() if quantization_config is not None else None,
119
+ quantization_config=quantization_config,
120
+ )
121
+
122
+ model = model_args.model_name_or_path
123
+ # For ChatML we need to add special tokens and resize the embedding layer
124
+ if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
125
+ model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
126
+ model, tokenizer = setup_chat_format(model, tokenizer)
127
+ model_kwargs = None
128
+
129
+ #####################
130
+ # Apply chat template
131
+ #####################
132
+ def modify_completion(example):
133
+ # Wrap the raw prompt as a single user message and apply the chat template
134
+ example['prompt'] = \
135
+ maybe_apply_chat_template({"prompt": [{"role": "user", "content": example['prompt']}]}, tokenizer=tokenizer)[
136
+ 'prompt']
137
+ return example
138
+
139
+ raw_datasets = raw_datasets.map(modify_completion)
140
+ eval_raw_datasets = eval_raw_datasets.map(modify_completion)
141
+
142
+
143
+ train_dataset = raw_datasets["train"]
144
+ eval_dataset = eval_raw_datasets["train"]
145
+
146
+ ########################
147
+ # Initialize the Trainer
148
+ ########################
149
+
150
+ # Note: this reward function is not actually used here
151
+ def reward_len(completions, **kwargs):
152
+ return [-abs(20 - len(completion)) for completion in completions]
153
+
154
+ training_args.model_init_kwargs = model_kwargs
155
+ trainer = GRPOTrainer(
156
+ model=model,
157
+ reward_funcs=reward_len,
158
+ args=training_args,
159
+ train_dataset=train_dataset,
160
+ eval_dataset=eval_dataset,
161
+ )
162
+
163
+ ###############
164
+ # Training loop
165
+ ###############
166
+ logger.info("*** Train ***")
167
+ checkpoint = None
168
+ if training_args.resume_from_checkpoint is not None:
169
+ checkpoint = training_args.resume_from_checkpoint
170
+ elif last_checkpoint is not None:
171
+ checkpoint = last_checkpoint
172
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
173
+ metrics = train_result.metrics
174
+ metrics["train_samples"] = len(train_dataset)
175
+ trainer.log_metrics("train", metrics)
176
+ trainer.save_metrics("train", metrics)
177
+ trainer.save_state()
178
+
179
+ ##################################
180
+ # Save model and create model card
181
+ ##################################
182
+ logger.info("*** Save model ***")
183
+ trainer.save_model(training_args.output_dir)
184
+ logger.info(f"Model saved to {training_args.output_dir}")
185
+
186
+ # Save everything else on main process
187
+ kwargs = {
188
+ "finetuned_from": model_args.model_name_or_path,
189
+ "dataset": list(data_args.dataset_mixer.keys()),
190
+ "dataset_tags": list(data_args.dataset_mixer.keys()),
191
+ "tags": ["alignment-handbook"],
192
+ }
193
+ if trainer.accelerator.is_main_process:
194
+ trainer.create_model_card(**kwargs)
195
+ # Restore k,v cache for fast inference
196
+ trainer.model.config.use_cache = True
197
+ trainer.model.config.save_pretrained(training_args.output_dir)
198
+
199
+ ##########
200
+ # Evaluate
201
+ ##########
202
+ if training_args.do_eval:
203
+ logger.info("*** Evaluate ***")
204
+ metrics = trainer.evaluate()
205
+ metrics["eval_samples"] = len(eval_dataset)
206
+ trainer.log_metrics("eval", metrics)
207
+ trainer.save_metrics("eval", metrics)
208
+
209
+ if training_args.push_to_hub is True:
210
+ logger.info("Pushing to hub...")
211
+ trainer.push_to_hub(**kwargs)
212
+
213
+ logger.info("*** Training complete ***")
214
+
215
+
216
+ if __name__ == "__main__":
217
+ main()
recipes/accelerate_configs/deepspeed_zero3.yaml ADDED
@@ -0,0 +1,22 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ deepspeed_config:
+   deepspeed_multinode_launcher: standard
+   offload_optimizer_device: none
+   offload_param_device: none
+   zero3_init_flag: true
+   zero3_save_16bit_model: true
+   zero_stage: 3
+ distributed_type: DEEPSPEED
+ downcast_bf16: 'no'
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: bf16
+ num_machines: 1
+ num_processes: 4
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
recipes/accelerate_configs/fsdp.yaml ADDED
@@ -0,0 +1,26 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ distributed_type: FSDP
+ downcast_bf16: 'no'
+ enable_cpu_affinity: false
+ fsdp_config:
+   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+   fsdp_backward_prefetch: BACKWARD_PRE
+   fsdp_cpu_ram_efficient_loading: true
+   fsdp_forward_prefetch: true
+   fsdp_offload_params: false
+   fsdp_sharding_strategy: FULL_SHARD
+   fsdp_state_dict_type: SHARDED_STATE_DICT
+   fsdp_sync_module_states: true
+   fsdp_use_orig_params: true
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: bf16
+ num_machines: 1
+ num_processes: 8
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
recipes/accelerate_configs/fsdp_qlora.yaml ADDED
@@ -0,0 +1,25 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ distributed_type: FSDP
+ downcast_bf16: 'no'
+ fsdp_config:
+   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+   fsdp_backward_prefetch: BACKWARD_PRE
+   fsdp_cpu_ram_efficient_loading: true
+   fsdp_forward_prefetch: false
+   fsdp_offload_params: true
+   fsdp_sharding_strategy: FULL_SHARD
+   fsdp_state_dict_type: SHARDED_STATE_DICT
+   fsdp_sync_module_states: true
+   fsdp_use_orig_params: false
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: 'no'
+ num_machines: 1
+ num_processes: 2
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
recipes/accelerate_configs/multi_gpu.yaml ADDED
@@ -0,0 +1,16 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ distributed_type: MULTI_GPU
+ downcast_bf16: 'no'
+ gpu_ids: all
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: bf16
+ num_machines: 1
+ num_processes: 8
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
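Fields such as `num_processes` in these Accelerate configs can be overridden at launch time; for example, the gpt2-nl recipe later in this commit runs this same config on 4 GPUs:

```shell
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/multi_gpu.yaml \
    --num_processes 4 \
    scripts/run_sft.py recipes/gpt2-nl/sft/config_full.yaml
```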
recipes/constitutional-ai/README.md ADDED
@@ -0,0 +1,24 @@
+ # Constitutional AI
+
+ This repo includes the recipe for training the following models:
+
+ * https://huggingface.co/HuggingFaceH4/mistral-7b-anthropic
+ * https://huggingface.co/HuggingFaceH4/mistral-7b-grok
+
+
+ ## Full training examples
+
+ You will require 8 GPUs (80GB of VRAM) to train the full model.
+ ```shell
+ # Step 1 - SFT
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/constitutional-ai/sft/config_{grok,anthropic}.yaml
+
+ # Step 2 - DPO
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/constitutional-ai/dpo/config_anthropic.yaml
+ # Note that we did not include the DPO recipe for grok, as that model seems overtrained and too snarky.
+ ```
+
+
+ ## Advanced: generating your own dataset
+
+ To generate the constitutional AI dataset, see https://github.com/huggingface/llm-swarm/tree/main/examples/constitutional-ai for detailed instructions if you want to build or customize the dataset.
recipes/constitutional-ai/dpo/config_anthropic.yaml ADDED
@@ -0,0 +1,41 @@
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/mistral-7b-sft-constitutional-ai
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ HuggingFaceH4/cai-conversation-harmless: 1.0
10
+ dataset_splits:
11
+ - train_prefs
12
+ - test_prefs
13
+ preprocessing_num_workers: 12
14
+
15
+ # DPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.1
18
+ do_eval: true
19
+ do_train: true
20
+ eval_strategy: steps
21
+ eval_steps: 1000
22
+ gradient_accumulation_steps: 1
23
+ gradient_checkpointing: true
24
+ hub_model_id: mistral-7b-dpo-constitutional-ai
25
+ learning_rate: 5.0e-7
26
+ log_level: info
27
+ logging_steps: 10
28
+ lr_scheduler_type: linear
29
+ max_length: 1024
30
+ max_prompt_length: 512
31
+ num_train_epochs: 3
32
+ optim: rmsprop
33
+ output_dir: data/mistral-7b-dpo-constitutional-ai
34
+ per_device_train_batch_size: 2
35
+ per_device_eval_batch_size: 8
36
+ push_to_hub: true
37
+ save_strategy: "steps"
38
+ save_steps: 100
39
+ save_total_limit: 1
40
+ seed: 42
41
+ warmup_ratio: 0.1
recipes/constitutional-ai/sft/config_anthropic.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/cai-conversation-harmless: 1.0
11
+ HuggingFaceH4/ultrachat_200k: 1.0
12
+ dataset_splits:
13
+ - train_sft
14
+ - test_sft
15
+ preprocessing_num_workers: 12
16
+
17
+ # SFT trainer config
18
+ bf16: true
19
+ do_eval: true
20
+ do_train: true
21
+ eval_strategy: epoch # One of ["no", "steps", "epoch"]
22
+ gradient_accumulation_steps: 4
23
+ gradient_checkpointing: true
24
+ gradient_checkpointing_kwargs:
25
+ use_reentrant: False
26
+ hub_model_id: mistral-7b-sft-constitutional-ai
27
+ hub_strategy: every_save
28
+ learning_rate: 2.0e-05
29
+ log_level: info
30
+ logging_steps: 5
31
+ logging_strategy: steps
32
+ lr_scheduler_type: cosine
33
+ max_seq_length: 2048
34
+ max_steps: -1
35
+ num_train_epochs: 1
36
+ output_dir: data/mistral-7b-sft-constitutional-ai
37
+ overwrite_output_dir: true
38
+ per_device_eval_batch_size: 8
39
+ per_device_train_batch_size: 8
40
+ push_to_hub: true
41
+ remove_unused_columns: true
42
+ report_to:
43
+ - tensorboard
44
+ save_strategy: "steps"
45
+ save_steps: 100
46
+ save_total_limit: 1
47
+ seed: 42
48
+ warmup_ratio: 0.1
recipes/constitutional-ai/sft/config_grok.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/grok-conversation-harmless: 0.15
11
+ HuggingFaceH4/ultrachat_200k: 1.0
12
+ dataset_splits:
13
+ - train_sft
14
+ - test_sft
15
+ preprocessing_num_workers: 12
16
+
17
+ # SFT trainer config
18
+ bf16: true
19
+ do_eval: true
20
+ do_train: true
21
+ eval_strategy: epoch # One of ["no", "steps", "epoch"]
22
+ gradient_accumulation_steps: 4
23
+ gradient_checkpointing: true
24
+ gradient_checkpointing_kwargs:
25
+ use_reentrant: False
26
+ hub_model_id: mistral-7b-sft-constitutional-ai
27
+ hub_strategy: every_save
28
+ learning_rate: 2.0e-05
29
+ log_level: info
30
+ logging_steps: 5
31
+ logging_strategy: steps
32
+ lr_scheduler_type: cosine
33
+ max_seq_length: 2048
34
+ max_steps: -1
35
+ num_train_epochs: 1
36
+ output_dir: data/mistral-7b-sft-constitutional-ai
37
+ overwrite_output_dir: true
38
+ per_device_eval_batch_size: 8
39
+ per_device_train_batch_size: 8
40
+ push_to_hub: true
41
+ remove_unused_columns: true
42
+ report_to:
43
+ - tensorboard
44
+ save_strategy: "steps"
45
+ save_steps: 100
46
+ save_total_limit: 1
47
+ seed: 42
48
+ warmup_ratio: 0.1
recipes/gpt2-nl/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Language Adaptation through Continued Pretraining
2
+
3
+ This directory shows a base example of how to use continued pretraining and further tuning to adapt a language model to new data (e.g. a new language or domain).
4
+
5
+ Three steps are needed: continued pretraining (`cpt`), supervised finetuning (`sft`), and direct preference optimisation (`dpo`). In this dummy example, we'll continue pretraining gpt2 on Dutch raw data, then sft-tuning it, and finally aligning it with DPO. Note that no extensive hyperparameters were tested in this example and that the output models are bad - it is just to show you how you can use the scripts for LM adaptation. The scripts work on 4x 3090s (24GB VRAM). If you have less powerful hardware you may need to reduce the batch size.
6
+
7
+ ## Continued pretraining
8
+
9
+ This step will further pretrain the original `gpt2` model on plain Dutch text. Note that the script will by default use the `text` column in the dataset but you can change that by specifying `text_column` in the yaml file or on the command-line.
10
+
11
+ ```shell
12
+ ACCELERATE_LOG_LEVEL=info accelerate launch \
13
+ --config_file recipes/accelerate_configs/multi_gpu.yaml \
14
+ --num_processes 4 \
15
+ scripts/run_cpt.py \
16
+ recipes/gpt2-nl/cpt/config_full.yaml
17
+ ```
18
+
19
+ ## Supervised finetuning
20
+
21
+ As other recipes, such as the famous zephyr-7b-beta recipe, have shown, we can then teach our model how to hold a conversation by finetuning it on chat-formatted data. As a base model, we'll make use of the output of the previous step.
22
+
23
+ ```shell
24
+ ACCELERATE_LOG_LEVEL=info accelerate launch \
25
+ --config_file recipes/accelerate_configs/multi_gpu.yaml \
26
+ --num_processes 4 \
27
+ scripts/run_sft.py recipes/gpt2-nl/sft/config_full.yaml
28
+ ```
29
+
30
+ ## Direct preference optimisation
31
+
32
+ Finally, to align the model better with feedback, we can finetune the SFT output with the DPO algorithm. This should improve the quality of the chat capabilities of the model.
33
+
34
+ ```shell
35
+ ACCELERATE_LOG_LEVEL=info accelerate launch \
36
+ --config_file recipes/accelerate_configs/multi_gpu.yaml \
37
+ --num_processes 4 \
38
+ scripts/run_dpo.py recipes/gpt2-nl/dpo/config_full.yaml
39
+ ```
40
+
41
+ ## Conclusion
42
+
43
+ With the steps above you can adapt an LM to a new domain, more data, or even a different language. Then, with sft and dpo, you can end up building a powerful chatbot, too! All within just three simple commands. It should be obvious that all of these follow a very similar approach, which makes them suitable to apply in parameterized slurm jobs. The neat part is that you can easily overwrite arguments in the yaml files by specifying the overwriting argument as a command-line argument, so the adaptability is also great.
recipes/gpt2-nl/cpt/config_full.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: gpt2
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+
6
+ # Data training arguments
7
+ dataset_mixer:
8
+ yhavinga/mc4_nl_cleaned: 1.0
9
+ dataset_splits:
10
+ - train
11
+ dataset_configs:
12
+ - tiny
13
+ preprocessing_num_workers: 12
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: "no"
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ hub_model_id: gpt2-cpt-dutch
24
+ hub_strategy: every_save
25
+ learning_rate: 2.0e-04
26
+ log_level: info
27
+ logging_steps: 5
28
+ logging_strategy: steps
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ max_steps: -1
32
+ num_train_epochs: 1
33
+ output_dir: data/gpt2-cpt-dutch
34
+ overwrite_output_dir: true
35
+ per_device_eval_batch_size: 8
36
+ per_device_train_batch_size: 16
37
+ push_to_hub: true
38
+ remove_unused_columns: true
39
+ report_to:
40
+ - wandb
41
+ save_strategy: "steps"
42
+ save_steps: 100
43
+ save_total_limit: 1
44
+ seed: 42
45
+ warmup_ratio: 0.1
recipes/gpt2-nl/dpo/config_full.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: BramVanroy/gpt2-sft-dutch
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+
6
+ # Data training arguments
7
+ # For definitions, see: src/h4/training/config.py
8
+ dataset_mixer:
9
+ BramVanroy/ultra_feedback_dutch: 1.0
10
+ dataset_splits:
11
+ - train_prefs
12
+ - test_prefs
13
+ preprocessing_num_workers: 12
14
+
15
+ # DPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.1
18
+ do_eval: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: gpt2-dpo-dutch
26
+ learning_rate: 5.0e-7
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 1
33
+ optim: adamw_torch
34
+ output_dir: data/gpt2-dpo-dutch
35
+ per_device_train_batch_size: 8
36
+ per_device_eval_batch_size: 8
37
+ push_to_hub: true
38
+ save_strategy: "steps"
39
+ save_steps: 100
40
+ save_total_limit: 1
41
+ seed: 42
42
+ warmup_ratio: 0.1
43
+ report_to:
44
+ - wandb
recipes/gpt2-nl/sft/config_full.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: BramVanroy/gpt2-cpt-dutch
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+
6
+ # Data training arguments
7
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
8
+ dataset_mixer:
9
+ BramVanroy/ultrachat_200k_dutch: 1.0
10
+ dataset_splits:
11
+ - train_sft
12
+ - test_sft
13
+ preprocessing_num_workers: 12
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: true
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ hub_model_id: gpt2-sft-dutch
24
+ hub_strategy: every_save
25
+ learning_rate: 2.0e-05
26
+ log_level: info
27
+ logging_steps: 5
28
+ logging_strategy: steps
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ max_steps: -1
32
+ num_train_epochs: 1
33
+ output_dir: data/gpt2-sft-dutch
34
+ overwrite_output_dir: true
35
+ per_device_eval_batch_size: 8
36
+ per_device_train_batch_size: 8
37
+ push_to_hub: true
38
+ remove_unused_columns: true
39
+ report_to:
40
+ - wandb
41
+ save_strategy: "steps"
42
+ save_steps: 100
43
+ save_total_limit: 1
44
+ seed: 42
45
+ warmup_ratio: 0.1
recipes/launch.slurm ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --ntasks-per-node=1
3
+ #SBATCH --exclusive
4
+ #SBATCH --gres=gpu:8
5
+ #SBATCH --partition=hopper-prod # Adjust this for your cluster
6
+ #SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
7
+ #SBATCH --err=/fsx/h4/logs/%x-%j.err # Adjust this for your cluster
8
+
9
+ set -x -e
10
+
11
+ source ~/.bashrc
12
+ conda activate handbook
13
+ echo "START TIME: $(date)"
14
+
15
+ MODEL=$1
16
+ TASK=$2
17
+ PRECISION=$3
18
+ ACCELERATOR=$4
19
+ OPTIONAL_ARGS=$5
20
+
21
+ # Training setup
22
+ NUM_NODES=$SLURM_NNODES
23
+ GPUS_PER_NODE=8
24
+ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
25
+ # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
26
+ CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
27
+ GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
28
+
29
+ # Split the string into individual arguments
30
+ IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"
31
+
32
+ # Loop through the arguments and find the one with "--gradient_accumulation_steps"
33
+ for arg in "${ARGS[@]}"; do
34
+ if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
35
+ # Extract the value after the equals sign
36
+ GRAD_ACC_STEPS="${arg#*=}"
37
+ break # Exit the loop once we find the desired argument
38
+ fi
39
+ done
40
+
41
+ echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
42
+ # so processes know who to talk to
43
+ MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
44
+ MASTER_PORT=6000
45
+
46
+ export CMD=" \
47
+ scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
48
+ "
49
+
50
+ export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
51
+ --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
52
+ --gradient_accumulation_steps $GRAD_ACC_STEPS \
53
+ --num_machines $NUM_NODES \
54
+ --num_processes $WORLD_SIZE \
55
+ --main_process_ip $MASTER_ADDR \
56
+ --main_process_port $MASTER_PORT \
57
+ --machine_rank \$SLURM_PROCID \
58
+ --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
59
+ --max_restarts 1 \
60
+ --role \$(hostname -s): \
61
+ --tee 3 \
62
+ "
63
+
64
+ # force crashing on nccl issues like hanging broadcast
65
+ export NCCL_ASYNC_ERROR_HANDLING=1
66
+ # export NCCL_DEBUG=INFO
67
+ # export NCCL_DEBUG_SUBSYS=COLL
68
+ # export NCCL_SOCKET_NTHREADS=1
69
+ # export NCCL_NSOCKS_PERTHREAD=1
70
+ # export CUDA_LAUNCH_BLOCKING=1
71
+
72
+ # Specific configuration optimized for the Hugging Face Compute Cluster
73
+ # Be ye warned this may not work on other clusters!
74
+ module load cuda/12.1
75
+
76
+ # srun error handling:
77
+ # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
78
+ # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
79
+ SRUN_ARGS=" \
80
+ --wait=60 \
81
+ --kill-on-bad-exit=1 \
82
+ "
83
+
84
+ clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
85
+
86
+ echo "END TIME: $(date)"
recipes/pref_align_scan/README.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Comparing Preference Alignment Algorithms
2
+ This directory contains various comparisons for three algorithms: DPO, IPO, and KTO. Each algorithm has been run in different hyperparameter configurations to study their performance. Two different models and datasets have been used to compare the performance of each algorithm:
3
+
4
+ - zephyr-beta-sft and Ultrafeedback
5
+ - OpenHermes-2.5 and the OpenOrca datasets
6
+
7
+ We release a collection containing the datasets and models used for these experiments, if you require the other trained models, we can release them on request.
8
+ You can find a longer description of these results in our [blogpost](https://huggingface.co/blog/pref-tuning)
9
+
10
+ ## Comparisons
11
+ For each algorithm, we aim to tune the beta parameter for a fixed learning rate. We vary beta from 0.1-0.9 in steps of 0.1, we have also found that in certain configurations a tiny value of beta, 0.01, can be effective. So we have included this smaller value in all our comparisons.
12
+
13
+ ## Usage
14
+ The experiments can be launched with the following bash script:
15
+ ```bash
16
+ #!/bin/bash
17
+
18
+ # Define an array containing the base configs we wish to fine tune
19
+ configs=("zephyr" "openhermes")
20
+ # Define an array of loss types
21
+ loss_types=("sigmoid" "kto_pair" "ipo")
22
+
23
+ # Define an array of beta values
24
+ betas=("0.01" "0.1" "0.2" "0.3" "0.4" "0.5" "0.6" "0.7" "0.8" "0.9")
25
+
26
+ # Outer loop for loss types
27
+ for config in "${configs[@]}"; do
28
+ for loss_type in "${loss_types[@]}"; do
29
+
30
+ # Inner loop for beta values
31
+ for beta in "${betas[@]}"; do
32
+
33
+ # Determine the job name and model revision based on loss type
34
+ job_name="$config_${loss_type}_beta_${beta}"
35
+ model_revision="${loss_type}-${beta}"
36
+
37
+ # Submit the job
38
+ sbatch --job-name=${job_name} recipes/launch.slurm pref_align_scan dpo $config deepspeed_zero3 \
39
+ "--beta=${beta} --loss_type=${loss_type} --output_dir=data/$config-7b-align-scan-${loss_type}-beta-${beta} --hub_model_revision=${model_revision}"
40
+ done
41
+ done
42
+ done
43
+ ```
44
+
45
+
46
+
47
+
48
+
49
+
recipes/pref_align_scan/dpo/config_openhermes.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: teknium/OpenHermes-2.5-Mistral-7B
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/orca_dpo_pairs: 1.0
8
+ dataset_splits:
9
+ - train_prefs
10
+ - test_prefs
11
+ preprocessing_num_workers: 12
12
+
13
+ # Training arguments with sensible defaults
14
+ bf16: true
15
+ beta: 0.01
16
+ loss_type: sigmoid
17
+ do_eval: true
18
+ do_train: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 2
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: HuggingFaceH4/openhermes-2.5-mistral-7b-dpo
26
+ hub_model_revision: v1.0
27
+
28
+ learning_rate: 5.0e-7
29
+ logging_steps: 10
30
+ lr_scheduler_type: cosine
31
+ max_prompt_length: 512
32
+ num_train_epochs: 1
33
+ optim: adamw_torch
34
+ output_dir: data/openhermes-2.5-mistral-7b-dpo-v1.0
35
+ per_device_train_batch_size: 8
36
+ per_device_eval_batch_size: 8
37
+ save_strategy: "steps"
38
+ save_steps: 100
39
+ save_total_limit: 1
40
+ seed: 42
41
+ warmup_ratio: 0.1
recipes/pref_align_scan/dpo/config_zephyr.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/zephyr-7b-sft-full
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
8
+ dataset_splits:
9
+ - train_prefs
10
+ - test_prefs
11
+ preprocessing_num_workers: 12
12
+
13
+ # Training arguments with sensible defaults
14
+ bf16: true
15
+ beta: 0.01
16
+ loss_type: sigmoid
17
+ do_eval: true
18
+ eval_strategy: steps
19
+ eval_steps: 100
20
+ gradient_accumulation_steps: 2
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ hub_model_id: zephyr-7b-align-scan
25
+ hub_model_revision: dpo-beta-0.01
26
+ learning_rate: 5.0e-7
27
+ logging_steps: 10
28
+ lr_scheduler_type: cosine
29
+ max_prompt_length: 512
30
+ num_train_epochs: 1
31
+ optim: adamw_torch
32
+ output_dir: data/zephyr-7b-align-scan-dpo-beta-0.01
33
+ per_device_train_batch_size: 8
34
+ per_device_eval_batch_size: 8
35
+ save_strategy: "steps"
36
+ save_steps: 100
37
+ save_total_limit: 1
38
+ seed: 42
39
+ warmup_ratio: 0.1
recipes/pref_align_scan/launch_scan.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Define an array containing the base configs we wish to fine tune
3
+ configs=("zephyr" "openhermes")
4
+ # Define an array of loss types
5
+ loss_types=("sigmoid" "kto_pair" "ipo")
6
+ # Define an array of beta values
7
+ betas=("0.01" "0.1" "0.2" "0.3" "0.4" "0.5" "0.6" "0.7" "0.8" "0.9")
8
+
9
+ # Outer loop for loss types
10
+ for config in "${configs[@]}"; do
11
+ for loss_type in "${loss_types[@]}"; do
12
+
13
+ # Inner loop for beta values
14
+ for beta in "${betas[@]}"; do
15
+ # Determine the job name and model revision based on loss type
16
+ job_name="$config_${loss_type}_beta_${beta}"
17
+ model_revision="${loss_type}-${beta}"
18
+
19
+ # Submit the job
20
+ sbatch --job-name=${job_name} recipes/launch.slurm pref_align_scan dpo $config deepspeed_zero3 \
21
+ "--beta=${beta} --loss_type=${loss_type} --output_dir=data/$config-7b-align-scan-${loss_type}-beta-${beta} --hub_model_revision=${model_revision}"
22
+ done
23
+ done
24
+ done
recipes/smollm/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train SmolLM-Instruct
3
+
4
+ We build the [SmolLM-Instruct](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) (v0.2) models (135M, 360M and 1.7B) by doing SFT on a mix of these datasets:
5
+ - a dataset of 2k simple everyday conversations we generated by llama3.1-70B [everyday-conversations-llama3.1-2k](https://huggingface.co/datasets/HuggingFaceTB/everyday-conversations-llama3.1-2k/)
6
+ - [Magpie-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
7
+ - [StarCoder2-Self-OSS-Instruct](https://huggingface.co/datasets/bigcode/self-oss-instruct-sc2-exec-filter-50k)
8
+ - A small subset of [OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5)
9
+
10
+ ## Setup
11
+
12
+ Follow the installation instructions in https://github.com/huggingface/alignment-handbook/tree/main?tab=readme-ov-file#installation-instructions
13
+
14
+ ## Training
15
+ We train the models on 8 GPUs using the following command:
16
+
17
+ ```shell
18
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm/sft/config.yaml
19
+ ```
recipes/smollm/sft/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceTB/SmolLM-360M
3
+ model_revision: main
4
+ tokenizer_name_or_path: HuggingFaceTB/SmolLM-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
5
+ torch_dtype: bfloat16
6
+ use_flash_attention_2: true
7
+
8
+ # Data training arguments
9
+ dataset_mixer:
10
+ HuggingFaceTB/Magpie-Pro-300K-Filtered-H4: 1.0
11
+ HuggingFaceTB/self-oss-instruct-sc2-H4: 1.0
12
+ HuggingFaceTB/OpenHermes-2.5-H4: 0.001
13
+ HuggingFaceTB/everyday-conversations-llama3.1-2k: 1.0
14
+ HuggingFaceTB/instruct-data-basics-smollm-H4: 1.0
15
+
16
+ dataset_splits:
17
+ - train_sft
18
+ - test_sft
19
+ preprocessing_num_workers: 36
20
+
21
+ # SFT trainer config
22
+ bf16: true
23
+ dataset_kwargs:
24
+ add_special_tokens: false # We already wrap <bos> and <eos> in the chat template
25
+ append_concat_token: false # No need to add <eos> across samples
26
+ do_eval: true
27
+ evaluation_strategy: epoch
28
+ gradient_accumulation_steps: 4
29
+ gradient_checkpointing: true
30
+ gradient_checkpointing_kwargs:
31
+ use_reentrant: false
32
+ hub_model_id: smollm-360M-instruct-new
33
+ hub_strategy: every_save
34
+ learning_rate: 1.0e-03 # 3e-4
35
+ log_level: info
36
+ logging_steps: 5
37
+ logging_strategy: steps
38
+ lr_scheduler_type: cosine
39
+ max_seq_length: 2048
40
+ max_steps: -1
41
+ num_train_epochs: 1
42
+ output_dir: data/smollm-360M-instruct-new
43
+ overwrite_output_dir: true
44
+ per_device_eval_batch_size: 4
45
+ per_device_train_batch_size: 4
46
+ push_to_hub: true
47
+ remove_unused_columns: true
48
+ report_to:
49
+ - tensorboard
50
+ - wandb
51
+ save_strategy: "no"
52
+ seed: 42
53
+ warmup_ratio: 0.1
recipes/smollm2/README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train SmolLM2-1.7B-Instruct
3
+
4
+ We build the [SmolLM2-Instruct](https://huggingface.co/collections/HuggingFaceTB/smollm2-6723884218bcda64b34d7db9) by doing SFT on [SmolTalk](https://huggingface.co/datasets/HuggingFaceTB/smoltalk) and then DPO on [UltraFeedBack](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized).
5
+
6
+ ## Setup
7
+
8
+ Follow the installation instructions in https://github.com/huggingface/alignment-handbook/tree/main?tab=readme-ov-file#installation-instructions
9
+
10
+ ## Training
11
+ We train the 1.7B on 8 GPUs using the following command:
12
+
13
+ ```shell
14
+ # SFT
15
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm2/sft/config.yaml
16
+
17
+ # DPO
18
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config.yaml
19
+ ```
20
+
21
+ For the 135M and 360M we use [smol-smoltalk](https://huggingface.co/datasets/HuggingFaceTB/smol-smoltalk) dataset for SFT and UltraFeedback for DPO:
22
+ ```shell
23
+ # SFT
24
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm2/sft/config_smol.yaml
25
+
26
+ # DPO
27
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config_smol.yaml
28
+ ```
recipes/smollm2/dpo/config.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: loubnabnl/smollm2-1.7B-sft
3
+ torch_dtype: bfloat16
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
8
+
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.5
17
+ do_eval: true
18
+ hub_private_repo: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: smollm2-1.7B-dpo
26
+ learning_rate: 1.0e-6
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 3
33
+ optim: adamw_torch
34
+ output_dir: data/smollm2-1.7B-dpo
35
+ per_device_train_batch_size: 2
36
+ per_device_eval_batch_size: 4
37
+ push_to_hub: true
38
+ report_to:
39
+ - tensorboard
40
+ - wandb
41
+ save_strategy: "no"
42
+ seed: 42
43
+ warmup_ratio: 0.1
recipes/smollm2/dpo/config_smol.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: loubnabnl/smollm2-360M-sft # we use this script for the 135M model too
3
+ torch_dtype: bfloat16
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
8
+
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.5
17
+ do_eval: true
18
+ hub_private_repo: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: smollm2-360M-dpo
26
+ learning_rate: 1.0e-6
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 2
33
+ optim: adamw_torch
34
+ output_dir: data/smollm2-360M-dpo
35
+ per_device_train_batch_size: 2
36
+ per_device_eval_batch_size: 4
37
+ push_to_hub: true
38
+ report_to:
39
+ - tensorboard
40
+ - wandb
41
+ save_strategy: "no"
42
+ seed: 42
43
+ warmup_ratio: 0.1
recipes/smollm2/sft/config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceTB/SmolLM2-1.7B
3
+ model_revision: main
4
+ tokenizer_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
5
+ torch_dtype: bfloat16
6
+ use_flash_attention_2: true
7
+
8
+ # Data training arguments
9
+ dataset_mixer:
10
+ HuggingFaceTB/smoltalk: 1.0
11
+
12
+ dataset_configs:
13
+ - all
14
+
15
+ dataset_splits:
16
+ - train
17
+ - test
18
+ preprocessing_num_workers: 36
19
+
20
+ # SFT trainer config
21
+ bf16: true
22
+ do_eval: true
23
+ evaluation_strategy: epoch
24
+ gradient_accumulation_steps: 4
25
+ gradient_checkpointing: true
26
+ gradient_checkpointing_kwargs:
27
+ use_reentrant: false
28
+ hub_model_id: smollm2-1.7B-sft
29
+ hub_strategy: every_save
30
+ learning_rate: 3.0e-04
31
+ log_level: info
32
+ logging_steps: 5
33
+ logging_strategy: steps
34
+ lr_scheduler_type: cosine
35
+ max_seq_length: 8192
36
+ max_steps: -1
37
+ num_train_epochs: 2
38
+ output_dir: data/smollm2-1.7B-sft
39
+ overwrite_output_dir: true
40
+ per_device_eval_batch_size: 4
41
+ per_device_train_batch_size: 4
42
+ push_to_hub: true
43
+ remove_unused_columns: true
44
+ report_to:
45
+ - tensorboard
46
+ - wandb
47
+ save_strategy: "no"
48
+ seed: 42
49
+ warmup_ratio: 0.1
recipes/smollm2/sft/config_smol.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceTB/SmolLM2-360M # we use this script for the 135M model too
3
+ model_revision: main
4
+ tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
5
+ torch_dtype: bfloat16
6
+ use_flash_attention_2: true
7
+
8
+ # Data training arguments
9
+ dataset_mixer:
10
+ HuggingFaceTB/smol-smoltalk: 1.0
11
+
12
+ dataset_splits:
13
+ - train
14
+ - test
15
+ preprocessing_num_workers: 36
16
+
17
+ # SFT trainer config
18
+ bf16: true
19
+ do_eval: true
20
+ evaluation_strategy: epoch
21
+ gradient_accumulation_steps: 4
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: false
25
+ hub_model_id: smollm2-360M-sft
26
+ hub_strategy: every_save
27
+ learning_rate: 1.0e-03 # 3e-4
28
+ log_level: info
29
+ logging_steps: 5
30
+ logging_strategy: steps
31
+ lr_scheduler_type: cosine
32
+ max_seq_length: 8192
33
+ max_steps: -1
34
+ num_train_epochs: 2
35
+ output_dir: data/smollm2-360M-sft
36
+ overwrite_output_dir: true
37
+ per_device_eval_batch_size: 4
38
+ per_device_train_batch_size: 4
39
+ push_to_hub: true
40
+ remove_unused_columns: true
41
+ report_to:
42
+ - tensorboard
43
+ - wandb
44
+ save_strategy: "no"
45
+ seed: 42
46
+ warmup_ratio: 0.1
recipes/starchat2-15b/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train StarChat2
3
+
4
+ Similar to how we trained Zephyr 7B Beta in our [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
5
+
6
+ 1. Apply SFT to fine-tune [StarCoder2 15B](https://huggingface.co/bigcode/starcoder2-15b) on a blend of chat, code, and math datastets. The result is an SFT model like [`starchat2-15b-sft-v0.1`](https://huggingface.co/HuggingFaceH4/starchat2-15b-sft-v0.1).
7
+ 2. Align the SFT model to AI feedback via DPO on the UltraFeedback and Orca DPO Pairs datasets. The result is a DPO model like [`starchat2-15b-v0.1`](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1).
8
+
9
+ See below for commands to train these models using DeepSpeed ZeRO-3.
10
+
11
+ ## Full training examples
12
+
13
+ You will require 8 GPUs (80GB of VRAM) to train the full model - alternatively, you can train on 1 GPU by adjusting `per_device_train_batch_size` and `gradient_accumulation_steps` to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
14
+
15
+ ```shell
16
+ # Step 1 - SFT
17
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/starchat2-15b/sft/config_v0.1.yaml
18
+
19
+ # Step 2 - DPO
20
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/starchat2-15b/dpo/config_v0.1.yaml
21
+ ```
recipes/starchat2-15b/dpo/config_v0.1.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceH4/starchat2-15b-sft-v0.1
3
+ torch_dtype: bfloat16
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ HuggingFaceH4/orca_dpo_pairs: 1.0
10
+ dataset_splits:
11
+ - train_prefs
12
+ - test_prefs
13
+ preprocessing_num_workers: 12
14
+
15
+ # DPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.05
18
+ do_eval: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: starchat2-15b-dpo-v0.1
26
+ learning_rate: 5.0e-7
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 2
33
+ optim: adamw_torch
34
+ output_dir: data/starchat2-15b-dpo-v0.1
35
+ per_device_train_batch_size: 2
36
+ per_device_eval_batch_size: 4
37
+ push_to_hub: true
38
+ report_to:
39
+ - tensorboard
40
+ - wandb
41
+ save_strategy: "no"
42
+ seed: 42
43
+ warmup_ratio: 0.1
recipes/starchat2-15b/sft/config_v0.1.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: bigcode/starcoder2-15b
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/airoboros-3.2: 1.0
11
+ HuggingFaceH4/Code-Feedback: 1.0
12
+ HuggingFaceH4/orca-math-word-problems-200k: 1.0
13
+ HuggingFaceH4/SystemChat: 1.0
14
+ HuggingFaceH4/capybara: 1.0
15
+ dataset_splits:
16
+ - train_sft
17
+ - test_sft
18
+ preprocessing_num_workers: 24
19
+
20
+ # SFT trainer config
21
+ bf16: true
22
+ do_eval: true
23
+ eval_strategy: epoch
24
+ gradient_accumulation_steps: 2
25
+ gradient_checkpointing: true
26
+ gradient_checkpointing_kwargs:
27
+ use_reentrant: false
28
+ hub_model_id: starchat2-15b-v0.1
29
+ hub_strategy: every_save
30
+ learning_rate: 2.0e-05
31
+ log_level: info
32
+ logging_steps: 5
33
+ logging_strategy: steps
34
+ lr_scheduler_type: cosine
35
+ max_seq_length: 2048
36
+ max_steps: -1
37
+ num_train_epochs: 3
38
+ output_dir: data/starchat2-15b-v0.1
39
+ overwrite_output_dir: true
40
+ per_device_eval_batch_size: 8
41
+ per_device_train_batch_size: 8
42
+ push_to_hub: true
43
+ remove_unused_columns: true
44
+ report_to:
45
+ - tensorboard
46
+ - wandb
47
+ save_strategy: "no"
48
+ seed: 42
49
+ warmup_ratio: 0.1
recipes/zephyr-141b-A35b/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train Zephyr-141B-A35B with ORPO
3
+
4
+ This model is fine-tuned via a novel alignment algorithm called [Odds Ratio Preference Optimization (ORPO)](https://huggingface.co/papers/2403.07691). ORPO does not require an SFT step to achieve high performance and is thus much more computationally efficient than methods like DPO and PPO. To train Zephyr-141B-A35B, we used the [`argilla/distilabel-capybara-dpo-7k-binarized`](https://huggingface.co/datasets/argilla/distilabel-capybara-dpo-7k-binarized) preference dataset, which consists of synthetic, high-quality, multi-turn preferences that have been scored via LLMs.
5
+
6
+ See below for commands to train these models using FSDP. **Note:** we found it was not possible to train this large model with DeepSpeed ZeRO-3 due to unresolved NCCL errors which cause GPUs to hang.
7
+
8
+ ## Full training examples
9
+
10
+ You will require 4 nodes of 8 GPUs (80GB of VRAM) to train the full model - alternatively, you may be able to train on fewer GPUs by adjusting `per_device_train_batch_size` and `gradient_accumulation_steps` and `num_train_epochs` to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
11
+
12
+ To run with Slurm, use:
13
+
14
+ ```shell
15
+ sbatch --job-name=handbook_sft --nodes=4 recipes/launch.slurm zephyr-141b-A35b orpo full fsdp
16
+ ```
17
+
18
+ Under the hood, this calls the following script which can be adapted to other models and datasets:
19
+
20
+
21
+ ```shell
22
+ ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch --config_file recipes/accelerate_configs/fsdp.yaml scripts/run_orpo.py recipes/zephyr-141b-A35b/orpo/config_full.yaml
23
+ ```
recipes/zephyr-141b-A35b/orpo/config_full.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: mistral-community/Mixtral-8x22B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ argilla/distilabel-capybara-dpo-7k-binarized: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 8
14
+
15
+ # ORPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.05
18
+ gradient_accumulation_steps: 1
19
+ gradient_checkpointing: true
20
+ gradient_checkpointing_kwargs:
21
+ use_reentrant: true
22
+ hub_model_id: zephyr-orpo-141b-A35b
23
+ learning_rate: 5.0e-6
24
+ log_level: info
25
+ logging_steps: 10
26
+ lr_scheduler_type: inverse_sqrt
27
+ max_length: 2048
28
+ max_prompt_length: 1792
29
+ num_train_epochs: 3
30
+ optim: adamw_bnb_8bit
31
+ output_dir: data/zephyr-orpo-141b-A35b
32
+ per_device_train_batch_size: 1
33
+ push_to_hub: true
34
+ report_to:
35
+ - tensorboard
36
+ - wandb
37
+ save_strategy: "no"
38
+ seed: 42
39
+ warmup_steps: 100
recipes/zephyr-7b-beta/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to Replicate Zephyr-7b-β
3
+
4
+ As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
5
+
6
+ 1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)). The result is an SFT model like [`zephyr-7b-sft-full`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) or [`zephyr-7b-sft-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora).
7
+ 2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is a DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-qlora).
8
+
9
+ **Note:** after the release of Zephyr, the team at [Argilla](https://argilla.io) found that the source UltraFeedback dataset had a few thousand incorrect preference labels from GPT-4. Additionally, TRL's `SFTTrainer` had a bug in the learning rate scheduler which terminated training early. Accounting for these changes led us to find a better set of hyperparameters from those described in the technical report. In particular, for DPO training we found that training for 1 epoch with `beta=0.01` was sufficient to achieve comparable performance to `zephyr-7b-beta` (vs. 3 epochs with `beta=0.1`).
10
+
11
+ See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
12
+
13
+ ## Full training examples
14
+
15
+ You will require 8 GPUs (80GB of VRAM) to train the full model.
16
+ ```shell
17
+ # Step 1 - SFT
18
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
19
+
20
+ # Step 2 - DPO
21
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
22
+ ```
23
+
24
+ ## QLoRA training examples
25
+
26
+ Train faster with flash-attention 2 (GPU supporting FA2: A100, H100, etc)
27
+ ```````shell
28
+ # Step 1 - SFT
29
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true
30
+
31
+ # Step 2 - DPO
32
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml
33
+ ```````
34
+
35
+ P.S. Using Flash Attention also allows you to drastically increase the batch size (x2 in my case)
36
+
37
+ Train without flash-attention (i.e. via PyTorch's scaled dot product attention):
38
+ ```````shell
39
+ # Step 1 - SFT
40
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true --attn_implementation=sdpa
41
+
42
+ # Step 2 - DPO
43
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml --attn_implementation=sdpa
44
+ ```````
recipes/zephyr-7b-beta/dpo/config_full.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/zephyr-7b-sft-full
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.01
17
+ do_eval: true
18
+ eval_strategy: steps
19
+ eval_steps: 100
20
+ gradient_accumulation_steps: 2
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ hub_model_id: zephyr-7b-dpo-full
25
+ learning_rate: 5.0e-7
26
+ log_level: info
27
+ logging_steps: 10
28
+ lr_scheduler_type: cosine
29
+ max_length: 1024
30
+ max_prompt_length: 512
31
+ num_train_epochs: 1
32
+ optim: adamw_torch
33
+ output_dir: data/zephyr-7b-dpo-full
34
+ per_device_train_batch_size: 8
35
+ per_device_eval_batch_size: 8
36
+ push_to_hub: true
37
+ save_strategy: "steps"
38
+ save_steps: 100
39
+ save_total_limit: 1
40
+ seed: 42
41
+ warmup_ratio: 0.1
recipes/zephyr-7b-beta/dpo/config_qlora.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/zephyr-7b-sft-qlora
3
+ torch_dtype: bfloat16
4
+ attn_implementation: flash_attention_2
5
+
6
+ # LoRA arguments
7
+ use_peft: true
8
+ load_in_4bit: true
9
+ lora_r: 128
10
+ lora_alpha: 128
11
+ lora_dropout: 0.05
12
+ lora_target_modules:
13
+ - q_proj
14
+ - k_proj
15
+ - v_proj
16
+ - o_proj
17
+ - gate_proj
18
+ - up_proj
19
+ - down_proj
20
+
21
+ # Data training arguments
22
+
23
+ dataset_mixer:
24
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
25
+ dataset_splits:
26
+ - train_prefs
27
+ - test_prefs
28
+ preprocessing_num_workers: 12
29
+
30
+ # DPOTrainer arguments
31
+ bf16: true
32
+ beta: 0.01
33
+ do_eval: true
34
+ eval_strategy: steps
35
+ eval_steps: 100
36
+ gradient_accumulation_steps: 4
37
+ gradient_checkpointing: true
38
+ gradient_checkpointing_kwargs:
39
+ use_reentrant: false
40
+ hub_model_id: zephyr-7b-dpo-qlora
41
+ learning_rate: 5.0e-6
42
+ log_level: info
43
+ logging_steps: 10
44
+ lr_scheduler_type: cosine
45
+ max_length: 1024
46
+ max_prompt_length: 512
47
+ num_train_epochs: 1
48
+ optim: paged_adamw_32bit
49
+ output_dir: data/zephyr-7b-dpo-qlora # It is handy to append `hub_model_revision` to keep track of your local experiments
50
+ per_device_train_batch_size: 4
51
+ per_device_eval_batch_size: 8
52
+ push_to_hub: true
53
+ save_strategy: "steps"
54
+ save_steps: 100
55
+ save_total_limit: 1
56
+ seed: 42
57
+ warmup_ratio: 0.1
recipes/zephyr-7b-beta/sft/config_full.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/ultrachat_200k: 1.0
11
+ dataset_splits:
12
+ - train_sft
13
+ - test_sft
14
+ preprocessing_num_workers: 12
15
+
16
+ # SFT trainer config
17
+ bf16: true
18
+ do_eval: true
19
+ eval_strategy: epoch
20
+ gradient_accumulation_steps: 1
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ hub_model_id: zephyr-7b-sft-full
25
+ hub_strategy: every_save
26
+ learning_rate: 2.0e-05
27
+ log_level: info
28
+ logging_steps: 5
29
+ logging_strategy: steps
30
+ lr_scheduler_type: cosine
31
+ max_seq_length: 2048
32
+ max_steps: -1
33
+ num_train_epochs: 1
34
+ output_dir: data/zephyr-7b-sft-full
35
+ overwrite_output_dir: true
36
+ per_device_eval_batch_size: 8
37
+ per_device_train_batch_size: 16
38
+ push_to_hub: true
39
+ remove_unused_columns: true
40
+ report_to:
41
+ - tensorboard
42
+ save_strategy: "steps"
43
+ save_steps: 100
44
+ save_total_limit: 1
45
+ seed: 42
46
+ warmup_ratio: 0.1