zswzswzsw committed (verified)
Commit ae40651 · 1 Parent(s): 663f446

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .github/workflows/build_documentation.yml +18 -0
  2. .github/workflows/build_pr_documentation.yml +19 -0
  3. .github/workflows/quality.yml +31 -0
  4. .github/workflows/tests.yml +31 -0
  5. .github/workflows/upload_pr_documentation.yml +16 -0
  6. .gitignore +164 -0
  7. CITATION.cff +29 -0
  8. LICENSE +201 -0
  9. Makefile +44 -0
  10. README.md +130 -0
  11. assets/handbook.png +0 -0
  12. chapters/en/_toctree.yml +4 -0
  13. chapters/en/chapter0/introduction.mdx +3 -0
  14. config_dpo_run.yaml +42 -0
  15. config_grpo_offline.yaml +45 -0
  16. config_sft_test_env.yaml +42 -0
  17. grpo_offline_run.py +217 -0
  18. recipes/accelerate_configs/deepspeed_zero3.yaml +22 -0
  19. recipes/accelerate_configs/fsdp.yaml +26 -0
  20. recipes/accelerate_configs/fsdp_qlora.yaml +25 -0
  21. recipes/accelerate_configs/multi_gpu.yaml +16 -0
  22. recipes/constitutional-ai/README.md +24 -0
  23. recipes/constitutional-ai/dpo/config_anthropic.yaml +41 -0
  24. recipes/constitutional-ai/sft/config_anthropic.yaml +48 -0
  25. recipes/constitutional-ai/sft/config_grok.yaml +48 -0
  26. recipes/gpt2-nl/README.md +43 -0
  27. recipes/gpt2-nl/cpt/config_full.yaml +45 -0
  28. recipes/gpt2-nl/dpo/config_full.yaml +44 -0
  29. recipes/gpt2-nl/sft/config_full.yaml +45 -0
  30. recipes/launch.slurm +86 -0
  31. recipes/pref_align_scan/README.md +49 -0
  32. recipes/pref_align_scan/dpo/config_openhermes.yaml +41 -0
  33. recipes/pref_align_scan/dpo/config_zephyr.yaml +39 -0
  34. recipes/pref_align_scan/launch_scan.sh +24 -0
  35. recipes/smollm/README.md +19 -0
  36. recipes/smollm/sft/config.yaml +53 -0
  37. recipes/smollm2/README.md +28 -0
  38. recipes/smollm2/dpo/config.yaml +43 -0
  39. recipes/smollm2/dpo/config_smol.yaml +43 -0
  40. recipes/smollm2/sft/config.yaml +49 -0
  41. recipes/smollm2/sft/config_smol.yaml +46 -0
  42. recipes/starchat2-15b/README.md +21 -0
  43. recipes/starchat2-15b/dpo/config_v0.1.yaml +43 -0
  44. recipes/starchat2-15b/sft/config_v0.1.yaml +49 -0
  45. recipes/zephyr-141b-A35b/README.md +23 -0
  46. recipes/zephyr-141b-A35b/orpo/config_full.yaml +39 -0
  47. recipes/zephyr-7b-beta/README.md +44 -0
  48. recipes/zephyr-7b-beta/dpo/config_full.yaml +41 -0
  49. recipes/zephyr-7b-beta/dpo/config_qlora.yaml +57 -0
  50. recipes/zephyr-7b-beta/sft/config_full.yaml +46 -0
.github/workflows/build_documentation.yml ADDED
@@ -0,0 +1,18 @@
+ name: Build documentation
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   build:
+     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+     with:
+       commit_sha: ${{ github.sha }}
+       package: alignment-handbook
+       path_to_docs: alignment-handbook/chapters/
+       additional_args: --not_python_module
+       languages: en
+     secrets:
+       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
.github/workflows/build_pr_documentation.yml ADDED
@@ -0,0 +1,19 @@
+ name: Build PR Documentation
+
+ on:
+   pull_request:
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+   cancel-in-progress: true
+
+ jobs:
+   build:
+     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+     with:
+       commit_sha: ${{ github.event.pull_request.head.sha }}
+       pr_number: ${{ github.event.number }}
+       package: alignment-handbook
+       path_to_docs: alignment-handbook/chapters/
+       additional_args: --not_python_module
+       languages: en
.github/workflows/quality.yml ADDED
@@ -0,0 +1,31 @@
+ name: Quality
+
+ on:
+   push:
+     branches:
+       - main
+       - v*-release
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+
+   check_code_quality:
+     name: Check code quality
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+       - name: Setup Python environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.10.10
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           python -m pip install ".[quality]"
+       - name: Code quality
+         run: |
+           make quality
+
.github/workflows/tests.yml ADDED
@@ -0,0 +1,31 @@
+ name: Tests
+
+ on:
+   push:
+     branches:
+       - main
+       - v*-release
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+
+   unit-tests:
+     name: Run unit tests
+     env:
+       HF_TOKEN: ${{ secrets.HF_TOKEN }}
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+       - name: Setup Python environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.10.10
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           python -m pip install ".[dev, torch]"
+       - name: Run unit tests
+         run: HF_TOKEN=$HF_TOKEN pytest -sv tests/
.github/workflows/upload_pr_documentation.yml ADDED
@@ -0,0 +1,16 @@
+ name: Upload PR Documentation
+
+ on:
+   workflow_run:
+     workflows: ["Build PR Documentation"]
+     types:
+       - completed
+
+ jobs:
+   build:
+     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
+     with:
+       package_name: alignment-handbook
+     secrets:
+       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,164 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ # Temp folders
163
+ data/
164
+ wandb/
CITATION.cff ADDED
@@ -0,0 +1,29 @@
+ cff-version: 1.2.0
+ title: The Alignment Handbook
+ message: >-
+   Robust recipes to align language models with human and AI
+   preferences.
+ type: software
+ authors:
+   - given-names: Lewis
+     family-names: Tunstall
+   - given-names: Edward
+     family-names: Beeching
+   - given-names: Nathan
+     family-names: Lambert
+   - given-names: Nazneen
+     family-names: Rajani
+   - given-names: Shengyi
+     family-names: Huang
+   - given-names: Kashif
+     family-names: Rasul
+   - given-names: Alvaro
+     family-names: Bartolome
+   - given-names: Alexander
+     name-particle: M.
+     family-names: Rush
+   - given-names: Thomas
+     family-names: Wolf
+ repository-code: 'https://github.com/huggingface/alignment-handbook'
+ license: Apache-2.0
+ version: 0.3.0.dev0
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,44 @@
+ .PHONY: style quality
+
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
+ export PYTHONPATH = src
+
+ check_dirs := src tests scripts
+
+ style:
+ 	black --line-length 119 --target-version py310 $(check_dirs) setup.py
+ 	isort $(check_dirs) setup.py
+
+ quality:
+ 	black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
+ 	isort --check-only $(check_dirs) setup.py
+ 	flake8 --max-line-length 119 $(check_dirs) setup.py
+
+
+ # Release stuff
+
+ pre-release:
+ 	python src/alignment/release.py
+
+ pre-patch:
+ 	python src/alignment/release.py --patch
+
+ post-release:
+ 	python src/alignment/release.py --post_release
+
+ post-patch:
+ 	python src/alignment/release.py --post_release --patch
+
+ wheels:
+ 	python setup.py bdist_wheel && python setup.py sdist
+
+ wheels_clean:
+ 	rm -rf build && rm -rf dist
+
+ pypi_upload:
+ 	python -m pip install twine
+ 	twine upload dist/* -r pypi
+
+ pypi_test_upload:
+ 	python -m pip install twine
+ 	twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
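For reference, the `style` and `quality` targets above run the same checks as the Quality workflow; a typical local invocation might look like this (a sketch, assuming the project's `[quality]` extra from setup.py has been installed):

```shell
# install the formatting/linting tools, auto-format the tree, then run the same checks as CI
python -m pip install ".[quality]"
make style
make quality
```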
README.md ADDED
@@ -0,0 +1,130 @@
+ <p align="center">
+   <img src="https://raw.githubusercontent.com/huggingface/alignment-handbook/main/assets/handbook.png">
+ </p>
+
+ <p align="center">
+   🤗 <a href="https://huggingface.co/collections/alignment-handbook/handbook-v01-models-and-datasets-654e424d22e6880da5ebc015" target="_blank">Models & Datasets</a> | 📃 <a href="https://arxiv.org/abs/2310.16944" target="_blank">Technical Report</a>
+ </p>
+
+ # The Alignment Handbook
+
+ Robust recipes to continue pretraining and to align language models with human and AI preferences.
+
+ ## What is this?
+
+ Just one year ago, chatbots were out of fashion and most people hadn't heard about techniques like Reinforcement Learning from Human Feedback (RLHF) to align language models with human preferences. Then, OpenAI broke the internet with ChatGPT and Meta followed suit by releasing the Llama series of language models which enabled the ML community to build their very own capable chatbots. This has led to a rich ecosystem of datasets and models that have mostly focused on teaching language models to follow instructions through supervised fine-tuning (SFT).
+
+ However, we know from the [InstructGPT](https://huggingface.co/papers/2203.02155) and [Llama2](https://huggingface.co/papers/2307.09288) papers that significant gains in helpfulness and safety can be had by augmenting SFT with human (or AI) preferences. At the same time, aligning language models to a set of preferences is a fairly novel idea and there are few public resources available on how to train these models, what data to collect, and what metrics to measure for best downstream performance.
+
+ The Alignment Handbook aims to fill that gap by providing the community with a series of robust training recipes that span the whole pipeline.
+
+ ## News 🗞️
+ * **November 21, 2024**: We release the [recipe](recipes/smollm2/README.md) for fine-tuning SmolLM2-Instruct.
+ * **August 18, 2024**: We release SmolLM-Instruct v0.2, along with the [recipe](recipes/smollm/README.md) to fine-tune small LLMs 💻
+ * **April 12, 2024**: We release Zephyr 141B (A35B), in collaboration with Argilla and Kaist AI, along with the recipe to fine-tune Mixtral 8x22B with ORPO 🪁
+ * **March 12, 2024:** We release StarChat2 15B, along with the recipe to train capable coding assistants 🌟
+ * **March 1, 2024:** We release Zephyr 7B Gemma, which is a new recipe to align Gemma 7B with RLAIF 🔥
+ * **February 1, 2024:** We release a recipe to align open LLMs with Constitutional AI 📜! See the [recipe](https://github.com/huggingface/alignment-handbook/tree/main/recipes/constitutional-ai) and the [blog post](https://huggingface.co/blog/constitutional_ai) for details.
+ * **January 18, 2024:** We release a suite of evaluations of DPO vs KTO vs IPO, see the [recipe](recipes/pref_align_scan/README.md) and the [blog post](https://huggingface.co/blog/pref-tuning) for details.
+ * **November 10, 2023:** We release all the training code to replicate Zephyr-7b-β 🪁! We also release [No Robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots), a brand new dataset of 10,000 instructions and demonstrations written entirely by skilled human annotators.
+
+ ## Links 🔗
+
+ * [Zephyr 7B models, datasets, and demos](https://huggingface.co/collections/HuggingFaceH4/zephyr-7b-6538c6d6d5ddd1cbb1744a66)
+
+ ## How to navigate this project 🧭
+
+ This project is simple by design and mostly consists of:
+
+ * [`scripts`](./scripts/) to train and evaluate models. Four steps are included: continued pretraining, supervised fine-tuning (SFT) for chat, preference alignment with DPO, and combined SFT and preference alignment with ORPO. Each script supports distributed training of the full model weights with DeepSpeed ZeRO-3, or LoRA/QLoRA for parameter-efficient fine-tuning (see the example launch command below).
+ * [`recipes`](./recipes/) to reproduce models like Zephyr 7B. Each recipe takes the form of a YAML file which contains all the parameters associated with a single training run. A `gpt2-nl` recipe is also given to illustrate how this handbook can be used for language or domain adaptation, e.g. by continuing to pretrain on a different language, and then SFT and DPO tuning the result.
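As a rough sketch of how these pieces fit together, a recipe YAML is passed to one of the training scripts together with an Accelerate config (the files named below are just examples taken from this repo's recipes; adjust them to your setup):

```shell
# illustrative launch: full-weight SFT on one node with DeepSpeed ZeRO-3
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
```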
+
+ We are also working on a series of guides to explain how methods like direct preference optimization (DPO) work, along with lessons learned from gathering human preferences in practice. To get started, we recommend the following:
+
+ 1. Follow the [installation instructions](#installation-instructions) to set up your environment etc.
+ 2. Replicate Zephyr-7b-β by following the [recipe instructions](./recipes/zephyr-7b-beta/README.md).
+
+ If you would like to train chat models on your own datasets, we recommend following the dataset formatting instructions [here](./scripts/README.md#fine-tuning-on-your-datasets).
+
+
+ ## Contents
+
+ The initial release of the handbook will focus on the following techniques:
+
+ * **Continued pretraining:** adapt language models to a new language or domain, or simply improve them by continued pretraining (causal language modeling) on a new dataset.
+ * **Supervised fine-tuning:** teach language models to follow instructions, with tips on how to collect and curate your training dataset.
+ * **Reward modeling:** teach language models to distinguish model responses according to human or AI preferences.
+ * **Rejection sampling:** a simple, but powerful technique to boost the performance of your SFT model.
+ * **Direct preference optimisation (DPO):** a powerful and promising alternative to PPO.
+ * **Odds Ratio Preference Optimisation (ORPO)**: a technique to fine-tune language models with human preferences, combining SFT and DPO in a single stage.
+
+ ## Installation instructions
+
+ To run the code in this project, first create a Python virtual environment using e.g. Conda:
+
+ ```shell
+ conda create -n handbook python=3.10 && conda activate handbook
+ ```
+
+ Next, install PyTorch `v2.1.2` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+ direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
+
+ You can then install the remaining package dependencies as follows:
+
+ ```shell
+ git clone https://github.com/huggingface/alignment-handbook.git
+ cd ./alignment-handbook/
+ python -m pip install .
+ ```
+
+ You will also need Flash Attention 2 installed, which can be done by running:
+
+ ```shell
+ python -m pip install flash-attn --no-build-isolation
+ ```
+
+ > **Note**
+ > If your machine has less than 96GB of RAM and many CPU cores, reduce the `MAX_JOBS` argument, e.g. `MAX_JOBS=4 pip install flash-attn --no-build-isolation`
+
+ Next, log into your Hugging Face account as follows:
+
+ ```shell
+ huggingface-cli login
+ ```
+
+ Finally, install Git LFS so that you can push models to the Hugging Face Hub:
+
+ ```shell
+ sudo apt-get install git-lfs
+ ```
+
+ You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁!
+
+ ## Project structure
+
+ ```
+ ├── LICENSE
+ ├── Makefile    <- Makefile with commands like `make style`
+ ├── README.md   <- The top-level README for developers using this project
+ ├── chapters    <- Educational content to render on hf.co/learn
+ ├── recipes     <- Recipe configs, accelerate configs, slurm scripts
+ ├── scripts     <- Scripts to train and evaluate chat models
+ ├── setup.cfg   <- Installation config (mostly used for configuring code quality & tests)
+ ├── setup.py    <- Makes project pip installable (pip install -e .) so `alignment` can be imported
+ ├── src         <- Source code for use in this project
+ └── tests       <- Unit tests
+ ```
+
+ ## Citation
+
+ If you find the content of this repo useful in your work, please cite it as follows via `\usepackage{biblatex}`:
+
+ ```bibtex
+ @software{Tunstall_The_Alignment_Handbook,
+   author = {Tunstall, Lewis and Beeching, Edward and Lambert, Nathan and Rajani, Nazneen and Huang, Shengyi and Rasul, Kashif and Bartolome, Alvaro and M. Rush, Alexander and Wolf, Thomas},
+   license = {Apache-2.0},
+   title = {{The Alignment Handbook}},
+   url = {https://github.com/huggingface/alignment-handbook},
+   version = {0.3.0.dev0}
+ }
+ ```
assets/handbook.png ADDED
chapters/en/_toctree.yml ADDED
@@ -0,0 +1,4 @@
+ - title: Unit 0. Welcome to the RLHF Handbook!
+   sections:
+   - local: chapter0/introduction
+     title: What is this about?
chapters/en/chapter0/introduction.mdx ADDED
@@ -0,0 +1,3 @@
+ # Welcome to the RLHF Handbook!
+
+ Stay tuned for more details 🤗
config_dpo_run.yaml ADDED
@@ -0,0 +1,42 @@
1
+ # Model arguments
2
+ model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 2
14
+
15
+ # dpo trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_length: 4096
29
+ num_train_epochs: 5
30
+ output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model
31
+ overwrite_output_dir: true
32
+ per_device_eval_batch_size: 1
33
+ per_device_train_batch_size: 1
34
+ push_to_hub: False
35
+ remove_unused_columns: true
36
+ report_to:
37
+ - tensorboard
38
+ save_strategy: "steps"
39
+ save_steps: 51
40
+ save_total_limit: 30
41
+ seed: 42
42
+ warmup_ratio: 0.2
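A config like this is consumed by the handbook's training entry points rather than run directly; assuming the standard `scripts/run_dpo.py` script and the ZeRO-3 Accelerate config from this repo, a launch would look roughly like:

```shell
# sketch: multi-GPU DPO training driven by the YAML above
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    scripts/run_dpo.py config_dpo_run.yaml
```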
config_grpo_offline.yaml ADDED
@@ -0,0 +1,45 @@
1
+ # Model arguments
2
+ model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 32
14
+
15
+ # GRPO trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_prompt_length: 512
29
+ max_completion_length: 512
30
+ num_train_epochs: 5
31
+ output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_grpo
32
+ overwrite_output_dir: true
33
+ # per_device_batch_size = num_generations * per_device_prompt_num (generations sampled per prompt * prompts per device)
34
+ per_device_eval_batch_size: 4
35
+ per_device_train_batch_size: 4
36
+ num_generations: 4
37
+ push_to_hub: False
38
+ remove_unused_columns: false
39
+ report_to:
40
+ - tensorboard
41
+ save_strategy: "steps"
42
+ save_steps: 50
43
+ save_total_limit: 30
44
+ seed: 42
45
+ warmup_ratio: 0.2
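This config is consumed by `grpo_offline_run.py`; the launch command below is the one given in that script's docstring, reformatted over several lines:

```shell
# offline GRPO run with DeepSpeed ZeRO-3 on five GPUs
CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    grpo_offline_run.py config_grpo_offline.yaml
```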
config_sft_test_env.yaml ADDED
@@ -0,0 +1,42 @@
1
+ # Model arguments
2
+ model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 2
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_seq_length: 4096
29
+ num_train_epochs: 5
30
+ output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model
31
+ overwrite_output_dir: true
32
+ per_device_eval_batch_size: 1
33
+ per_device_train_batch_size: 1
34
+ push_to_hub: False
35
+ remove_unused_columns: true
36
+ report_to:
37
+ - tensorboard
38
+ save_strategy: "steps"
39
+ save_steps: 51
40
+ save_total_limit: 30
41
+ seed: 42
42
+ warmup_ratio: 0.2
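As with the DPO config above, this SFT config would typically be passed to the handbook's SFT entry point (a sketch, assuming the standard `scripts/run_sft.py` script):

```shell
# sketch: SFT training driven by the YAML above
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/deepspeed_zero3.yaml \
    scripts/run_sft.py config_sft_test_env.yaml
```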
grpo_offline_run.py ADDED
@@ -0,0 +1,217 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Offline GRPO training script for decoder language models.
18
+ CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml grpo_offline_run.py config_grpo_offline.yaml
19
+ """
20
+
21
+ import logging
22
+ import random
23
+ import sys
24
+
25
+ import datasets
26
+ import torch
27
+ import transformers
28
+ from transformers import AutoModelForCausalLM, set_seed
29
+ from trl.data_utils import maybe_apply_chat_template
30
+ from datasets import load_dataset
31
+ from alignment import (
32
+ DataArguments,
33
+ H4ArgumentParser,
34
+ ModelArguments,
35
+ SFTConfig,
36
+ apply_chat_template,
37
+ decontaminate_humaneval,
38
+ get_checkpoint,
39
+ get_datasets,
40
+ get_kbit_device_map,
41
+ get_peft_config,
42
+ get_quantization_config,
43
+ get_tokenizer,
44
+ )
45
+ from trl import SFTTrainer, setup_chat_format
46
+ from trl_012_grpo.grpo_trainer import GRPOTrainer
47
+ from trl_012_grpo.grpo_config import GRPOConfig
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+
52
+ def main():
53
+ parser = H4ArgumentParser((ModelArguments, DataArguments, GRPOConfig))
54
+ model_args, data_args, training_args = parser.parse()
55
+
56
+ # Set seed for reproducibility
57
+ set_seed(training_args.seed)
58
+
59
+ ###############
60
+ # Setup logging
61
+ ###############
62
+ logging.basicConfig(
63
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
64
+ datefmt="%Y-%m-%d %H:%M:%S",
65
+ handlers=[logging.StreamHandler(sys.stdout)],
66
+ )
67
+ log_level = training_args.get_process_log_level()
68
+ logger.setLevel(log_level)
69
+ datasets.utils.logging.set_verbosity(log_level)
70
+ transformers.utils.logging.set_verbosity(log_level)
71
+ transformers.utils.logging.enable_default_handler()
72
+ transformers.utils.logging.enable_explicit_format()
73
+
74
+ # Log on each process a small summary
75
+ logger.warning(
76
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
77
+ + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
78
+ )
79
+ logger.info(f"Model parameters {model_args}")
80
+ logger.info(f"Data parameters {data_args}")
81
+ logger.info(f"Training/evaluation parameters {training_args}")
82
+
83
+ # Check for last checkpoint
84
+ last_checkpoint = get_checkpoint(training_args)
85
+ if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
86
+ logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")
87
+
88
+ ###############
89
+ # Load datasets
90
+ ###############
91
+ raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
92
+ eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
93
+ logger.info(
94
+ f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
95
+ )
96
+ column_names = list(raw_datasets["train"].features)
97
+
98
+ ################
99
+ # Load tokenizer
100
+ ################
101
+ tokenizer = get_tokenizer(model_args, data_args)
102
+
103
+ #######################
104
+ # Load pretrained model
105
+ #######################
106
+ logger.info("*** Load pretrained model ***")
107
+ torch_dtype = (
108
+ model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
109
+ )
110
+ quantization_config = get_quantization_config(model_args)
111
+
112
+ model_kwargs = dict(
113
+ revision=model_args.model_revision,
114
+ trust_remote_code=model_args.trust_remote_code,
115
+ attn_implementation=model_args.attn_implementation,
116
+ torch_dtype=torch_dtype,
117
+ use_cache=False if training_args.gradient_checkpointing else True,
118
+ device_map=get_kbit_device_map() if quantization_config is not None else None,
119
+ quantization_config=quantization_config,
120
+ )
121
+
122
+ model = model_args.model_name_or_path
123
+ # For ChatML we need to add special tokens and resize the embedding layer
124
+ if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
125
+ model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
126
+ model, tokenizer = setup_chat_format(model, tokenizer)
127
+ model_kwargs = None
128
+
129
+ #####################
130
+ # Apply chat template
131
+ #####################
132
+ def modify_completion(example):
133
+ # Wrap the raw prompt as a single user message and apply the chat template
134
+ example['prompt'] = \
135
+ maybe_apply_chat_template({"prompt": [{"role": "user", "content": example['prompt']}]}, tokenizer=tokenizer)[
136
+ 'prompt']
137
+ return example
138
+
139
+ raw_datasets = raw_datasets.map(modify_completion)
140
+ eval_raw_datasets = eval_raw_datasets.map(modify_completion)
141
+
142
+
143
+ train_dataset = raw_datasets["train"]
144
+ eval_dataset = eval_raw_datasets["train"]
145
+
146
+ ########################
147
+ # Initialize the Trainer
148
+ ########################
149
+
150
+ # Note: this reward function is not actually used here
151
+ def reward_len(completions, **kwargs):
152
+ return [-abs(20 - len(completion)) for completion in completions]
153
+
154
+ training_args.model_init_kwargs = model_kwargs
155
+ trainer = GRPOTrainer(
156
+ model=model,
157
+ reward_funcs=reward_len,
158
+ args=training_args,
159
+ train_dataset=train_dataset,
160
+ eval_dataset=eval_dataset,
161
+ )
162
+
163
+ ###############
164
+ # Training loop
165
+ ###############
166
+ logger.info("*** Train ***")
167
+ checkpoint = None
168
+ if training_args.resume_from_checkpoint is not None:
169
+ checkpoint = training_args.resume_from_checkpoint
170
+ elif last_checkpoint is not None:
171
+ checkpoint = last_checkpoint
172
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
173
+ metrics = train_result.metrics
174
+ metrics["train_samples"] = len(train_dataset)
175
+ trainer.log_metrics("train", metrics)
176
+ trainer.save_metrics("train", metrics)
177
+ trainer.save_state()
178
+
179
+ ##################################
180
+ # Save model and create model card
181
+ ##################################
182
+ logger.info("*** Save model ***")
183
+ trainer.save_model(training_args.output_dir)
184
+ logger.info(f"Model saved to {training_args.output_dir}")
185
+
186
+ # Save everything else on main process
187
+ kwargs = {
188
+ "finetuned_from": model_args.model_name_or_path,
189
+ "dataset": list(data_args.dataset_mixer.keys()),
190
+ "dataset_tags": list(data_args.dataset_mixer.keys()),
191
+ "tags": ["alignment-handbook"],
192
+ }
193
+ if trainer.accelerator.is_main_process:
194
+ trainer.create_model_card(**kwargs)
195
+ # Restore k,v cache for fast inference
196
+ trainer.model.config.use_cache = True
197
+ trainer.model.config.save_pretrained(training_args.output_dir)
198
+
199
+ ##########
200
+ # Evaluate
201
+ ##########
202
+ if training_args.do_eval:
203
+ logger.info("*** Evaluate ***")
204
+ metrics = trainer.evaluate()
205
+ metrics["eval_samples"] = len(eval_dataset)
206
+ trainer.log_metrics("eval", metrics)
207
+ trainer.save_metrics("eval", metrics)
208
+
209
+ if training_args.push_to_hub is True:
210
+ logger.info("Pushing to hub...")
211
+ trainer.push_to_hub(**kwargs)
212
+
213
+ logger.info("*** Training complete ***")
214
+
215
+
216
+ if __name__ == "__main__":
217
+ main()
recipes/accelerate_configs/deepspeed_zero3.yaml ADDED
@@ -0,0 +1,22 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ deepspeed_config:
+   deepspeed_multinode_launcher: standard
+   offload_optimizer_device: none
+   offload_param_device: none
+   zero3_init_flag: true
+   zero3_save_16bit_model: true
+   zero_stage: 3
+ distributed_type: DEEPSPEED
+ downcast_bf16: 'no'
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: bf16
+ num_machines: 1
+ num_processes: 4
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
recipes/accelerate_configs/fsdp.yaml ADDED
@@ -0,0 +1,26 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ distributed_type: FSDP
+ downcast_bf16: 'no'
+ enable_cpu_affinity: false
+ fsdp_config:
+   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+   fsdp_backward_prefetch: BACKWARD_PRE
+   fsdp_cpu_ram_efficient_loading: true
+   fsdp_forward_prefetch: true
+   fsdp_offload_params: false
+   fsdp_sharding_strategy: FULL_SHARD
+   fsdp_state_dict_type: SHARDED_STATE_DICT
+   fsdp_sync_module_states: true
+   fsdp_use_orig_params: true
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: bf16
+ num_machines: 1
+ num_processes: 8
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
recipes/accelerate_configs/fsdp_qlora.yaml ADDED
@@ -0,0 +1,25 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ distributed_type: FSDP
+ downcast_bf16: 'no'
+ fsdp_config:
+   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+   fsdp_backward_prefetch: BACKWARD_PRE
+   fsdp_cpu_ram_efficient_loading: true
+   fsdp_forward_prefetch: false
+   fsdp_offload_params: true
+   fsdp_sharding_strategy: FULL_SHARD
+   fsdp_state_dict_type: SHARDED_STATE_DICT
+   fsdp_sync_module_states: true
+   fsdp_use_orig_params: false
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: 'no'
+ num_machines: 1
+ num_processes: 2
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
recipes/accelerate_configs/multi_gpu.yaml ADDED
@@ -0,0 +1,16 @@
+ compute_environment: LOCAL_MACHINE
+ debug: false
+ distributed_type: MULTI_GPU
+ downcast_bf16: 'no'
+ gpu_ids: all
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: bf16
+ num_machines: 1
+ num_processes: 8
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
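Fields such as `num_processes` in these Accelerate configs can be overridden at launch time; for example, the gpt2-nl recipe later in this commit runs this same config on 4 GPUs:

```shell
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/multi_gpu.yaml \
    --num_processes 4 \
    scripts/run_sft.py recipes/gpt2-nl/sft/config_full.yaml
```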
recipes/constitutional-ai/README.md ADDED
@@ -0,0 +1,24 @@
+ # Constitutional AI
+
+ This repo includes the recipe for training the following models:
+
+ * https://huggingface.co/HuggingFaceH4/mistral-7b-anthropic
+ * https://huggingface.co/HuggingFaceH4/mistral-7b-grok
+
+
+ ## Full training examples
+
+ You will require 8 GPUs (80GB of VRAM) to train the full model.
+ ```shell
+ # Step 1 - SFT
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/constitutional-ai/sft/config_{grok,anthropic}.yaml
+
+ # Step 2 - DPO
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/constitutional-ai/dpo/config_anthropic.yaml
+ # Note that we did not include the DPO recipe for grok, as that model seems overtrained and too snarky.
+ ```
+
+
+ ## Advanced: generating your own dataset
+
+ To generate the constitutional AI dataset, see https://github.com/huggingface/llm-swarm/tree/main/examples/constitutional-ai for detailed instructions if you want to build or customize the dataset.
recipes/constitutional-ai/dpo/config_anthropic.yaml ADDED
@@ -0,0 +1,41 @@
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/mistral-7b-sft-constitutional-ai
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ HuggingFaceH4/cai-conversation-harmless: 1.0
10
+ dataset_splits:
11
+ - train_prefs
12
+ - test_prefs
13
+ preprocessing_num_workers: 12
14
+
15
+ # DPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.1
18
+ do_eval: true
19
+ do_train: true
20
+ eval_strategy: steps
21
+ eval_steps: 1000
22
+ gradient_accumulation_steps: 1
23
+ gradient_checkpointing: true
24
+ hub_model_id: mistral-7b-dpo-constitutional-ai
25
+ learning_rate: 5.0e-7
26
+ log_level: info
27
+ logging_steps: 10
28
+ lr_scheduler_type: linear
29
+ max_length: 1024
30
+ max_prompt_length: 512
31
+ num_train_epochs: 3
32
+ optim: rmsprop
33
+ output_dir: data/mistral-7b-dpo-constitutional-ai
34
+ per_device_train_batch_size: 2
35
+ per_device_eval_batch_size: 8
36
+ push_to_hub: true
37
+ save_strategy: "steps"
38
+ save_steps: 100
39
+ save_total_limit: 1
40
+ seed: 42
41
+ warmup_ratio: 0.1
recipes/constitutional-ai/sft/config_anthropic.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/cai-conversation-harmless: 1.0
11
+ HuggingFaceH4/ultrachat_200k: 1.0
12
+ dataset_splits:
13
+ - train_sft
14
+ - test_sft
15
+ preprocessing_num_workers: 12
16
+
17
+ # SFT trainer config
18
+ bf16: true
19
+ do_eval: true
20
+ do_train: true
21
+ eval_strategy: epoch # One of ["no", "steps", "epoch"]
22
+ gradient_accumulation_steps: 4
23
+ gradient_checkpointing: true
24
+ gradient_checkpointing_kwargs:
25
+ use_reentrant: False
26
+ hub_model_id: mistral-7b-sft-constitutional-ai
27
+ hub_strategy: every_save
28
+ learning_rate: 2.0e-05
29
+ log_level: info
30
+ logging_steps: 5
31
+ logging_strategy: steps
32
+ lr_scheduler_type: cosine
33
+ max_seq_length: 2048
34
+ max_steps: -1
35
+ num_train_epochs: 1
36
+ output_dir: data/mistral-7b-sft-constitutional-ai
37
+ overwrite_output_dir: true
38
+ per_device_eval_batch_size: 8
39
+ per_device_train_batch_size: 8
40
+ push_to_hub: true
41
+ remove_unused_columns: true
42
+ report_to:
43
+ - tensorboard
44
+ save_strategy: "steps"
45
+ save_steps: 100
46
+ save_total_limit: 1
47
+ seed: 42
48
+ warmup_ratio: 0.1
recipes/constitutional-ai/sft/config_grok.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/grok-conversation-harmless: 0.15
11
+ HuggingFaceH4/ultrachat_200k: 1.0
12
+ dataset_splits:
13
+ - train_sft
14
+ - test_sft
15
+ preprocessing_num_workers: 12
16
+
17
+ # SFT trainer config
18
+ bf16: true
19
+ do_eval: true
20
+ do_train: true
21
+ eval_strategy: epoch # One of ["no", "steps", "epoch"]
22
+ gradient_accumulation_steps: 4
23
+ gradient_checkpointing: true
24
+ gradient_checkpointing_kwargs:
25
+ use_reentrant: False
26
+ hub_model_id: mistral-7b-sft-constitutional-ai
27
+ hub_strategy: every_save
28
+ learning_rate: 2.0e-05
29
+ log_level: info
30
+ logging_steps: 5
31
+ logging_strategy: steps
32
+ lr_scheduler_type: cosine
33
+ max_seq_length: 2048
34
+ max_steps: -1
35
+ num_train_epochs: 1
36
+ output_dir: data/mistral-7b-sft-constitutional-ai
37
+ overwrite_output_dir: true
38
+ per_device_eval_batch_size: 8
39
+ per_device_train_batch_size: 8
40
+ push_to_hub: true
41
+ remove_unused_columns: true
42
+ report_to:
43
+ - tensorboard
44
+ save_strategy: "steps"
45
+ save_steps: 100
46
+ save_total_limit: 1
47
+ seed: 42
48
+ warmup_ratio: 0.1
recipes/gpt2-nl/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Language Adaptation through Continued Pretraining
2
+
3
+ This directory shows a base example of how to use continued pretraining and further tuning to adapt a language model to new data (e.g. a new language or domain).
4
+
5
+ Three steps are needed: continued pretraining (`cpt`), supervised finetuning (`sft`), and direct preference optimisation (`dpo`). In this dummy example, we'll continue pretraining gpt2 on Dutch raw data, then sft-tuning it, and finally aligning it with DPO. Note that no extensive hyperparameters were tested in this example and that the output models are bad - it is just to show you how you can use the scripts for LM adaptation. The scripts work on 4x 3090s (24GB VRAM). If you have less powerful hardware you may need to reduce the batch size.
6
+
7
+ ## Continued pretraining
8
+
9
+ This step will further pretrain the original `gpt2` model on plain Dutch text. Note that the script will by default use the `text` column in the dataset but you can change that by specifying `text_column` in the yaml file or on the command-line.
10
+
11
+ ```shell
12
+ ACCELERATE_LOG_LEVEL=info accelerate launch \
13
+ --config_file recipes/accelerate_configs/multi_gpu.yaml \
14
+ --num_processes 4 \
15
+ scripts/run_cpt.py \
16
+ recipes/gpt2-nl/cpt/config_full.yaml
17
+ ```
18
+
19
+ ## Supervised finetuning
20
+
21
+ As other recipes, such as the famous zephyr-7b-beta recipe, have shown, we can then teach our model how to hold a conversation by finetuning it on chat-formatted data. As a base model, we'll make use of the output of the previous step.
22
+
23
+ ```shell
24
+ ACCELERATE_LOG_LEVEL=info accelerate launch \
25
+ --config_file recipes/accelerate_configs/multi_gpu.yaml \
26
+ --num_processes 4 \
27
+ scripts/run_sft.py recipes/gpt2-nl/sft/config_full.yaml
28
+ ```
29
+
30
+ ## Direct preference optimisation
31
+
32
+ Finally, to align the model better with feedback, we can finetune the SFT output with the DPO algorithm. This should improve the quality of the chat capabilities of the model.
33
+
34
+ ```shell
35
+ ACCELERATE_LOG_LEVEL=info accelerate launch \
36
+ --config_file recipes/accelerate_configs/multi_gpu.yaml \
37
+ --num_processes 4 \
38
+ scripts/run_dpo.py recipes/gpt2-nl/dpo/config_full.yaml
39
+ ```
40
+
41
+ ## Conclusion
42
+
43
+ With the steps above you can adapt an LM to a new domain, more data, or even a different language. Then, with sft and dpo, you can end up building a powerful chatbot, too! All within just three simple commands. It should be obvious that all of these follow a very similar approach, which makes them suitable to apply in parameterized slurm jobs. The neat part is that you can easily overwrite arguments in the yaml files by specifying the overwriting argument as a command-line argument, so the adaptability is also great.
recipes/gpt2-nl/cpt/config_full.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: gpt2
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+
6
+ # Data training arguments
7
+ dataset_mixer:
8
+ yhavinga/mc4_nl_cleaned: 1.0
9
+ dataset_splits:
10
+ - train
11
+ dataset_configs:
12
+ - tiny
13
+ preprocessing_num_workers: 12
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: "no"
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ hub_model_id: gpt2-cpt-dutch
24
+ hub_strategy: every_save
25
+ learning_rate: 2.0e-04
26
+ log_level: info
27
+ logging_steps: 5
28
+ logging_strategy: steps
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ max_steps: -1
32
+ num_train_epochs: 1
33
+ output_dir: data/gpt2-cpt-dutch
34
+ overwrite_output_dir: true
35
+ per_device_eval_batch_size: 8
36
+ per_device_train_batch_size: 16
37
+ push_to_hub: true
38
+ remove_unused_columns: true
39
+ report_to:
40
+ - wandb
41
+ save_strategy: "steps"
42
+ save_steps: 100
43
+ save_total_limit: 1
44
+ seed: 42
45
+ warmup_ratio: 0.1
recipes/gpt2-nl/dpo/config_full.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: BramVanroy/gpt2-sft-dutch
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+
6
+ # Data training arguments
7
+ # For definitions, see: src/h4/training/config.py
8
+ dataset_mixer:
9
+ BramVanroy/ultra_feedback_dutch: 1.0
10
+ dataset_splits:
11
+ - train_prefs
12
+ - test_prefs
13
+ preprocessing_num_workers: 12
14
+
15
+ # DPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.1
18
+ do_eval: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: gpt2-dpo-dutch
26
+ learning_rate: 5.0e-7
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 1
33
+ optim: adamw_torch
34
+ output_dir: data/gpt2-dpo-dutch
35
+ per_device_train_batch_size: 8
36
+ per_device_eval_batch_size: 8
37
+ push_to_hub: true
38
+ save_strategy: "steps"
39
+ save_steps: 100
40
+ save_total_limit: 1
41
+ seed: 42
42
+ warmup_ratio: 0.1
43
+ report_to:
44
+ - wandb
recipes/gpt2-nl/sft/config_full.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: BramVanroy/gpt2-cpt-dutch
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+
6
+ # Data training arguments
7
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
8
+ dataset_mixer:
9
+ BramVanroy/ultrachat_200k_dutch: 1.0
10
+ dataset_splits:
11
+ - train_sft
12
+ - test_sft
13
+ preprocessing_num_workers: 12
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: true
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ hub_model_id: gpt2-sft-dutch
24
+ hub_strategy: every_save
25
+ learning_rate: 2.0e-05
26
+ log_level: info
27
+ logging_steps: 5
28
+ logging_strategy: steps
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ max_steps: -1
32
+ num_train_epochs: 1
33
+ output_dir: data/gpt2-sft-dutch
34
+ overwrite_output_dir: true
35
+ per_device_eval_batch_size: 8
36
+ per_device_train_batch_size: 8
37
+ push_to_hub: true
38
+ remove_unused_columns: true
39
+ report_to:
40
+ - wandb
41
+ save_strategy: "steps"
42
+ save_steps: 100
43
+ save_total_limit: 1
44
+ seed: 42
45
+ warmup_ratio: 0.1
recipes/launch.slurm ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --ntasks-per-node=1
3
+ #SBATCH --exclusive
4
+ #SBATCH --gres=gpu:8
5
+ #SBATCH --partition=hopper-prod # Adjust this for your cluster
6
+ #SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
7
+ #SBATCH --err=/fsx/h4/logs/%x-%j.err # Adjust this for your cluster
8
+
9
+ set -x -e
10
+
11
+ source ~/.bashrc
12
+ conda activate handbook
13
+ echo "START TIME: $(date)"
14
+
15
+ MODEL=$1
16
+ TASK=$2
17
+ PRECISION=$3
18
+ ACCELERATOR=$4
19
+ OPTIONAL_ARGS=$5
20
+
21
+ # Training setup
22
+ NUM_NODES=$SLURM_NNODES
23
+ GPUS_PER_NODE=8
24
+ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
25
+ # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
26
+ CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
27
+ GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
28
+
29
+ # Split the string into individual arguments
30
+ IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"
31
+
32
+ # Loop through the arguments and find the one with "--gradient_accumulation_steps"
33
+ for arg in "${ARGS[@]}"; do
34
+ if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
35
+ # Extract the value after the equals sign
36
+ GRAD_ACC_STEPS="${arg#*=}"
37
+ break # Exit the loop once we find the desired argument
38
+ fi
39
+ done
40
+
41
+ echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
42
+ # so processes know who to talk to
43
+ MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
44
+ MASTER_PORT=6000
45
+
46
+ export CMD=" \
47
+ scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
48
+ "
49
+
50
+ export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
51
+ --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
52
+ --gradient_accumulation_steps $GRAD_ACC_STEPS \
53
+ --num_machines $NUM_NODES \
54
+ --num_processes $WORLD_SIZE \
55
+ --main_process_ip $MASTER_ADDR \
56
+ --main_process_port $MASTER_PORT \
57
+ --machine_rank \$SLURM_PROCID \
58
+ --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
59
+ --max_restarts 1 \
60
+ --role \$(hostname -s): \
61
+ --tee 3 \
62
+ "
63
+
64
+ # force crashing on nccl issues like hanging broadcast
65
+ export NCCL_ASYNC_ERROR_HANDLING=1
66
+ # export NCCL_DEBUG=INFO
67
+ # export NCCL_DEBUG_SUBSYS=COLL
68
+ # export NCCL_SOCKET_NTHREADS=1
69
+ # export NCCL_NSOCKS_PERTHREAD=1
70
+ # export CUDA_LAUNCH_BLOCKING=1
71
+
72
+ # Specific configuration optimized for the Hugging Face Compute Cluster
73
+ # Be ye warned this may not work on other clusters!
74
+ module load cuda/12.1
75
+
76
+ # srun error handling:
77
+ # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
78
+ # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
79
+ SRUN_ARGS=" \
80
+ --wait=60 \
81
+ --kill-on-bad-exit=1 \
82
+ "
83
+
84
+ clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
85
+
86
+ echo "END TIME: $(date)"
recipes/pref_align_scan/README.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Comparing Preference Alignment Algorithms
2
+ This directory contains various comparisons for three algorithms: DPO, IPO, and KTO. Each algorithm has been run in different hyperparameter configurations to study their performance. Two different models and datasets have been used to compare the performance of each algorithm:
3
+
4
+ - zephyr-beta-sft and Ultrafeedback
5
+ - OpenHermes-2.5 and the OpenOrca datasets
6
+
7
+ We release a collection containing the datasets and models used for these experiments, if you require the other trained models, we can release them on request.
8
+ You can find a longer description of these results in our [blogpost](https://huggingface.co/blog/pref-tuning)
9
+
10
+ ## Comparisons
11
+ For each algorithm, we aim to tune the beta parameter for a fixed learning rate. We vary beta from 0.1-0.9 in steps of 0.1, we have also found that in certain configurations a tiny value of beta, 0.01, can be effective. So we have included this smaller value in all our comparisons.
12
+
13
+ ## Usage
14
+ The experiments can be launched with the following bash script:
15
+ ```bash
16
+ #!/bin/bash
17
+
18
+ # Define an array containing the base configs we wish to fine tune
19
+ configs=("zephyr" "openhermes")
20
+ # Define an array of loss types
21
+ loss_types=("sigmoid" "kto_pair" "ipo")
22
+
23
+ # Define an array of beta values
24
+ betas=("0.01" "0.1" "0.2" "0.3" "0.4" "0.5" "0.6" "0.7" "0.8" "0.9")
25
+
26
+ # Outer loop for loss types
27
+ for config in "${configs[@]}"; do
28
+ for loss_type in "${loss_types[@]}"; do
29
+
30
+ # Inner loop for beta values
31
+ for beta in "${betas[@]}"; do
32
+
33
+ # Determine the job name and model revision based on loss type
34
+ job_name="$config_${loss_type}_beta_${beta}"
35
+ model_revision="${loss_type}-${beta}"
36
+
37
+ # Submit the job
38
+ sbatch --job-name=${job_name} recipes/launch.slurm pref_align_scan dpo $config deepspeed_zero3 \
39
+ "--beta=${beta} --loss_type=${loss_type} --output_dir=data/$config-7b-align-scan-${loss_type}-beta-${beta} --hub_model_revision=${model_revision}"
40
+ done
41
+ done
42
+ done
43
+ ```
44
+
45
+
46
+
47
+
48
+
49
+
recipes/pref_align_scan/dpo/config_openhermes.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: teknium/OpenHermes-2.5-Mistral-7B
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/orca_dpo_pairs: 1.0
8
+ dataset_splits:
9
+ - train_prefs
10
+ - test_prefs
11
+ preprocessing_num_workers: 12
12
+
13
+ # Training arguments with sensible defaults
14
+ bf16: true
15
+ beta: 0.01
16
+ loss_type: sigmoid
17
+ do_eval: true
18
+ do_train: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 2
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: HuggingFaceH4/openhermes-2.5-mistral-7b-dpo
26
+ hub_model_revision: v1.0
27
+
28
+ learning_rate: 5.0e-7
29
+ logging_steps: 10
30
+ lr_scheduler_type: cosine
31
+ max_prompt_length: 512
32
+ num_train_epochs: 1
33
+ optim: adamw_torch
34
+ output_dir: data/openhermes-2.5-mistral-7b-dpo-v1.0
35
+ per_device_train_batch_size: 8
36
+ per_device_eval_batch_size: 8
37
+ save_strategy: "steps"
38
+ save_steps: 100
39
+ save_total_limit: 1
40
+ seed: 42
41
+ warmup_ratio: 0.1
recipes/pref_align_scan/dpo/config_zephyr.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/zephyr-7b-sft-full
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
8
+ dataset_splits:
9
+ - train_prefs
10
+ - test_prefs
11
+ preprocessing_num_workers: 12
12
+
13
+ # Training arguments with sensible defaults
14
+ bf16: true
15
+ beta: 0.01
16
+ loss_type: sigmoid
17
+ do_eval: true
18
+ eval_strategy: steps
19
+ eval_steps: 100
20
+ gradient_accumulation_steps: 2
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ hub_model_id: zephyr-7b-align-scan
25
+ hub_model_revision: dpo-beta-0.01
26
+ learning_rate: 5.0e-7
27
+ logging_steps: 10
28
+ lr_scheduler_type: cosine
29
+ max_prompt_length: 512
30
+ num_train_epochs: 1
31
+ optim: adamw_torch
32
+ output_dir: data/zephyr-7b-align-scan-dpo-beta-0.01
33
+ per_device_train_batch_size: 8
34
+ per_device_eval_batch_size: 8
35
+ save_strategy: "steps"
36
+ save_steps: 100
37
+ save_total_limit: 1
38
+ seed: 42
39
+ warmup_ratio: 0.1
recipes/pref_align_scan/launch_scan.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Define an array containing the base configs we wish to fine tune
3
+ configs=("zephyr" "openhermes")
4
+ # Define an array of loss types
5
+ loss_types=("sigmoid" "kto_pair" "ipo")
6
+ # Define an array of beta values
7
+ betas=("0.01" "0.1" "0.2" "0.3" "0.4" "0.5" "0.6" "0.7" "0.8" "0.9")
8
+
9
+ # Outer loop for loss types
10
+ for config in "${configs[@]}"; do
11
+ for loss_type in "${loss_types[@]}"; do
12
+
13
+ # Inner loop for beta values
14
+ for beta in "${betas[@]}"; do
15
+ # Determine the job name and model revision based on loss type
16
+ job_name="$config_${loss_type}_beta_${beta}"
17
+ model_revision="${loss_type}-${beta}"
18
+
19
+ # Submit the job
20
+ sbatch --job-name=${job_name} recipes/launch.slurm pref_align_scan dpo $config deepspeed_zero3 \
21
+ "--beta=${beta} --loss_type=${loss_type} --output_dir=data/$config-7b-align-scan-${loss_type}-beta-${beta} --hub_model_revision=${model_revision}"
22
+ done
23
+ done
24
+ done
recipes/smollm/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train SmolLM-Instruct
3
+
4
+ We build the [SmolLM-Instruct](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) (v0.2) models (135M, 360M and 1.7B) by doing SFT on a mix of these datasets:
5
+ - a dataset of 2k simple everyday conversations we generated by llama3.1-70B [everyday-conversations-llama3.1-2k](https://huggingface.co/datasets/HuggingFaceTB/everyday-conversations-llama3.1-2k/)
6
+ - [Magpie-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
7
+ - [StarCoder2-Self-OSS-Instruct](https://huggingface.co/datasets/bigcode/self-oss-instruct-sc2-exec-filter-50k)
8
+ - A small subset of [OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5)
9
+
10
+ ## Setup
11
+
12
+ Follow the installation instructions in https://github.com/huggingface/alignment-handbook/tree/main?tab=readme-ov-file#installation-instructions
13
+
14
+ ## Training
15
+ We train the models on 8 GPUs using the following command:
16
+
17
+ ```shell
18
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm/sft/config.yaml
19
+ ```
recipes/smollm/sft/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceTB/SmolLM-360M
3
+ model_revision: main
4
+ tokenizer_name_or_path: HuggingFaceTB/SmolLM-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
5
+ torch_dtype: bfloat16
6
+ use_flash_attention_2: true
7
+
8
+ # Data training arguments
9
+ dataset_mixer:
10
+ HuggingFaceTB/Magpie-Pro-300K-Filtered-H4: 1.0
11
+ HuggingFaceTB/self-oss-instruct-sc2-H4: 1.0
12
+ HuggingFaceTB/OpenHermes-2.5-H4: 0.001
13
+ HuggingFaceTB/everyday-conversations-llama3.1-2k: 1.0
14
+ HuggingFaceTB/instruct-data-basics-smollm-H4: 1.0
15
+
16
+ dataset_splits:
17
+ - train_sft
18
+ - test_sft
19
+ preprocessing_num_workers: 36
20
+
21
+ # SFT trainer config
22
+ bf16: true
23
+ dataset_kwargs:
24
+ add_special_tokens: false # We already wrap <bos> and <eos> in the chat template
25
+ append_concat_token: false # No need to add <eos> across samples
26
+ do_eval: true
27
+ evaluation_strategy: epoch
28
+ gradient_accumulation_steps: 4
29
+ gradient_checkpointing: true
30
+ gradient_checkpointing_kwargs:
31
+ use_reentrant: false
32
+ hub_model_id: smollm-360M-instruct-new
33
+ hub_strategy: every_save
34
+ learning_rate: 1.0e-03 # 3e-4
35
+ log_level: info
36
+ logging_steps: 5
37
+ logging_strategy: steps
38
+ lr_scheduler_type: cosine
39
+ max_seq_length: 2048
40
+ max_steps: -1
41
+ num_train_epochs: 1
42
+ output_dir: data/smollm-360M-instruct-new
43
+ overwrite_output_dir: true
44
+ per_device_eval_batch_size: 4
45
+ per_device_train_batch_size: 4
46
+ push_to_hub: true
47
+ remove_unused_columns: true
48
+ report_to:
49
+ - tensorboard
50
+ - wandb
51
+ save_strategy: "no"
52
+ seed: 42
53
+ warmup_ratio: 0.1
recipes/smollm2/README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train SmolLM2-1.7B-Instruct
3
+
4
+ We build the [SmolLM2-Instruct](https://huggingface.co/collections/HuggingFaceTB/smollm2-6723884218bcda64b34d7db9) by doing SFT on [SmolTalk](https://huggingface.co/datasets/HuggingFaceTB/smoltalk) and then DPO on [UltraFeedBack](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized).
5
+
6
+ ## Setup
7
+
8
+ Follow the installation instructions in https://github.com/huggingface/alignment-handbook/tree/main?tab=readme-ov-file#installation-instructions
9
+
10
+ ## Training
11
+ We train the 1.7B on 8 GPUs using the following command:
12
+
13
+ ```shell
14
+ # SFT
15
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm2/sft/config.yaml
16
+
17
+ # DPO
18
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config.yaml
19
+ ```
20
+
21
+ For the 135M and 360M we use [smol-smoltalk](https://huggingface.co/datasets/HuggingFaceTB/smol-smoltalk) dataset for SFT and UltraFeedback for DPO:
22
+ ```shell
23
+ # SFT
24
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm2/sft/config_smol.yaml
25
+
26
+ # DPO
27
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config_smol.yaml
28
+ ```
recipes/smollm2/dpo/config.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: loubnabnl/smollm2-1.7B-sft
3
+ torch_dtype: bfloat16
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
8
+
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.5
17
+ do_eval: true
18
+ hub_private_repo: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: smollm2-1.7B-dpo
26
+ learning_rate: 1.0e-6
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 3
33
+ optim: adamw_torch
34
+ output_dir: data/smollm2-1.7B-dpo
35
+ per_device_train_batch_size: 2
36
+ per_device_eval_batch_size: 4
37
+ push_to_hub: true
38
+ report_to:
39
+ - tensorboard
40
+ - wandb
41
+ save_strategy: "no"
42
+ seed: 42
43
+ warmup_ratio: 0.1
recipes/smollm2/dpo/config_smol.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: loubnabnl/smollm2-360M-sft # we use this script for the 135M model too
3
+ torch_dtype: bfloat16
4
+
5
+ # Data training arguments
6
+ dataset_mixer:
7
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
8
+
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.5
17
+ do_eval: true
18
+ hub_private_repo: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: smollm2-360M-dpo
26
+ learning_rate: 1.0e-6
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 2
33
+ optim: adamw_torch
34
+ output_dir: data/smollm2-360M-dpo
35
+ per_device_train_batch_size: 2
36
+ per_device_eval_batch_size: 4
37
+ push_to_hub: true
38
+ report_to:
39
+ - tensorboard
40
+ - wandb
41
+ save_strategy: "no"
42
+ seed: 42
43
+ warmup_ratio: 0.1
recipes/smollm2/sft/config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceTB/SmolLM2-1.7B
3
+ model_revision: main
4
+ tokenizer_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
5
+ torch_dtype: bfloat16
6
+ use_flash_attention_2: true
7
+
8
+ # Data training arguments
9
+ dataset_mixer:
10
+ HuggingFaceTB/smoltalk: 1.0
11
+
12
+ dataset_configs:
13
+ - all
14
+
15
+ dataset_splits:
16
+ - train
17
+ - test
18
+ preprocessing_num_workers: 36
19
+
20
+ # SFT trainer config
21
+ bf16: true
22
+ do_eval: true
23
+ evaluation_strategy: epoch
24
+ gradient_accumulation_steps: 4
25
+ gradient_checkpointing: true
26
+ gradient_checkpointing_kwargs:
27
+ use_reentrant: false
28
+ hub_model_id: smollm2-1.7B-sft
29
+ hub_strategy: every_save
30
+ learning_rate: 3.0e-04
31
+ log_level: info
32
+ logging_steps: 5
33
+ logging_strategy: steps
34
+ lr_scheduler_type: cosine
35
+ max_seq_length: 8192
36
+ max_steps: -1
37
+ num_train_epochs: 2
38
+ output_dir: data/smollm2-1.7B-sft
39
+ overwrite_output_dir: true
40
+ per_device_eval_batch_size: 4
41
+ per_device_train_batch_size: 4
42
+ push_to_hub: true
43
+ remove_unused_columns: true
44
+ report_to:
45
+ - tensorboard
46
+ - wandb
47
+ save_strategy: "no"
48
+ seed: 42
49
+ warmup_ratio: 0.1
recipes/smollm2/sft/config_smol.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceTB/SmolLM2-360M # we use this script for the 135M model too
3
+ model_revision: main
4
+ tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
5
+ torch_dtype: bfloat16
6
+ use_flash_attention_2: true
7
+
8
+ # Data training arguments
9
+ dataset_mixer:
10
+ HuggingFaceTB/smol-smoltalk: 1.0
11
+
12
+ dataset_splits:
13
+ - train
14
+ - test
15
+ preprocessing_num_workers: 36
16
+
17
+ # SFT trainer config
18
+ bf16: true
19
+ do_eval: true
20
+ evaluation_strategy: epoch
21
+ gradient_accumulation_steps: 4
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: false
25
+ hub_model_id: smollm2-360M-sft
26
+ hub_strategy: every_save
27
+ learning_rate: 1.0e-03 # 3e-4
28
+ log_level: info
29
+ logging_steps: 5
30
+ logging_strategy: steps
31
+ lr_scheduler_type: cosine
32
+ max_seq_length: 8192
33
+ max_steps: -1
34
+ num_train_epochs: 2
35
+ output_dir: data/smollm2-360M-sft
36
+ overwrite_output_dir: true
37
+ per_device_eval_batch_size: 4
38
+ per_device_train_batch_size: 4
39
+ push_to_hub: true
40
+ remove_unused_columns: true
41
+ report_to:
42
+ - tensorboard
43
+ - wandb
44
+ save_strategy: "no"
45
+ seed: 42
46
+ warmup_ratio: 0.1
recipes/starchat2-15b/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train StarChat2
3
+
4
+ Similar to how we trained Zephyr 7B Beta in our [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
5
+
6
+ 1. Apply SFT to fine-tune [StarCoder2 15B](https://huggingface.co/bigcode/starcoder2-15b) on a blend of chat, code, and math datastets. The result is an SFT model like [`starchat2-15b-sft-v0.1`](https://huggingface.co/HuggingFaceH4/starchat2-15b-sft-v0.1).
7
+ 2. Align the SFT model to AI feedback via DPO on the UltraFeedback and Orca DPO Pairs datasets. The result is a DPO model like [`starchat2-15b-v0.1`](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1).
8
+
9
+ See below for commands to train these models using DeepSpeed ZeRO-3.
10
+
11
+ ## Full training examples
12
+
13
+ You will require 8 GPUs (80GB of VRAM) to train the full model - alternatively, you can train on 1 GPU by adjusting `per_device_train_batch_size` and `gradient_accumulation_steps` to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
14
+
15
+ ```shell
16
+ # Step 1 - SFT
17
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/starchat2-15b/sft/config_v0.1.yaml
18
+
19
+ # Step 2 - DPO
20
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/starchat2-15b/dpo/config_v0.1.yaml
21
+ ```
recipes/starchat2-15b/dpo/config_v0.1.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: HuggingFaceH4/starchat2-15b-sft-v0.1
3
+ torch_dtype: bfloat16
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ HuggingFaceH4/orca_dpo_pairs: 1.0
10
+ dataset_splits:
11
+ - train_prefs
12
+ - test_prefs
13
+ preprocessing_num_workers: 12
14
+
15
+ # DPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.05
18
+ do_eval: true
19
+ eval_strategy: steps
20
+ eval_steps: 100
21
+ gradient_accumulation_steps: 8
22
+ gradient_checkpointing: true
23
+ gradient_checkpointing_kwargs:
24
+ use_reentrant: False
25
+ hub_model_id: starchat2-15b-dpo-v0.1
26
+ learning_rate: 5.0e-7
27
+ log_level: info
28
+ logging_steps: 10
29
+ lr_scheduler_type: cosine
30
+ max_length: 1024
31
+ max_prompt_length: 512
32
+ num_train_epochs: 2
33
+ optim: adamw_torch
34
+ output_dir: data/starchat2-15b-dpo-v0.1
35
+ per_device_train_batch_size: 2
36
+ per_device_eval_batch_size: 4
37
+ push_to_hub: true
38
+ report_to:
39
+ - tensorboard
40
+ - wandb
41
+ save_strategy: "no"
42
+ seed: 42
43
+ warmup_ratio: 0.1
recipes/starchat2-15b/sft/config_v0.1.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: bigcode/starcoder2-15b
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/airoboros-3.2: 1.0
11
+ HuggingFaceH4/Code-Feedback: 1.0
12
+ HuggingFaceH4/orca-math-word-problems-200k: 1.0
13
+ HuggingFaceH4/SystemChat: 1.0
14
+ HuggingFaceH4/capybara: 1.0
15
+ dataset_splits:
16
+ - train_sft
17
+ - test_sft
18
+ preprocessing_num_workers: 24
19
+
20
+ # SFT trainer config
21
+ bf16: true
22
+ do_eval: true
23
+ eval_strategy: epoch
24
+ gradient_accumulation_steps: 2
25
+ gradient_checkpointing: true
26
+ gradient_checkpointing_kwargs:
27
+ use_reentrant: false
28
+ hub_model_id: starchat2-15b-v0.1
29
+ hub_strategy: every_save
30
+ learning_rate: 2.0e-05
31
+ log_level: info
32
+ logging_steps: 5
33
+ logging_strategy: steps
34
+ lr_scheduler_type: cosine
35
+ max_seq_length: 2048
36
+ max_steps: -1
37
+ num_train_epochs: 3
38
+ output_dir: data/starchat2-15b-v0.1
39
+ overwrite_output_dir: true
40
+ per_device_eval_batch_size: 8
41
+ per_device_train_batch_size: 8
42
+ push_to_hub: true
43
+ remove_unused_columns: true
44
+ report_to:
45
+ - tensorboard
46
+ - wandb
47
+ save_strategy: "no"
48
+ seed: 42
49
+ warmup_ratio: 0.1
recipes/zephyr-141b-A35b/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to train Zephyr-141B-A35B with ORPO
3
+
4
+ This model is fine-tuned via a novel alignment algorithm called [Odds Ratio Preference Optimization (ORPO)](https://huggingface.co/papers/2403.07691). ORPO does not require an SFT step to achieve high performance and is thus much more computationally efficient than methods like DPO and PPO. To train Zephyr-141B-A35B, we used the [`argilla/distilabel-capybara-dpo-7k-binarized`](https://huggingface.co/datasets/argilla/distilabel-capybara-dpo-7k-binarized) preference dataset, which consists of synthetic, high-quality, multi-turn preferences that have been scored via LLMs.
5
+
6
+ See below for commands to train these models using FSDP. **Note:** we found it was not possible to train this large model with DeepSpeed ZeRO-3 due to unresolved NCCL errors which cause GPUs to hang.
7
+
8
+ ## Full training examples
9
+
10
+ You will require 4 nodes of 8 GPUs (80GB of VRAM) to train the full model - alternatively, you may be able to train on fewer GPUs by adjusting `per_device_train_batch_size` and `gradient_accumulation_steps` and `num_train_epochs` to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
11
+
12
+ To run with Slurm, use:
13
+
14
+ ```shell
15
+ sbatch --job-name=handbook_sft --nodes=4 recipes/launch.slurm zephyr-141b-A35b orpo full fsdp
16
+ ```
17
+
18
+ Under the hood, this calls the following script which can be adapted to other models and datasets:
19
+
20
+
21
+ ```shell
22
+ ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch --config_file recipes/accelerate_configs/fsdp.yaml scripts/run_orpo.py recipes/zephyr-141b-A35b/orpo/config_full.yaml
23
+ ```
recipes/zephyr-141b-A35b/orpo/config_full.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: mistral-community/Mixtral-8x22B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ argilla/distilabel-capybara-dpo-7k-binarized: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 8
14
+
15
+ # ORPOTrainer arguments
16
+ bf16: true
17
+ beta: 0.05
18
+ gradient_accumulation_steps: 1
19
+ gradient_checkpointing: true
20
+ gradient_checkpointing_kwargs:
21
+ use_reentrant: true
22
+ hub_model_id: zephyr-orpo-141b-A35b
23
+ learning_rate: 5.0e-6
24
+ log_level: info
25
+ logging_steps: 10
26
+ lr_scheduler_type: inverse_sqrt
27
+ max_length: 2048
28
+ max_prompt_length: 1792
29
+ num_train_epochs: 3
30
+ optim: adamw_bnb_8bit
31
+ output_dir: data/zephyr-orpo-141b-A35b
32
+ per_device_train_batch_size: 1
33
+ push_to_hub: true
34
+ report_to:
35
+ - tensorboard
36
+ - wandb
37
+ save_strategy: "no"
38
+ seed: 42
39
+ warmup_steps: 100
recipes/zephyr-7b-beta/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Instructions to Replicate Zephyr-7b-β
3
+
4
+ As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
5
+
6
+ 1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)). The result is an SFT model like [`zephyr-7b-sft-full`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) or [`zephyr-7b-sft-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora).
7
+ 2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is a DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-qlora).
8
+
9
+ **Note:** after the release of Zephyr, the team at [Argilla](https://argilla.io) found that the source UltraFeedback dataset had a few thousand incorrect preference labels from GPT-4. Additionally, TRL's `SFTTrainer` had a bug in the learning rate scheduler which terminated training early. Accounting for these changes led us to find a better set of hyperparameters from those described in the technical report. In particular, for DPO training we found that training for 1 epoch with `beta=0.01` was sufficient to achieve comparable performance to `zephyr-7b-beta` (vs. 3 epochs with `beta=0.1`).
10
+
11
+ See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
12
+
13
+ ## Full training examples
14
+
15
+ You will require 8 GPUs (80GB of VRAM) to train the full model.
16
+ ```shell
17
+ # Step 1 - SFT
18
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
19
+
20
+ # Step 2 - DPO
21
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
22
+ ```
23
+
24
+ ## QLoRA training examples
25
+
26
+ Train faster with flash-attention 2 (GPU supporting FA2: A100, H100, etc)
27
+ ```````shell
28
+ # Step 1 - SFT
29
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true
30
+
31
+ # Step 2 - DPO
32
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml
33
+ ```````
34
+
35
+ P.S. Using Flash Attention also allows you to drastically increase the batch size (x2 in my case)
36
+
37
+ Train without flash-attention (i.e. via PyTorch's scaled dot product attention):
38
+ ```````shell
39
+ # Step 1 - SFT
40
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true --attn_implementation=sdpa
41
+
42
+ # Step 2 - DPO
43
+ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml --attn_implementation=sdpa
44
+ ```````
recipes/zephyr-7b-beta/dpo/config_full.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/zephyr-7b-sft-full
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.01
17
+ do_eval: true
18
+ eval_strategy: steps
19
+ eval_steps: 100
20
+ gradient_accumulation_steps: 2
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ hub_model_id: zephyr-7b-dpo-full
25
+ learning_rate: 5.0e-7
26
+ log_level: info
27
+ logging_steps: 10
28
+ lr_scheduler_type: cosine
29
+ max_length: 1024
30
+ max_prompt_length: 512
31
+ num_train_epochs: 1
32
+ optim: adamw_torch
33
+ output_dir: data/zephyr-7b-dpo-full
34
+ per_device_train_batch_size: 8
35
+ per_device_eval_batch_size: 8
36
+ push_to_hub: true
37
+ save_strategy: "steps"
38
+ save_steps: 100
39
+ save_total_limit: 1
40
+ seed: 42
41
+ warmup_ratio: 0.1
recipes/zephyr-7b-beta/dpo/config_qlora.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: alignment-handbook/zephyr-7b-sft-qlora
3
+ torch_dtype: bfloat16
4
+ attn_implementation: flash_attention_2
5
+
6
+ # LoRA arguments
7
+ use_peft: true
8
+ load_in_4bit: true
9
+ lora_r: 128
10
+ lora_alpha: 128
11
+ lora_dropout: 0.05
12
+ lora_target_modules:
13
+ - q_proj
14
+ - k_proj
15
+ - v_proj
16
+ - o_proj
17
+ - gate_proj
18
+ - up_proj
19
+ - down_proj
20
+
21
+ # Data training arguments
22
+
23
+ dataset_mixer:
24
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
25
+ dataset_splits:
26
+ - train_prefs
27
+ - test_prefs
28
+ preprocessing_num_workers: 12
29
+
30
+ # DPOTrainer arguments
31
+ bf16: true
32
+ beta: 0.01
33
+ do_eval: true
34
+ eval_strategy: steps
35
+ eval_steps: 100
36
+ gradient_accumulation_steps: 4
37
+ gradient_checkpointing: true
38
+ gradient_checkpointing_kwargs:
39
+ use_reentrant: false
40
+ hub_model_id: zephyr-7b-dpo-qlora
41
+ learning_rate: 5.0e-6
42
+ log_level: info
43
+ logging_steps: 10
44
+ lr_scheduler_type: cosine
45
+ max_length: 1024
46
+ max_prompt_length: 512
47
+ num_train_epochs: 1
48
+ optim: paged_adamw_32bit
49
+ output_dir: data/zephyr-7b-dpo-qlora # It is handy to append `hub_model_revision` to keep track of your local experiments
50
+ per_device_train_batch_size: 4
51
+ per_device_eval_batch_size: 8
52
+ push_to_hub: true
53
+ save_strategy: "steps"
54
+ save_steps: 100
55
+ save_total_limit: 1
56
+ seed: 42
57
+ warmup_ratio: 0.1
recipes/zephyr-7b-beta/sft/config_full.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ HuggingFaceH4/ultrachat_200k: 1.0
11
+ dataset_splits:
12
+ - train_sft
13
+ - test_sft
14
+ preprocessing_num_workers: 12
15
+
16
+ # SFT trainer config
17
+ bf16: true
18
+ do_eval: true
19
+ eval_strategy: epoch
20
+ gradient_accumulation_steps: 1
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ hub_model_id: zephyr-7b-sft-full
25
+ hub_strategy: every_save
26
+ learning_rate: 2.0e-05
27
+ log_level: info
28
+ logging_steps: 5
29
+ logging_strategy: steps
30
+ lr_scheduler_type: cosine
31
+ max_seq_length: 2048
32
+ max_steps: -1
33
+ num_train_epochs: 1
34
+ output_dir: data/zephyr-7b-sft-full
35
+ overwrite_output_dir: true
36
+ per_device_eval_batch_size: 8
37
+ per_device_train_batch_size: 16
38
+ push_to_hub: true
39
+ remove_unused_columns: true
40
+ report_to:
41
+ - tensorboard
42
+ save_strategy: "steps"
43
+ save_steps: 100
44
+ save_total_limit: 1
45
+ seed: 42
46
+ warmup_ratio: 0.1