Update modeling_motif.py

#1
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE CHANGED
@@ -8,18 +8,17 @@ Motif-2.6B Release Date: June 9, 2025
8
  "Motif Technologies" or "we" means Motif Technologies Corp.
9
  By clicking "I Accept" below or by using or distributing any portion or element of the Motif Materials, you agree to be bound by this Agreement.
10
  1. License Rights and Redistribution.
 
11
  a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Motif Technologies' intellectual property or other rights owned by Motif Technologies embodied in the Motif Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Motif Materials.
12
  b. Redistribution and Use.
13
+ i. If you distribute or make available the Motif Materials (or any derivative works thereof), or a product or service (including another AI model) that contains any of them, you shall (A) provide a copy of this Agreement with any such Motif Materials; and (B) prominently display "Built with Motif" on a related website, user interface, blogpost, about page, or product documentation. If you use the Motif Materials or any outputs or results of the Motif Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include "Motif-2.6B" at the beginning of any such AI model name.
14
+ ii. If you receive Motif Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
15
+ iii. You must retain in all copies of the Motif Materials that you distribute the following attribution notice within a "Notice" text file distributed as a part of such copies: "Motif-2.6B is licensed under the Motif-2.6B Community License, Copyright © Motif Technologies Corp. All Rights Reserved."
16
+ iv. Your use of the Motif Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Motif Materials (available at https://motiftech.io), which is hereby incorporated by reference into this Agreement.
17
 
18
  2. Additional Commercial Terms. If, on the Motif-2.6B version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee's affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Motif Technologies, which Motif Technologies may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Motif Technologies otherwise expressly grants you such license.
19
  3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE MOTIF MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, AND MOTIF TECHNOLOGIES DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MOTIF MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MOTIF MATERIALS AND ANY OUTPUT AND RESULTS.
20
  4. Limitation of Liability. IN NO EVENT WILL MOTIF TECHNOLOGIES OR ITS SHAREHOLDER OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF MOTIF TECHNOLOGIES OR ITS SHAREHOLDER OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
21
+ 5. Intellectual Property.
22
  a. No trademark licenses are granted under this Agreement, and in connection with the Motif Materials, neither Motif Technologies nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Motif Materials or as set forth in this Section 5(a). Motif Technologies hereby grants you a license to use "Motif" (the "Mark") solely as required to comply with the last sentence of Section 1.b.i. All goodwill arising out of your use of the Mark will inure to the benefit of Motif Technologies.
23
  b. Subject to Motif Technologies' ownership of Motif Materials and derivatives made by or for Motif Technologies, with respect to any derivative works and modifications of the Motif Materials that are made by you, as between you and Motif Technologies, you are and will be the owner of such derivative works and modifications.
24
  c. If you institute litigation or other proceedings against Motif Technologies, Motif Technologies' shareholder or affiliate or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Motif Materials or Motif-2.6B outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Motif Technologies from and against any claim by any third party arising out of or related to your use or distribution of the Motif Materials.
README.md CHANGED
@@ -1,5 +1,234 @@
1
- ---
2
- license: other
3
- license_name: motif-license
4
- license_link: LICENSE
5
- ---
1
+ ---
2
+ license: other
3
+ license_name: motif-license
4
+ license_link: LICENSE
5
+ language:
6
+ - en
7
+ - ko
8
+ ---
9
+
10
+ *Last update: 8th June 2025*
11
+
12
+ # Introduction
13
+
14
+ We announce **Motif 2.6B**, a 2.6 billion parameter language model trained from scratch on AMD Instinct™ MI250X GPUs. Motif 2.6B marks our very first step toward building helpful, reliable AI aligned with human values. With this initial release, our goal is for Motif 2.6B to match the performance of well-known open-source models such as Gemma, Llama, and Phi — particularly those in the sLLM regime.
15
+
16
+ # Training information
17
+
18
+ - GPUs: 384 MI250X
19
+ - Training time: 42 days
20
+ - Training data: 2.4T tokens
21
+
22
+ *Notice: A detailed technical report will be released at a later time.*
23
+
24
+ # Evaluation
25
+
26
+ When models are released, their accompanying technical reports or papers often present benchmark results based on evaluation settings chosen by the developers. While this is a common and understandable practice, it can lead to challenges when comparing models across different organizations. The same model may yield different scores depending on evaluation conditions, and details of these conditions are not always fully disclosed. This lack of standardization can make it difficult for the open-source community to interpret and trust reported results. We therefore reference performance scores based on the official numbers reported by each model’s developers in their respective publications.
27
+
28
+ To illustrate how much evaluation scores can vary across reports, we provide concrete examples of benchmark score differences for major models in the **Evaluation Appendix**.
29
+
30
+ ### Comparison to Mistral 7B by Mistral AI
31
+
32
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Mistral 7B technical report](https://arxiv.org/pdf/2310.06825).
33
+
34
+ |Benchmark|Metric|Mistral 7B|Motif 2.6B|Improvement|
35
+ |---|---|---|---|---|
36
+ |MMLU|5-shot|60.1|57.93|-3.61%|
37
+ |HellaSwag|0-shot|81.3|61.35|-24.54%|
38
+ |WinoG|0-shot|75.3|59.91|-20.44%|
39
+ |PIQA|0-shot|83|75.95|-8.49%|
40
+ |Arc-e|0-shot|80|87.21|+9.01%|
41
+ |Arc-c|0-shot|55.5|74.2|+33.69%|
42
+ |NQ|5-shot|28.8|11.14|-61.32%|
43
+ |TriviaQA|5-shot|69.9|54.97|-21.36%|
44
+ |HumanEval|0-shot|30.5|68.3|+123.93%|
45
+ |MBPP|3-shot|47.5|60.3|+26.95%|
46
+ |MATH|4-shot, maj@4|13.1|40.2*|+206.87%|
47
+ |GSM8K|8-shot, maj@8|52.2|77.71|+48.87%|
48
+ ||||**Average**|**+33.77%**|
49
+
50
+ \* : We report the 4-shot, maj@1 score instead of the 4-shot, maj@4.
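For reference, the Improvement columns in this table and the tables below are the plain relative difference between the Motif 2.6B score and the baseline score, expressed as a percentage. A minimal sketch of that arithmetic, with the function name and the handful of spot-check values chosen by us (scores copied from the table above), not an official evaluation script:

```python
def improvement(motif: float, baseline: float) -> float:
    """Relative difference of a Motif 2.6B score over a baseline score, in percent."""
    return (motif - baseline) / baseline * 100

# (baseline, Motif 2.6B) score pairs copied from the Mistral 7B table above.
scores = {
    "MMLU (5-shot)": (60.1, 57.93),
    "HumanEval (0-shot)": (30.5, 68.3),
    "GSM8K (8-shot, maj@8)": (52.2, 77.71),
}

for name, (baseline, motif) in scores.items():
    print(f"{name}: {improvement(motif, baseline):+.2f}%")
# MMLU (5-shot): -3.61%
# HumanEval (0-shot): +123.93%
# GSM8K (8-shot, maj@8): +48.87%
```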
51
+
52
+ ### Comparison to the Gemma series by Google
53
+
54
+ #### Gemma 1 & 2
55
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Gemma 2 technical report](https://arxiv.org/abs/2408.00118).
56
+
57
+ *Note: Although referred to as "2B", Gemma 2 2B actually has <u>2.6 billion</u> parameters.*
58
+
59
+ |Benchmark|Metric|Gemma 1 2B|Gemma 1 7B|Gemma 2 2B|Gemma 2 9B|Motif 2.6B|Improvement(over 1 2B)|Improvement(over 1 7B)|Improvement(over 2 2B)|Improvement(over 2 9B)|
60
+ |---|---|---|---|---|---|---|---|---|---|---|
61
+ |MMLU|5-shot|42.3|64.4|52.2|71.3|57.93|+36.95%|-10.05%|+10.98%|-18.75%|
62
+ |ARC-C|25-shot|48.5|61.1|55.7|68.4|75.08|+54.80%|+22.88%|+34.79%|+9.77%|
63
+ |GSM8K|5-shot|15.1|51.8|24.3|68.6|67.85|+349.34%|+30.98%|+179.22%|-1.09%|
64
+ |AGIEval|3-5-shot|24.2|44.9|31.5|52.8|-|-|-|-|-|
65
+ |DROP|3-shot, F1|48.5|56.3|51.2|69.4|29.33|-39.53%|-47.90%|-42.71%|-57.74%|
66
+ |BBH|3-shot, CoT|35.2|59|41.9|68.2|48.56|+37.95%|-17.69%|+15.89%|-28.80%|
67
+ |Winogrande|5-shot|66.8|79|71.3|80.6|67.09|+0.43%|-15.08%|-5.90%|-16.76%|
68
+ |HellaSwag|10-shot|71.7|82.3|72.9|81.9|69.89|-2.52%|-15.08%|-4.13%|-14.66%|
69
+ |MATH|4-shot|11.8|24.3|16|36.6|40.2|+240.88%|+65.43%|+151.25%|+9.84%|
70
+ |ARC-e|0-shot|73.2|81.5|80.6|88|87.21|+19.14%|+7.01%|+8.20%|-0.90%|
71
+ |PIQA|0-shot|77.3|81.2|78.4|81.7|75.95|-1.75%|-6.47%|-3.13%|-7.04%|
72
+ |SIQA|0-shot|49.7|51.8|51.9|53.4|61.97|+24.69%|+19.63%|+19.40%|+16.05%|
73
+ |Boolq|0-shot|69.4|83.2|72.7|84.2|67.76|-2.36%|-18.56%|-6.80%|-19.52%|
74
+ |TriviaQA|5-shot|53.2|63.4|60.4|76.6|54.97|+3.33%|-13.30%|-8.99%|-28.24%|
75
+ |NQ|5-shot|12.5|23|17.1|29.2|10.91|-12.72%|-52.57%|-36.20%|-62.64%|
76
+ |HumanEval|pass@1|22|32.3|20.1|40.2|68.3|+210.45%|+111.46%|+239.80%|+69.90%|
77
+ |MBPP|3-shot|29.2|44.4|30.2|52.4|60.3|+106.51%|+35.81%|+99.67%|+15.08%|
78
+ |||||||**Average**|**+84.76%**|**+1.69%**|**+42.42%**|**-14.78%**|
79
+
80
+ #### Gemma 3
81
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Gemma 3 technical report](https://arxiv.org/abs/2503.19786).
82
+
83
+ |Benchmark|Metric|Gemma 3 1B|Gemma 3 4B|Motif 2.6B|Improvement(over 1B)|Improvement(over 4B)|
84
+ |---|---|---|---|---|---|---|
85
+ |HellaS|10-shot|62.3|77.2|69.89|+12.18%|-9.47%|
86
+ |BoolQ|0-shot|63.2|72.3|67.76|+7.22%|-6.28%|
87
+ |PIQA|0-shot|73.8|79.6|75.59|+2.43%|-5.04%|
88
+ |SIQA|0-shot|48.9|51.9|61.97|+26.73%|+19.40%|
89
+ |TQA|5-shot|39.8|65.8|54.97|+38.12%|-16.46%|
90
+ |NQ|5-shot|9.48|20|10.91|+15.08%|-45.45%|
91
+ |ARC-C|25-shot|38.4|56.2|75.08|+95.52%|+33.59%|
92
+ |ARC-E|0-shot|73|82.4|87.21|+19.47%|+5.84%|
93
+ |WinoG|5-shot|58.2|64.7|67.09|+15.27%|+3.69%|
94
+ |BBH|few-shot, CoT|28.4|50.9|48.56|+70.99%|-4.60%|
95
+ |Drop|1-shot, F1|42.4|60.1|29.33|-30.83%|-51.20%|
96
+ |MMLU|5-shot|-|59.6|57.93|-|-2.80%|
97
+ |MMLUpro|5-shot, CoT|-|29.2|-|-|-|
98
+ |AGIE|3-5-shot|-|42.1|-|-|-|
99
+ |MATH|4-shot, CoT|-|24.2|40.2|-|+66.12%|
100
+ |GSM8K|8-shot, CoT|-|38.4|77.71|-|+102.37%|
101
+ |GPQA Diamond|5-shot, CoT|-|15|31.81|-|+112.07%|
102
+ |MBPP|3-shot|-|46|60.3|-|+31.09%|
103
+ |HumanE|0-shot|-|36|68.3|-|+89.72%|
104
+ |IFEval|-|80.2|90.2|74.02|-7.71%|-17.94%|
105
+ |||||**Average**|**+22.04%**|**+16.93%**|
106
+
107
+ ### Comparison to the Llama series by Meta
108
+
109
+ #### Llama 3
110
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Llama 3 technical report](https://arxiv.org/abs/2407.21783).
111
+
112
+ |Benchmark|Metric|Llama 3 8B|Motif 2.6B|Improvement|
113
+ |---|---|---|---|---|
114
+ |MMLU|5-shot|69.4|57.93|-16.53%|
115
+ |MMLU|0-shot, CoT|73|57.95|-20.62%|
116
+ |MMLU-Pro|5-shot, CoT|48.3|-|-|
117
+ |IFEval|-|80.4|74.02|-7.94%|
118
+ |HumanEval|0-shot|72.6|68.3|-5.92%|
119
+ |MBPP|0-shot|72.8|57.93|-20.43%|
120
+ |GSM8K|8-shot, CoT|84.5|77.71|-8.04%|
121
+ |MATH|0-shot, CoT|51.9|49.68|-4.28%|
122
+ |ARC Challenge|0-shot|83.4|74.2|-11.03%|
123
+ |GPQA|0-shot, CoT|32.8|18.53|-43.51%|
124
+ ||||**Average**|**-15.36%**|
125
+
126
+ #### Llama 3.2
127
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Llama 3.2 official blog](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/).
128
+
129
+ |Benchmark|Metric|Llama 3.2 1B|Llama 3.2 3B|Motif 2.6B|Improvement(over 1B)|Improvement(over 3B)|
130
+ |---|---|---|---|---|---|---|
131
+ |MMLU|0-shot|49.3|63.4|57.6|+16.75%|-9.21%|
132
+ |Open-rewrite eval*|0-shot, rougeL|41.6|40.1|-|-|-|
133
+ |TLDR9+|test, 1-shot, rougeL|16.8|19|-|-|-|
134
+ |IFEval|-|59.5|77.4|74.02|+24.40%|-4.37%|
135
+ |GSM8K|8-shot, CoT|44.4|77.7|74.9|+68.69%|-3.60%|
136
+ |MATH|0-shot, CoT|30.6|48|49.68|+62.35%|+3.50%|
137
+ |ARC Challenge|0-shot|59.4|78.6|74.2|+24.92%|-5.6%|
138
+ |GPQA|0-shot|27.2|32.8|25.45|-6.43%|-22.41%|
139
+ |Hellaswag|0-shot|41.2|69.8|61.35|+48.91%|-12.11%|
140
+ |||||**Average**|**+39.42%**|**-3.86%**|
141
+
142
+ ### Comparison to the Phi series by Microsoft
143
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Phi-3 technical report](https://arxiv.org/abs/2404.14219).
144
+
145
+ |Benchmark|Metric|Phi-3 3.8B|Phi-3 7B|Phi-2 2.7B|Motif 2.6B|Improvement(over 3.8B)|Improvement(over 7B)|Improvement(over 2.7B)|
146
+ |---|---|---|---|---|---|---|---|---|
147
+ |MMLU|5-shot|68.8|75.7|56.3|57.93|-15.80%|-23.47%|+2.90%|
148
+ |HellaSwag|5-shot|76.7|77|53.6|68.97|-10.08%|-10.43%|+28.68%|
149
+ |ANLI|7-shot|52.8|58.1|42.5|47.99|-9.11%|-17.40%|+12.92%|
150
+ |GSM-8K|8-shot, CoT|82.5|89.6|61.1|76.5|-7.27%|-14.62%|+25.20%|
151
+ |MATH|0-shot, CoT|41.3|34.6|-|49.68|+20.29%|+43.58%|-|
152
+ |MedQA|2-shot|53.8|65.4|40.9|42.1|-21.75%|-35.63%|+2.93%|
153
+ |AGIEval|0-shot|37.5|45.1|29.8|-|-|-|-|
154
+ |TriviaQA|5-shot|64|58.1|45.2|54.97|-14.11%|-5.39%|+21.62%|
155
+ |Arc-C|10-shot|84.9|90.7|75.9|75.17|-11.46%|-17.12%|-0.96%|
156
+ |Arc-E|10-shot|94.6|97|88.5|88.64|-6.30%|-8.62%|+0.16%|
157
+ |PIQA|5-shot|84.2|86.9|60.2|78.29|-7.02%|-9.91%|+30.05%|
158
+ |SociQA|5-shot|76.6|79.2|68.3|66.73|-12.89%|-15.74%|-2.3%|
159
+ |BigBench-Hard|3-shot, CoT|71.7|79.1|59.4|48.56|-32.27%|-38.61%|-18.25%|
160
+ |WinoGrande|5-shot|70.8|81.5|54.7|67.09|-5.24%|-17.68%|+22.65%|
161
+ |OpenBookQA|10-shot|83.2|88|73.6|87.8|+5.53%|-0.23%|+19.29%|
162
+ |BoolQ|2-shot|77.2|84.8|-|70.7|-8.42%|-16.63%|-|
163
+ |CommonSenseQA|10-shot|80.2|80|69.3|71.25|-11.16%|-10.94%|+2.81%|
164
+ |TruthfulQA|10-shot|65|70.2|-|52.07|-19.89%|-25.83%|-|
165
+ |HumanEval|0-shot|58.5|61|59|68.29|+16.74%|+11.95%|+15.75%|
166
+ |MBPP|3-shot|70|71.7|60.6|60.3|-13.86%|-15.90%|-0.50%|
167
+ |GPQA|2-shot, CoT|32.8|34.3|-|23.44|-28.54%|-31.66%|-|
168
+ |MT Bench|2R. Avg.|8.38|8.7|-|6.77|-19.21%|-22.18%|-|
169
+ ||||||**Average**|**-10.09%**|**-13.45%**|**+10.18%**|
170
+
171
+ ## Evaluation Appendix
172
+
173
+ In the comparisons presented above, Motif 2.6B showed average performance differences of -15.36% and -14.78% relative to Llama 3 8B and Gemma 2 9B, respectively, based on the benchmark scores reported in their original technical reports. However, when the comparison is based on the benchmarks and scores reported in the Qwen2.5 technical report, Motif 2.6B shows an average improvement of +18.55% over Llama 3 8B and +1.12% over Gemma 2 9B. See the table below for details.
174
+
175
+ ### Comparison to Llama 3 8B and Gemma 2 9B based on scores from the *Qwen2.5 technical report*
176
+ The benchmarks and corresponding scores listed in the table below are taken directly from the [Qwen2.5 technical report](https://arxiv.org/abs/2412.15115).
177
+
178
+ |Benchmark|Metric|Llama 3 8B|Gemma 2 9B|Motif 2.6B|Improvement(over Llama 3 8B)|Improvement(over Gemma 2 9B)|
179
+ |---|---|---|---|---|---|---|
180
+ |MMLU|5-shot|66.6|71.3|57.93|-13.02%|-18.75%|
181
+ |MMLU-pro|5-shot|35.4|44.7|28.4|-19.77%|-36.47%|
182
+ |MMLU-redux|5-shot|61.6|67.9|59.54|-3.34%|-12.31%|
183
+ |BBH|3-shot|57.7|68.2|39.28|-31.92%|-42.40%|
184
+ |ARC-C|25-shot|59.3|68.2|75.08|+26.61%|+10.09%|
185
+ |TruthfulQA|0-shot|44|45.3|41.55|-5.56%|-8.27%|
186
+ |Winogrande|5-shot|77.4|79.5|67.09|-13.32%|-15.61%|
187
+ |HellaSwag|10-shot|82.1|81.9|69.88|-14.88%|-14.68%|
188
+ |GPQA|5-shot|25.8|32.8|29.24|+13.33%|-10.85%|
189
+ |TheoremQA|5-shot|22.1|28.9|-|-|-|
190
+ |MATH|4-shot|20.5|37.7|40.2|+96.10%|+6.63%|
191
+ |MMLU-stem|5-shot|55.3|65.1|52.9|-4.34%|-18.74%|
192
+ |GSM8K|4-shot|55.3|70.7|68.84|+24.48%|-2.63%|
193
+ |HumanEval|0-shot|33.5|37.8|68.3|+103.88%|+80.69%|
194
+ |HumanEval+|0-shot|29.3|30.5|62.2|+112.29%|+103.93%|
195
+ |MBPP|0-shot|53.9|62.2|60.3|+11.87%|-3.05%|
196
+ |MBPP+|0-shot|44.4|50.6|50.8|+14.41%|+0.40%|
197
+ |MultiPL-E|0-shot|22.6|34.9|-|-|-|
198
+ |||||**Average**|**+18.55%**|**+1.12%**|
199
+
200
+
201
+ ## How to use
202
+ ```python
203
+ from transformers import AutoModelForCausalLM, AutoTokenizer
204
+
205
+ model = AutoModelForCausalLM.from_pretrained(
206
+ "Motif-Technologies/Motif-2.6B",
207
+ trust_remote_code = True,
208
+ _attn_implementation = "eager", # also supports flash_attention_2
209
+ ).cuda()
210
+
211
+ tokenizer = AutoTokenizer.from_pretrained(
212
+ "Motif-Technologies/Motif-2.6B",
213
+ trust_remote_code = True,
214
+ )
215
+
216
+ query = "What is the capital city of South Korea?"
217
+ input_ids = tokenizer.apply_chat_template(
218
+ [
219
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
220
+ {'role': 'user', 'content': query},
221
+ ],
222
+ add_generation_prompt = True,
223
+ return_tensors='pt',
224
+ ).cuda()
225
+
226
+ output_ids = model.generate(input_ids, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
227
+ output = tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens = True)
228
+ print(output)
229
+
230
+ """
231
+ The capital city of South Korea is Seoul. Located in the southern part of the country, Seoul is not only the largest city in South Korea but also one of the largest metropolitan areas in the world.
232
+ It is a vibrant and dynamic city known for its rich history, cultural heritage, and modern amenities. Seoul is a major economic, cultural, and political center in East Asia, and it plays a crucial role in the region's politics, economy, and culture.
233
+ The city is divided into different administrative districts, each with its own unique characteristics and attractions.
234
+ """
added_tokens.json ADDED
@@ -0,0 +1,127 @@
1
+ {
2
+ "</think>": 219406,
3
+ "<think>": 219404,
4
+ "<|assistant|>": 219402,
5
+ "<|beginoftext|>": 219396,
6
+ "<|dummy_id_100|>": 219505,
7
+ "<|dummy_id_101|>": 219506,
8
+ "<|dummy_id_102|>": 219507,
9
+ "<|dummy_id_103|>": 219508,
10
+ "<|dummy_id_104|>": 219509,
11
+ "<|dummy_id_105|>": 219510,
12
+ "<|dummy_id_106|>": 219511,
13
+ "<|dummy_id_107|>": 219512,
14
+ "<|dummy_id_108|>": 219513,
15
+ "<|dummy_id_109|>": 219514,
16
+ "<|dummy_id_10|>": 219414,
17
+ "<|dummy_id_110|>": 219515,
18
+ "<|dummy_id_111|>": 219516,
19
+ "<|dummy_id_112|>": 219517,
20
+ "<|dummy_id_113|>": 219518,
21
+ "<|dummy_id_114|>": 219519,
22
+ "<|dummy_id_11|>": 219415,
23
+ "<|dummy_id_12|>": 219417,
24
+ "<|dummy_id_13|>": 219418,
25
+ "<|dummy_id_14|>": 219419,
26
+ "<|dummy_id_15|>": 219420,
27
+ "<|dummy_id_16|>": 219421,
28
+ "<|dummy_id_17|>": 219422,
29
+ "<|dummy_id_18|>": 219423,
30
+ "<|dummy_id_19|>": 219424,
31
+ "<|dummy_id_20|>": 219425,
32
+ "<|dummy_id_21|>": 219426,
33
+ "<|dummy_id_22|>": 219427,
34
+ "<|dummy_id_23|>": 219428,
35
+ "<|dummy_id_24|>": 219429,
36
+ "<|dummy_id_25|>": 219430,
37
+ "<|dummy_id_26|>": 219431,
38
+ "<|dummy_id_27|>": 219432,
39
+ "<|dummy_id_28|>": 219433,
40
+ "<|dummy_id_29|>": 219434,
41
+ "<|dummy_id_30|>": 219435,
42
+ "<|dummy_id_31|>": 219436,
43
+ "<|dummy_id_32|>": 219437,
44
+ "<|dummy_id_33|>": 219438,
45
+ "<|dummy_id_34|>": 219439,
46
+ "<|dummy_id_35|>": 219440,
47
+ "<|dummy_id_36|>": 219441,
48
+ "<|dummy_id_37|>": 219442,
49
+ "<|dummy_id_38|>": 219443,
50
+ "<|dummy_id_39|>": 219444,
51
+ "<|dummy_id_3|>": 219407,
52
+ "<|dummy_id_40|>": 219445,
53
+ "<|dummy_id_41|>": 219446,
54
+ "<|dummy_id_42|>": 219447,
55
+ "<|dummy_id_43|>": 219448,
56
+ "<|dummy_id_44|>": 219449,
57
+ "<|dummy_id_45|>": 219450,
58
+ "<|dummy_id_46|>": 219451,
59
+ "<|dummy_id_47|>": 219452,
60
+ "<|dummy_id_48|>": 219453,
61
+ "<|dummy_id_49|>": 219454,
62
+ "<|dummy_id_4|>": 219408,
63
+ "<|dummy_id_50|>": 219455,
64
+ "<|dummy_id_51|>": 219456,
65
+ "<|dummy_id_52|>": 219457,
66
+ "<|dummy_id_53|>": 219458,
67
+ "<|dummy_id_54|>": 219459,
68
+ "<|dummy_id_55|>": 219460,
69
+ "<|dummy_id_56|>": 219461,
70
+ "<|dummy_id_57|>": 219462,
71
+ "<|dummy_id_58|>": 219463,
72
+ "<|dummy_id_59|>": 219464,
73
+ "<|dummy_id_5|>": 219409,
74
+ "<|dummy_id_60|>": 219465,
75
+ "<|dummy_id_61|>": 219466,
76
+ "<|dummy_id_62|>": 219467,
77
+ "<|dummy_id_63|>": 219468,
78
+ "<|dummy_id_64|>": 219469,
79
+ "<|dummy_id_65|>": 219470,
80
+ "<|dummy_id_66|>": 219471,
81
+ "<|dummy_id_67|>": 219472,
82
+ "<|dummy_id_68|>": 219473,
83
+ "<|dummy_id_69|>": 219474,
84
+ "<|dummy_id_6|>": 219410,
85
+ "<|dummy_id_70|>": 219475,
86
+ "<|dummy_id_71|>": 219476,
87
+ "<|dummy_id_72|>": 219477,
88
+ "<|dummy_id_73|>": 219478,
89
+ "<|dummy_id_74|>": 219479,
90
+ "<|dummy_id_75|>": 219480,
91
+ "<|dummy_id_76|>": 219481,
92
+ "<|dummy_id_77|>": 219482,
93
+ "<|dummy_id_78|>": 219483,
94
+ "<|dummy_id_79|>": 219484,
95
+ "<|dummy_id_7|>": 219411,
96
+ "<|dummy_id_80|>": 219485,
97
+ "<|dummy_id_81|>": 219486,
98
+ "<|dummy_id_82|>": 219487,
99
+ "<|dummy_id_83|>": 219488,
100
+ "<|dummy_id_84|>": 219489,
101
+ "<|dummy_id_85|>": 219490,
102
+ "<|dummy_id_86|>": 219491,
103
+ "<|dummy_id_87|>": 219492,
104
+ "<|dummy_id_88|>": 219493,
105
+ "<|dummy_id_89|>": 219494,
106
+ "<|dummy_id_8|>": 219412,
107
+ "<|dummy_id_90|>": 219495,
108
+ "<|dummy_id_91|>": 219496,
109
+ "<|dummy_id_92|>": 219497,
110
+ "<|dummy_id_93|>": 219498,
111
+ "<|dummy_id_94|>": 219499,
112
+ "<|dummy_id_95|>": 219500,
113
+ "<|dummy_id_96|>": 219501,
114
+ "<|dummy_id_97|>": 219502,
115
+ "<|dummy_id_98|>": 219503,
116
+ "<|dummy_id_99|>": 219504,
117
+ "<|dummy_id_9|>": 219413,
118
+ "<|endofprompt|>": 219416,
119
+ "<|endoftext|>": 219395,
120
+ "<|endofturn|>": 219405,
121
+ "<|fim_middle|>": 219398,
122
+ "<|fim_prefix|>": 219397,
123
+ "<|fim_suffix|>": 219399,
124
+ "<|startofturn|>": 219403,
125
+ "<|system|>": 219400,
126
+ "<|user|>": 219401
127
+ }
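The IDs above occupy the top of the 219,520-token vocabulary declared in config.json below; `<|beginoftext|>` and `<|endoftext|>` are the BOS/EOS ids, and `<|endofturn|>` is the second stop token listed in generation_config.json. A small sanity-check sketch (IDs copied from this file, not re-derived):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "Motif-Technologies/Motif-2.6B",
    trust_remote_code=True,
)

# Expected IDs are copied verbatim from added_tokens.json above.
expected = {
    "<|beginoftext|>": 219396,  # bos_token_id in config.json
    "<|endoftext|>": 219395,    # eos_token_id in config.json
    "<|endofturn|>": 219405,    # second eos id in generation_config.json
    "<think>": 219404,
    "</think>": 219406,
}

for token, token_id in expected.items():
    assert tokenizer.convert_tokens_to_ids(token) == token_id, token
```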
config.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "absolute_position_embedding": false,
3
+ "architectures": [
4
+ "MotifForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_motif.MotifConfig",
9
+ "AutoModelForCausalLM": "modeling_motif.MotifForCausalLM"
10
+ },
11
+ "bos_token_id": 219396,
12
+ "eos_token_id": 219395,
13
+ "hidden_act": "poly_norm",
14
+ "hidden_size": 2048,
15
+ "initializer_range": 2e-05,
16
+ "intermediate_size": 8192,
17
+ "loss_reduction": "mean",
18
+ "max_position_embeddings": 16384,
19
+ "max_window_layers": 28,
20
+ "model_type": "Motif",
21
+ "num_attention_heads": 16,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 16,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 500000.0,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": true,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.46.3",
31
+ "use_bias": false,
32
+ "use_cache": true,
33
+ "use_sliding_window": false,
34
+ "vocab_size": 219520
35
+ }
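A few properties follow directly from these numbers: 16 attention heads over a hidden size of 2048 give a head dimension of 128, and `num_key_value_heads` equals `num_attention_heads`, so attention is plain multi-head rather than grouped-query. A minimal sketch that pulls the remote config and checks this (`AutoConfig` resolves to `MotifConfig` through the `auto_map` entry above):

```python
from transformers import AutoConfig

# trust_remote_code is required because auto_map points at configuration_motif.MotifConfig.
config = AutoConfig.from_pretrained(
    "Motif-Technologies/Motif-2.6B",
    trust_remote_code=True,
)

head_dim = config.hidden_size // config.num_attention_heads
print(config.model_type, head_dim)                                # Motif 128
print(config.num_key_value_heads == config.num_attention_heads)   # True -> MHA, no GQA
print(config.max_position_embeddings, config.rope_theta)          # 16384 500000.0
```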
configuration_motif.py ADDED
@@ -0,0 +1,167 @@
1
+ import math
2
+ from typing import Optional
3
+
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.modeling_rope_utils import rope_config_validation
6
+ from transformers.utils import logging
7
+
8
+ logger = logging.get_logger(__name__)
9
+
10
+
11
+ class MotifConfig(PretrainedConfig):
12
+ r"""
13
+ This is the configuration class to store the configuration of a [`MotifModel`]. It is used to instantiate a
14
+ Motif model according to the specified arguments, defining the model architecture. Instantiating a configuration
15
+ with the defaults will yield a similar configuration to that of
16
+ Motif-102B [moreh/Motif-102B](https://huggingface.co/moreh/Motif-102B).
17
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
18
+ documentation from [`PretrainedConfig`] for more information.
19
+ Args:
20
+ vocab_size (`int`, *optional*, defaults to 151936):
21
+ Vocabulary size of the Motif model. Defines the number of different tokens that can be represented by the
22
+ `inputs_ids` passed when calling [`MotifModel`]
23
+ hidden_size (`int`, *optional*, defaults to 4096):
24
+ Dimension of the hidden representations.
25
+ intermediate_size (`int`, *optional*, defaults to 22016):
26
+ Dimension of the MLP representations.
27
+ num_hidden_layers (`int`, *optional*, defaults to 32):
28
+ Number of hidden layers in the Transformer encoder.
29
+ num_attention_heads (`int`, *optional*, defaults to 32):
30
+ Number of attention heads for each attention layer in the Transformer encoder.
31
+ num_key_value_heads (`int`, *optional*, defaults to 32):
32
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
33
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
34
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
35
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
36
+ by meanpooling all the original heads within that group. For more details checkout [this
37
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
38
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
39
+ The non-linear activation function (function or string) in the decoder.
40
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
41
+ The maximum sequence length that this model might ever be used with.
42
+ initializer_range (`float`, *optional*, defaults to 0.02):
43
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
44
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
45
+ The epsilon used by the rms normalization layers.
46
+ use_cache (`bool`, *optional*, defaults to `True`):
47
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
48
+ relevant if `config.is_decoder=True`.
49
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
50
+ Whether the model's input and output word embeddings should be tied.
51
+ rope_theta (`float`, *optional*, defaults to 10000.0):
52
+ The base period of the RoPE embeddings.
53
+ rope_scaling (`Dict`, *optional*):
54
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
55
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
56
+ accordingly.
57
+ Expected contents:
58
+ `rope_type` (`str`):
59
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
60
+ 'llama3'], with 'default' being the original RoPE implementation.
61
+ `factor` (`float`, *optional*):
62
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
63
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
64
+ original maximum pre-trained length.
65
+ `original_max_position_embeddings` (`int`, *optional*):
66
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
67
+ pretraining.
68
+ `attention_factor` (`float`, *optional*):
69
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
70
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
71
+ `factor` field to infer the suggested value.
72
+ `beta_fast` (`float`, *optional*):
73
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
74
+ ramp function. If unspecified, it defaults to 32.
75
+ `beta_slow` (`float`, *optional*):
76
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
77
+ ramp function. If unspecified, it defaults to 1.
78
+ `short_factor` (`List[float]`, *optional*):
79
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
80
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
81
+ size divided by the number of attention heads divided by 2
82
+ `long_factor` (`List[float]`, *optional*):
83
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
84
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
85
+ size divided by the number of attention heads divided by 2
86
+ `low_freq_factor` (`float`, *optional*):
87
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
88
+ `high_freq_factor` (`float`, *optional*):
89
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
90
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
91
+ Whether to use sliding window attention.
92
+ sliding_window (`int`, *optional*, defaults to 4096):
93
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
94
+ max_window_layers (`int`, *optional*, defaults to 28):
95
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
96
+ attention_dropout (`float`, *optional*, defaults to 0.0):
97
+ The dropout ratio for the attention probabilities.
98
+ ```python
99
+ >>> from transformers import MotifModel, MotifConfig
100
+ >>> # Initializing a Motif style configuration
101
+ >>> configuration = MotifConfig()
102
+ >>> # Initializing a model from the Motif-102B style configuration
103
+ >>> model = MotifModel(configuration)
104
+ >>> # Accessing the model configuration
105
+ >>> configuration = model.config
106
+ ```"""
107
+
108
+ model_type = "Motif"
109
+ keys_to_ignore_at_inference = ["past_key_values"]
110
+
111
+ def __init__(
112
+ self,
113
+ vocab_size=151936,
114
+ hidden_size=4096,
115
+ intermediate_size=22016,
116
+ num_hidden_layers=32,
117
+ num_attention_heads=32,
118
+ num_key_value_heads=32,
119
+ hidden_act="silu",
120
+ max_position_embeddings=32768,
121
+ initializer_range=0.02,
122
+ rms_norm_eps=1e-6,
123
+ use_cache=True,
124
+ tie_word_embeddings=False,
125
+ rope_theta=10000.0,
126
+ rope_scaling=None,
127
+ use_sliding_window=False,
128
+ sliding_window=4096,
129
+ max_window_layers=28,
130
+ attention_dropout=0.0,
131
+ **kwargs,
132
+ ):
133
+
134
+ self.vocab_size = vocab_size
135
+ self.max_position_embeddings = max_position_embeddings
136
+ self.hidden_size = hidden_size
137
+ self.intermediate_size = intermediate_size
138
+ self.num_hidden_layers = num_hidden_layers
139
+ self.num_attention_heads = num_attention_heads
140
+ self.use_sliding_window = use_sliding_window
141
+ self.sliding_window = sliding_window if use_sliding_window else None
142
+ self.max_window_layers = max_window_layers
143
+
144
+ # for backward compatibility
145
+ if num_key_value_heads is None:
146
+ num_key_value_heads = num_attention_heads
147
+
148
+ self.num_key_value_heads = num_key_value_heads
149
+ self.hidden_act = hidden_act
150
+ self.initializer_range = initializer_range
151
+ self.rms_norm_eps = rms_norm_eps
152
+ self.use_cache = use_cache
153
+ self.rope_theta = rope_theta
154
+ self.rope_scaling = rope_scaling
155
+ self.attention_dropout = attention_dropout
156
+
157
+ # Validate the correctness of rotary position embeddings parameters
158
+ # BC: if there is a 'type' field, move it to 'rope_type'.
159
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
160
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
161
+ rope_config_validation(self)
162
+
163
+ super().__init__(
164
+ tie_word_embeddings=tie_word_embeddings,
165
+ **kwargs,
166
+ )
167
+ logger.info(f' kwargs : {kwargs}')
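Note that several fields present in config.json (for example `use_bias`, `loss_reduction` and `absolute_position_embedding`) are not named parameters of `__init__` and therefore travel through `**kwargs` into `PretrainedConfig`, which is what the final `logger.info` line surfaces. A hedged sketch of constructing the config locally with the released checkpoint's values, assuming this file is importable from the working directory:

```python
from configuration_motif import MotifConfig  # assumes configuration_motif.py is on the path

# Values mirror the released config.json; keys without an explicit __init__
# parameter (such as use_bias) fall through to **kwargs.
config = MotifConfig(
    vocab_size=219520,
    hidden_size=2048,
    intermediate_size=8192,
    num_hidden_layers=32,
    num_attention_heads=16,
    num_key_value_heads=16,
    hidden_act="poly_norm",
    max_position_embeddings=16384,
    rope_theta=500000.0,
    tie_word_embeddings=True,
    use_bias=False,
)
print(config.use_bias)  # PretrainedConfig stores unknown kwargs as attributes
```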
generation_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 219396,
4
+ "eos_token_id": [
5
+ 219395,
6
+ 219405
7
+ ],
8
+ "transformers_version": "4.51.3",
9
+ "use_cache": true
10
+ }
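Because `eos_token_id` is a list here, generation stops at whichever of `<|endoftext|>` (219395) or `<|endofturn|>` (219405) appears first; `generate()` picks this up automatically when the model is loaded with `from_pretrained`. A small sketch that passes the same list explicitly, reusing `model`, `tokenizer` and `input_ids` from the README example above:

```python
# Equivalent to relying on the stored generation_config.json; shown only to make
# the two stop tokens explicit.
output_ids = model.generate(
    input_ids,
    max_new_tokens=128,
    eos_token_id=[219395, 219405],   # <|endoftext|>, <|endofturn|>
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))
```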
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3585a3263814f762598b6fc4464430d61b069742fa543e118d78bbefe01da08
3
+ size 4952662512
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9380f28b7892e62f48fbd07e0d533ddc0fee44a7ba0b6111408a8a49e577996
3
+ size 4966459400
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db220fa89bb483e4f8c029335034beb866b1f323adcac350f3d77fe546cad46c
3
+ size 469808712
model.safetensors.index.json ADDED
@@ -0,0 +1,521 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 10388873728
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
19
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.0.self_attn.subln.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.1.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
32
+ "model.layers.1.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
33
+ "model.layers.1.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
34
+ "model.layers.1.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
35
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
37
+ "model.layers.1.self_attn.subln.weight": "model-00001-of-00003.safetensors",
38
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
41
+ "model.layers.10.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.10.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
48
+ "model.layers.10.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
49
+ "model.layers.10.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
50
+ "model.layers.10.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
51
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
52
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
53
+ "model.layers.10.self_attn.subln.weight": "model-00001-of-00003.safetensors",
54
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
55
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.11.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
57
+ "model.layers.11.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
60
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
61
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.11.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
64
+ "model.layers.11.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
65
+ "model.layers.11.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
66
+ "model.layers.11.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
67
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
69
+ "model.layers.11.self_attn.subln.weight": "model-00001-of-00003.safetensors",
70
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.12.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
73
+ "model.layers.12.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.12.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
80
+ "model.layers.12.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
81
+ "model.layers.12.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
82
+ "model.layers.12.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
83
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.12.self_attn.subln.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.13.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
89
+ "model.layers.13.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.13.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
96
+ "model.layers.13.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
97
+ "model.layers.13.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
98
+ "model.layers.13.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
99
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.13.self_attn.subln.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.14.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
105
+ "model.layers.14.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.14.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
112
+ "model.layers.14.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
113
+ "model.layers.14.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
114
+ "model.layers.14.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
115
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.14.self_attn.subln.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.15.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
121
+ "model.layers.15.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
122
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
124
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
125
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.15.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
128
+ "model.layers.15.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
129
+ "model.layers.15.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
130
+ "model.layers.15.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
131
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.15.self_attn.subln.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.16.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
137
+ "model.layers.16.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.16.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
144
+ "model.layers.16.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
145
+ "model.layers.16.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
146
+ "model.layers.16.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
147
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.16.self_attn.subln.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.17.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
153
+ "model.layers.17.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
154
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
155
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
157
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
158
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.17.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
160
+ "model.layers.17.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
161
+ "model.layers.17.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
162
+ "model.layers.17.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
163
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
164
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.17.self_attn.subln.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.18.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
169
+ "model.layers.18.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
170
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
173
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
175
+ "model.layers.18.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
176
+ "model.layers.18.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
177
+ "model.layers.18.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
178
+ "model.layers.18.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
179
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.18.self_attn.subln.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.19.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
185
+ "model.layers.19.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.19.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
192
+ "model.layers.19.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
193
+ "model.layers.19.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
194
+ "model.layers.19.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
195
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
196
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
197
+ "model.layers.19.self_attn.subln.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
199
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
200
+ "model.layers.2.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
201
+ "model.layers.2.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
202
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
203
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
204
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
205
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
206
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
207
+ "model.layers.2.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
208
+ "model.layers.2.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
209
+ "model.layers.2.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
210
+ "model.layers.2.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
211
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
212
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
213
+ "model.layers.2.self_attn.subln.weight": "model-00001-of-00003.safetensors",
214
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
215
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
216
+ "model.layers.20.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
217
+ "model.layers.20.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
218
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
220
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
221
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
222
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
223
+ "model.layers.20.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
224
+ "model.layers.20.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
225
+ "model.layers.20.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
226
+ "model.layers.20.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
227
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
228
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
229
+ "model.layers.20.self_attn.subln.weight": "model-00002-of-00003.safetensors",
230
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.21.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
233
+ "model.layers.21.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.21.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
240
+ "model.layers.21.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
241
+ "model.layers.21.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
242
+ "model.layers.21.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
243
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
245
+ "model.layers.21.self_attn.subln.weight": "model-00002-of-00003.safetensors",
246
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
247
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
248
+ "model.layers.22.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
249
+ "model.layers.22.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
250
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
252
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
253
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
254
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.22.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
256
+ "model.layers.22.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
257
+ "model.layers.22.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
258
+ "model.layers.22.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
259
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
260
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
261
+ "model.layers.22.self_attn.subln.weight": "model-00002-of-00003.safetensors",
262
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
263
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
264
+ "model.layers.23.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
265
+ "model.layers.23.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
266
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
267
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
268
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
269
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
270
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
271
+ "model.layers.23.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
272
+ "model.layers.23.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
273
+ "model.layers.23.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
274
+ "model.layers.23.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
275
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
276
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
277
+ "model.layers.23.self_attn.subln.weight": "model-00002-of-00003.safetensors",
278
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
279
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
280
+ "model.layers.24.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
281
+ "model.layers.24.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
282
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
283
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
284
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
285
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
286
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
287
+ "model.layers.24.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
288
+ "model.layers.24.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
289
+ "model.layers.24.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
290
+ "model.layers.24.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
291
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
292
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
293
+ "model.layers.24.self_attn.subln.weight": "model-00002-of-00003.safetensors",
294
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
295
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
296
+ "model.layers.25.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
297
+ "model.layers.25.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
298
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
300
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
302
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.25.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
304
+ "model.layers.25.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
305
+ "model.layers.25.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
306
+ "model.layers.25.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
307
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
308
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.25.self_attn.subln.weight": "model-00002-of-00003.safetensors",
310
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
312
+ "model.layers.26.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
313
+ "model.layers.26.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
314
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
315
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
316
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
317
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
318
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
319
+ "model.layers.26.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
320
+ "model.layers.26.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
321
+ "model.layers.26.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
322
+ "model.layers.26.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
323
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
324
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
325
+ "model.layers.26.self_attn.subln.weight": "model-00002-of-00003.safetensors",
326
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
327
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
328
+ "model.layers.27.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
329
+ "model.layers.27.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
330
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
331
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
332
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
333
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
334
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
335
+ "model.layers.27.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
336
+ "model.layers.27.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
337
+ "model.layers.27.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
338
+ "model.layers.27.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
339
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
340
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
341
+ "model.layers.27.self_attn.subln.weight": "model-00002-of-00003.safetensors",
342
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
343
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
344
+ "model.layers.28.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
345
+ "model.layers.28.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
346
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
347
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
348
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
349
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
350
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
351
+ "model.layers.28.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
352
+ "model.layers.28.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
353
+ "model.layers.28.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
354
+ "model.layers.28.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
355
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
356
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
357
+ "model.layers.28.self_attn.subln.weight": "model-00002-of-00003.safetensors",
358
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
359
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
360
+ "model.layers.29.mlp.act_fn.bias": "model-00002-of-00003.safetensors",
361
+ "model.layers.29.mlp.act_fn.weight": "model-00002-of-00003.safetensors",
362
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
363
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
364
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
365
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
366
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
367
+ "model.layers.29.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
368
+ "model.layers.29.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
369
+ "model.layers.29.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
370
+ "model.layers.29.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
371
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
372
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
373
+ "model.layers.29.self_attn.subln.weight": "model-00002-of-00003.safetensors",
374
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
375
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
376
+ "model.layers.3.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
377
+ "model.layers.3.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
378
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
379
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.3.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
384
+ "model.layers.3.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
385
+ "model.layers.3.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
386
+ "model.layers.3.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
387
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
388
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.3.self_attn.subln.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
392
+ "model.layers.30.mlp.act_fn.bias": "model-00003-of-00003.safetensors",
393
+ "model.layers.30.mlp.act_fn.weight": "model-00003-of-00003.safetensors",
394
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
395
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
396
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
397
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
398
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
399
+ "model.layers.30.self_attn.lambda_k1": "model-00002-of-00003.safetensors",
400
+ "model.layers.30.self_attn.lambda_k2": "model-00002-of-00003.safetensors",
401
+ "model.layers.30.self_attn.lambda_q1": "model-00002-of-00003.safetensors",
402
+ "model.layers.30.self_attn.lambda_q2": "model-00002-of-00003.safetensors",
403
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
404
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
405
+ "model.layers.30.self_attn.subln.weight": "model-00002-of-00003.safetensors",
406
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
407
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
408
+ "model.layers.31.mlp.act_fn.bias": "model-00003-of-00003.safetensors",
409
+ "model.layers.31.mlp.act_fn.weight": "model-00003-of-00003.safetensors",
410
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
411
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
412
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
413
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
414
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
415
+ "model.layers.31.self_attn.lambda_k1": "model-00003-of-00003.safetensors",
416
+ "model.layers.31.self_attn.lambda_k2": "model-00003-of-00003.safetensors",
417
+ "model.layers.31.self_attn.lambda_q1": "model-00003-of-00003.safetensors",
418
+ "model.layers.31.self_attn.lambda_q2": "model-00003-of-00003.safetensors",
419
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
420
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
421
+ "model.layers.31.self_attn.subln.weight": "model-00003-of-00003.safetensors",
422
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
423
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
424
+ "model.layers.4.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
425
+ "model.layers.4.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
426
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
427
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
428
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
429
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
430
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
431
+ "model.layers.4.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
432
+ "model.layers.4.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
433
+ "model.layers.4.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
434
+ "model.layers.4.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
435
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
436
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
437
+ "model.layers.4.self_attn.subln.weight": "model-00001-of-00003.safetensors",
438
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
439
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
440
+ "model.layers.5.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
441
+ "model.layers.5.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
442
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
443
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
444
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
445
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
446
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
447
+ "model.layers.5.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
448
+ "model.layers.5.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
449
+ "model.layers.5.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
450
+ "model.layers.5.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
451
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
452
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
453
+ "model.layers.5.self_attn.subln.weight": "model-00001-of-00003.safetensors",
454
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
455
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
456
+ "model.layers.6.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
457
+ "model.layers.6.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
458
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
459
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
460
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
461
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
462
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
463
+ "model.layers.6.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
464
+ "model.layers.6.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
465
+ "model.layers.6.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
466
+ "model.layers.6.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
467
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
468
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
469
+ "model.layers.6.self_attn.subln.weight": "model-00001-of-00003.safetensors",
470
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
471
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
472
+ "model.layers.7.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
473
+ "model.layers.7.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
474
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
475
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
476
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
477
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
478
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
479
+ "model.layers.7.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
480
+ "model.layers.7.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
481
+ "model.layers.7.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
482
+ "model.layers.7.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
483
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
484
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
485
+ "model.layers.7.self_attn.subln.weight": "model-00001-of-00003.safetensors",
486
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
487
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
488
+ "model.layers.8.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
489
+ "model.layers.8.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
490
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
491
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
492
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
493
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
494
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
495
+ "model.layers.8.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
496
+ "model.layers.8.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
497
+ "model.layers.8.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
498
+ "model.layers.8.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
499
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
500
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
501
+ "model.layers.8.self_attn.subln.weight": "model-00001-of-00003.safetensors",
502
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
503
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
504
+ "model.layers.9.mlp.act_fn.bias": "model-00001-of-00003.safetensors",
505
+ "model.layers.9.mlp.act_fn.weight": "model-00001-of-00003.safetensors",
506
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
507
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
508
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
509
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
510
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
511
+ "model.layers.9.self_attn.lambda_k1": "model-00001-of-00003.safetensors",
512
+ "model.layers.9.self_attn.lambda_k2": "model-00001-of-00003.safetensors",
513
+ "model.layers.9.self_attn.lambda_q1": "model-00001-of-00003.safetensors",
514
+ "model.layers.9.self_attn.lambda_q2": "model-00001-of-00003.safetensors",
515
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
516
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
517
+ "model.layers.9.self_attn.subln.weight": "model-00001-of-00003.safetensors",
518
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
519
+ "model.norm.weight": "model-00003-of-00003.safetensors"
520
+ }
521
+ }
modeling_motif.py ADDED
@@ -0,0 +1,1378 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.utils.checkpoint
8
+ from torch import nn
9
+ from torch.nn import CrossEntropyLoss
10
+ from transformers.activations import ACT2CLS as _ACT2CLS
11
+ from transformers.activations import ClassInstantier
12
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
13
+ from transformers.generation import GenerationMixin
14
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
15
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
16
+ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
17
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
20
+ from transformers.utils import (add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available,
21
+ is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings)
22
+
23
+ from .configuration_motif import MotifConfig
24
+
25
+
26
+ class PolyNorm(torch.nn.Module):
27
+ """
28
+ A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
29
+ The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md
30
+ """
31
+
32
+ def __init__(self, eps=1e-6):
33
+ super(PolyNorm, self).__init__()
34
+ self.weight = torch.nn.Parameter(torch.ones(3) / 3)
35
+ self.bias = torch.nn.Parameter(torch.zeros(1))
36
+ self.eps = eps
37
+
38
+ def _norm(self, x):
39
+ return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
40
+
41
+ def forward(self, x):
42
+ return self.weight[0] * self._norm(x ** 3) + self.weight[1] * self._norm(
43
+ x ** 2) + self.weight[2] * self._norm(x) + self.bias
44
+
45
+
46
+ CUSTOM_ACT2CLS = {"poly_norm": PolyNorm}
47
+ ACT2CLS = {**_ACT2CLS, **CUSTOM_ACT2CLS}
48
+ ACT2FN = ClassInstantier(ACT2CLS)
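A minimal usage sketch for the custom `poly_norm` activation registered above (the tensor shape is an arbitrary assumption; it relies only on the definitions in this file plus `torch`):

    import torch
    act = ACT2FN["poly_norm"]      # ClassInstantier builds a fresh PolyNorm()
    x = torch.randn(2, 4, 8)       # (batch, seq, hidden) -- illustrative shape only
    y = act(x)                     # weighted sum of RMS-normalized x, x**2, x**3 plus a bias
    assert y.shape == x.shape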
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ if is_flash_attn_2_available():
53
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
54
+
55
+ _CONFIG_FOR_DOC = "MotifConfig"
56
+
57
+
58
+ class MotifRMSNorm(nn.Module):
59
+
60
+ def __init__(self, hidden_size, eps=1e-6):
61
+ """
62
+ MotifRMSNorm is equivalent to T5LayerNorm
63
+ """
64
+ super().__init__()
65
+ self.weight = nn.Parameter(torch.ones(hidden_size))
66
+ self.variance_epsilon = eps
67
+
68
+ def forward(self, hidden_states):
69
+ input_dtype = hidden_states.dtype
70
+ hidden_states = hidden_states.to(torch.float32)
71
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
72
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
73
+ return self.weight * hidden_states.to(input_dtype)
74
+
75
+ def extra_repr(self):
76
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
77
+
78
+
79
+ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
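A quick sanity-check sketch for `MotifRMSNorm` as defined above (hidden size and batch shape are arbitrary assumptions):

    import torch
    norm = MotifRMSNorm(hidden_size=8, eps=1e-6)
    h = torch.randn(2, 3, 8)
    out = norm(h)                  # with the default all-ones weight, each vector is scaled to unit RMS
    assert out.shape == h.shape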
80
+
81
+
82
+ class MotifRotaryEmbeddingWithCache(nn.Module):
83
+ """
84
+ Rotary positional embedding module with caching for efficiency.
85
+
86
+ Args:
87
+ dim (int): Dimensionality of the embedding.
88
+ max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
89
+ base (int): Base for computing inverse frequency. Default is 10000.
90
+ device (torch.device, optional): Device for tensor storage.
91
+
92
+ Methods:
93
+ forward(x, seq_len=None):
94
+ Computes cosine and sine embeddings for input sequence length.
95
+ Automatically updates cache if `seq_len` exceeds cached length.
96
+
97
+ Attributes:
98
+ inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
99
+ cos_cached (torch.Tensor): Cached cosine embeddings.
100
+ sin_cached (torch.Tensor): Cached sine embeddings.
101
+ """
102
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
103
+ super().__init__()
104
+
105
+ self.dim = dim
106
+ self.max_position_embeddings = max_position_embeddings
107
+ self.base = base
108
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
109
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
110
+
111
+ self._set_cos_sin_cache(seq_len=max_position_embeddings,
112
+ device=self.inv_freq.device,
113
+ dtype=torch.get_default_dtype())
114
+
115
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
116
+ self.max_seq_len_cached = seq_len
117
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
118
+
119
+ freqs = torch.outer(t, self.inv_freq)
120
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
121
+ emb = torch.cat((freqs, freqs), dim=-1)
122
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
123
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
124
+
125
+ def forward(self, x, seq_len=None):
126
+ # x: [bs, num_attention_heads, seq_len, head_size]
127
+ if seq_len > self.max_seq_len_cached:
128
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
129
+
130
+ return (
131
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
132
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
133
+ )
134
+
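A short usage sketch for the cached rotary embedding above; the dimensions are illustrative assumptions, and asking for a sequence longer than `max_position_embeddings` exercises the cache refresh in `forward`:

    import torch
    rope = MotifRotaryEmbeddingWithCache(dim=16, max_position_embeddings=32)
    x = torch.randn(1, 2, 48, 16)       # (bsz, num_heads, seq_len, head_dim)
    cos, sin = rope(x, seq_len=48)      # 48 > 32, so the cos/sin caches are recomputed
    assert cos.shape == sin.shape == (48, 16)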
135
+
136
+ class MotifRotaryEmbedding(nn.Module):
137
+
138
+ def __init__(
139
+ self,
140
+ dim=None,
141
+ max_position_embeddings=2048,
142
+ base=10000,
143
+ device=None,
144
+ scaling_factor=1.0,
145
+ rope_type="default",
146
+ config: Optional[MotifConfig] = None,
147
+ ):
148
+ super().__init__()
149
+ # TODO (joao): remove the `if` below, only used for BC
150
+ self.rope_kwargs = {}
151
+ if config is None:
152
+ logger.warning_once(
153
+ "`MotifRotaryEmbedding` can now be fully parameterized by passing the model config through the "
154
+ "`config` argument. All other arguments will be removed in v4.46")
155
+ self.rope_kwargs = {
156
+ "rope_type": rope_type,
157
+ "factor": scaling_factor,
158
+ "dim": dim,
159
+ "base": base,
160
+ "max_position_embeddings": max_position_embeddings,
161
+ }
162
+ self.rope_type = rope_type
163
+ self.max_seq_len_cached = max_position_embeddings
164
+ self.original_max_seq_len = max_position_embeddings
165
+ else:
166
+ if config.rope_scaling is not None:
167
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
168
+ else:
169
+ self.rope_type = "default"
170
+ self.max_seq_len_cached = config.max_position_embeddings
171
+ self.original_max_seq_len = config.max_position_embeddings
172
+
173
+ self.config = config
174
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
175
+
176
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
177
+
178
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
179
+ self.original_inv_freq = self.inv_freq
180
+
181
+ def _dynamic_frequency_update(self, position_ids, device):
182
+ """
183
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
184
+ 1 - growing beyond the cached sequence length (allow scaling)
185
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
186
+ """
187
+ seq_len = torch.max(position_ids) + 1
188
+ if seq_len > self.max_seq_len_cached: # growth
189
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config,
190
+ device,
191
+ seq_len=seq_len,
192
+ **self.rope_kwargs)
193
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
194
+ self.max_seq_len_cached = seq_len
195
+
196
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
197
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
198
+ self.max_seq_len_cached = self.original_max_seq_len
199
+
200
+ @torch.no_grad()
201
+ def forward(self, x, position_ids):
202
+ if "dynamic" in self.rope_type:
203
+ self._dynamic_frequency_update(position_ids, device=x.device)
204
+
205
+ # Core RoPE block
206
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
207
+ position_ids_expanded = position_ids[:, None, :].float()
208
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
209
+ device_type = x.device.type
210
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
211
+ with torch.autocast(device_type=device_type, enabled=False):
212
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
213
+ emb = torch.cat((freqs, freqs), dim=-1)
214
+ cos = emb.cos()
215
+ sin = emb.sin()
216
+
217
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
218
+ cos = cos * self.attention_scaling
219
+ sin = sin * self.attention_scaling
220
+
221
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
222
+
223
+
224
+ def rotate_half(x):
225
+ """
226
+ Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
227
+
228
+ Args:
229
+ x (torch.Tensor): The input tensor.
230
+
231
+ Returns:
232
+ torch.Tensor: A tensor where the latter half of the dimensions are negated
233
+ and moved before the first half.
234
+ """
235
+ half_size = x.shape[-1] // 2
236
+ rotated_tensor = torch.roll(x, shifts=-half_size, dims=-1)
237
+ rotated_tensor[..., :half_size] *= -1
238
+
239
+ return rotated_tensor
240
+
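The roll-and-negate form above is equivalent to the more common split-and-concatenate formulation; a small equivalence sketch (the reference function here is written only for comparison and is not part of the model):

    import torch
    def rotate_half_reference(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)
    x = torch.randn(2, 3, 8)
    assert torch.allclose(rotate_half(x), rotate_half_reference(x))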
241
+
242
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
243
+ """
244
+ Applies rotary position embeddings to the input tensors.
245
+
246
+ Args:
247
+ q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
248
+ k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
249
+ cos (torch.Tensor): Cosine values for rotary embedding.
250
+ sin (torch.Tensor): Sine values for rotary embedding.
251
+ unsqueeze_dim (int, optional): Dimension along which `cos` and `sin` are unsqueezed.
252
+ Defaults to 1.
253
+
254
+ Returns:
255
+ Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
256
+ """
257
+ '''
258
+ # (B, NH, S, D_KV) -> (B, S, NH, D_KV)
259
+ cos = cos.unsqueeze(unsqueeze_dim)
260
+ sin = sin.unsqueeze(unsqueeze_dim)
261
+ q_embed = (q * cos) + (rotate_half(q) * sin)
262
+ k_embed = (k * cos) + (rotate_half(k) * sin)
263
+ '''
264
+ device = q.device
265
+ return map(
266
+ lambda x: (x * cos[position_ids].unsqueeze(unsqueeze_dim).to(device)) +
267
+ (rotate_half(x) * sin[position_ids].unsqueeze(unsqueeze_dim).to(device)), (q, k))
268
+
269
+
270
+ class MotifMLP(nn.Module):
271
+
272
+ def __init__(self, config):
273
+ super().__init__()
274
+ self.hidden_size = config.hidden_size
275
+ self.intermediate_size = config.intermediate_size
276
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
277
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
278
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
279
+ self.act_fn = ACT2FN[config.hidden_act]
280
+
281
+ def forward(self, hidden_state):
282
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
283
+
284
+
285
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
286
+
287
+
288
+ """
289
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
290
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
291
+
292
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
293
+ if n_rep == 1:
294
+ return hidden_states
295
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
296
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
297
+ """
298
+
299
+ return torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep)
300
+
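A shape-level sketch of `repeat_kv` expanding grouped key/value heads (the sizes are illustrative assumptions):

    import torch
    kv = torch.randn(1, 2, 5, 16)       # (batch, num_key_value_heads, seq_len, head_dim)
    expanded = repeat_kv(kv, n_rep=4)   # each KV head is repeated 4 times along dim=1
    assert expanded.shape == (1, 8, 5, 16)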
301
+
302
+ class MotifAttention(nn.Module):
303
+ """
304
+ Differential Attention (DiffAttention) module.
305
+
306
+ Implements the Differential Attention from
307
+ "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
308
+
309
+ Overview
310
+ Standard transformers often over-allocate attention to irrelevant context.
311
+ DiffAttention addresses this by computing attention as the difference between
312
+ two separate softmax attention maps, effectively canceling noise and promoting
313
+ sparse, structured attention patterns.
314
+
315
+ Reference Implementation
316
+ https://github.com/microsoft/unilm/tree/master/Diff-Transformer
317
+
318
+ Args
319
+ The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
320
+ λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
321
+ - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
322
+ - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
323
+ - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
324
+
325
+ """
326
+
327
+ def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
328
+ super().__init__()
329
+ self.config = config
330
+ self.layer_idx = layer_idx
331
+ if layer_idx is None:
332
+ logger.warning_once(
333
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
334
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
335
+ "when creating this class.")
336
+
337
+
338
+ self.hidden_size = config.hidden_size
339
+ self.num_heads = config.num_attention_heads
340
+ self.head_dim = self.hidden_size // self.num_heads
341
+ self.num_key_value_heads = config.num_key_value_heads
342
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
343
+ self.max_position_embeddings = config.max_position_embeddings
344
+ self.rope_theta = config.rope_theta
345
+ self.is_causal = True
346
+ self.attention_dropout = config.attention_dropout
347
+
348
+ if (self.head_dim * self.num_heads) != self.hidden_size:
349
+ raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
350
+ f" and `num_heads`: {self.num_heads}).")
351
+
352
+ self.num_heads //= 2
353
+ self.num_key_value_heads //= 2
354
+ self.n_rep = self.num_heads // self.num_key_value_heads
355
+
356
+ self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
357
+ self.k_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
358
+ self.v_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
359
+ self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
360
+
361
+ for name in ["lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"]:
362
+ setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
363
+ getattr(self, name).data.normal_(mean=0.0, std=0.1)
364
+
365
+ self.subln = MotifRMSNorm(2 * self.head_dim, eps=1e-5)
366
+ self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
367
+
368
+ self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
369
+ max_position_embeddings=self.max_position_embeddings,
370
+ base=self.rope_theta)
371
+
372
+ def forward(
373
+ self,
374
+ hidden_states: torch.Tensor,
375
+ attention_mask: Optional[torch.Tensor] = None,
376
+ position_ids: Optional[torch.LongTensor] = None,
377
+ past_key_value: Optional[Cache] = None,
378
+ output_attentions: bool = False,
379
+ use_cache: bool = False,
380
+ cache_position: Optional[torch.LongTensor] = None,
381
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
382
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
383
+ bsz, q_len, _ = hidden_states.size()
384
+
385
+ query_states = self.q_proj(hidden_states)
386
+ key_states = self.k_proj(hidden_states)
387
+ value_states = self.v_proj(hidden_states)
388
+
389
+ query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
390
+ key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
391
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, 2 * self.head_dim).transpose(1, 2)
392
+
393
+ kv_seq_len = key_states.shape[-2]
394
+ if position_embeddings is None:
395
+ logger.warning_once(
396
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
397
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
398
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
399
+ "removed and `position_embeddings` will be mandatory.")
400
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
401
+ else:
402
+ cos, sin = (self.rotary_emb(value_states, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
403
+ if use_cache else position_embeddings)
404
+
405
+ query_states, key_states = apply_rotary_pos_emb(query_states,
406
+ key_states,
407
+ cos,
408
+ sin,
409
+ position_ids=position_ids)
410
+
411
+ if past_key_value is not None:
412
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
413
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
414
+
415
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
416
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
417
+
418
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
419
+
420
+ kv_seq_len = key_states.shape[-2]
421
+ offset = kv_seq_len - q_len
422
+
423
+ attention_mask = torch.triu(
424
+ torch.full((q_len, kv_seq_len), float("-inf"), dtype=attn_weights.dtype, device=attn_weights.device),
425
+ 1 + offset)
426
+
427
+ attn_weights = attn_weights + attention_mask
428
+
429
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
430
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
431
+
432
+ lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(attn_weights)
433
+ lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(attn_weights)
434
+ lambda_full = lambda_1 - lambda_2 + self.lambda_init
435
+ attn_weights = attn_weights.view(bsz, self.num_heads, 2, q_len, -1)
436
+ attn_weights = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
437
+
438
+ attn_output = torch.matmul(attn_weights, value_states)
439
+
440
+ attn_output = self.subln(attn_output)
441
+ attn_output = attn_output * (1 - self.lambda_init)
442
+
443
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim * 2):
444
+ raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim * 2)}, but is"
445
+ f" {attn_output.size()}")
446
+
447
+ attn_output = attn_output.transpose(1, 2).contiguous()
448
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
449
+
450
+ attn_output = self.o_proj(attn_output)
451
+
452
+ if not output_attentions:
453
+ attn_weights = None
454
+
455
+ return attn_output, attn_weights, past_key_value
456
+
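A small numeric sketch of the λ re-parameterization described in the class docstring (`head_dim` and `layer_idx` are arbitrary assumptions):

    import math
    import torch
    head_dim, layer_idx = 8, 3
    lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
    lq1, lk1, lq2, lk2 = (torch.randn(head_dim) * 0.1 for _ in range(4))
    # lambda = exp(lq1 . lk1) - exp(lq2 . lk2) + lambda_init, a per-layer scalar
    lambda_full = torch.exp((lq1 * lk1).sum()) - torch.exp((lq2 * lk2).sum()) + lambda_init
    # the two attention maps are then combined as: attn = softmax_1 - lambda_full * softmax_2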
457
+
458
+ class MotifFlashAttention2(MotifAttention):
459
+ """
460
+ Motif flash attention module, following Motif attention module. This module inherits from `MotifAttention`
461
+ as the weights of the module stay untouched. The only required change would be on the forward pass
462
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
463
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
464
+ config.max_window_layers layers.
465
+ """
466
+
467
+ def __init__(self, *args, **kwargs):
468
+ super().__init__(*args, **kwargs)
469
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
470
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is a bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
471
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
472
+
473
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
474
+
475
+ logger.info(f'Flash attention with bottom-right aligned causal mask (flash_attn>=2.1): {not self._flash_attn_uses_top_left_mask}')
476
+
477
+ def _reshape_heads(self, tensor, batch_size, seq_len):
478
+ """2-way head split tensor reshape"""
479
+ return tensor.reshape(batch_size, seq_len, self.num_heads, 2, self.head_dim)
480
+
481
+ def _restore_shape(self, tensor, batch_size, seq_len):
482
+ """restore the (batch, seq, num_heads, head_dim) layout after the 2-way head split"""
483
+ return tensor.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
484
+
485
+ def _compute_attention(self, query_states, key_states, value_states, attention_mask, q_len, position_ids,
486
+ dropout_rate, sliding_window):
487
+ """Run a single Flash Attention 2 forward pass."""
488
+ _input_type = query_states.dtype
489
+ scale_factor = 1.0 / math.sqrt(self.head_dim)
490
+ if not self._flash_attn_uses_top_left_mask:
491
+ causal = self.is_causal
492
+ else:
493
+ causal = self.is_causal and q_len != 1
494
+
495
+ attn_out = _flash_attention_forward(query_states.bfloat16(),
496
+ key_states.bfloat16(),
497
+ value_states.bfloat16(),
498
+ attention_mask,
499
+ q_len,
500
+ position_ids=position_ids,
501
+ dropout=dropout_rate,
502
+ sliding_window=sliding_window,
503
+ is_causal=True,
504
+ softmax_scale=scale_factor,
505
+ use_top_left_mask=self._flash_attn_uses_top_left_mask)
506
+ return attn_out.to(_input_type)
507
+
508
+ def forward(
509
+ self,
510
+ hidden_states: torch.Tensor,
511
+ attention_mask: Optional[torch.Tensor] = None,
512
+ position_ids: Optional[torch.LongTensor] = None,
513
+ past_key_value: Optional[Cache] = None,
514
+ output_attentions: bool = False,
515
+ use_cache: bool = False,
516
+ cache_position: Optional[torch.LongTensor] = None,
517
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
518
+ ):
519
+ bsz, q_len, _ = hidden_states.size()
520
+
521
+ query_states = self.q_proj(hidden_states)
522
+ key_states = self.k_proj(hidden_states)
523
+ value_states = self.v_proj(hidden_states)
524
+
525
+ query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
526
+ key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
527
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, 2 * self.head_dim).transpose(1, 2)
528
+ kv_seq_len = key_states.shape[-2]
529
+ if position_embeddings is None:
530
+ logger.warning_once(
531
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
532
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
533
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
534
+ "removed and `position_embeddings` will be mandatory.")
535
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
536
+ else:
537
+ cos, sin = (self.rotary_emb(value_states, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
538
+ if use_cache else position_embeddings)
539
+
540
+ query_states, key_states = apply_rotary_pos_emb(query_states,
541
+ key_states,
542
+ cos,
543
+ sin,
544
+ position_ids=position_ids)
545
+
546
+ if past_key_value is not None:
547
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
548
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
549
+
550
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
551
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
552
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
553
+
554
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
555
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
556
+ # cast them back to float16 just to be sure everything works as expected.
557
+ input_dtype = query_states.dtype
558
+ if input_dtype == torch.float32:
559
+ if torch.is_autocast_enabled():
560
+ target_dtype = torch.get_autocast_gpu_dtype()
561
+ # Handle the case where the model is quantized
562
+ elif hasattr(self.config, "_pre_quantization_dtype"):
563
+ target_dtype = self.config._pre_quantization_dtype
564
+ else:
565
+ target_dtype = self.q_proj.weight.dtype
566
+
567
+ logger.warning_once(
568
+ f"The input hidden states seem to have been silently cast to float32, this might be related to"
569
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
570
+ f" {target_dtype}.")
571
+
572
+ query_states = query_states.to(target_dtype)
573
+ key_states = key_states.to(target_dtype)
574
+ value_states = value_states.to(target_dtype)
575
+
576
+ q_len = query_states.shape[-2]
577
+ kv_seq_len = key_states.shape[-2]
578
+
579
+ # Reshape to the expected shape for Flash Attention
580
+ query_states = query_states.transpose(1, 2)
581
+ key_states = key_states.transpose(1, 2)
582
+ value_states = value_states.transpose(1, 2)
583
+
584
+ if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
585
+ and self.layer_idx >= self.config.max_window_layers):
586
+ sliding_window = self.config.sliding_window
587
+ else:
588
+ sliding_window = None
589
+
590
+ q = self._reshape_heads(query_states, bsz, q_len)
591
+ k = self._reshape_heads(key_states, bsz, kv_seq_len)
592
+ v = self._reshape_heads(value_states, bsz, kv_seq_len)
593
+
594
+ q1, q2 = q[..., 0, :], q[..., 1, :]
595
+ k1, k2 = k[..., 0, :], k[..., 1, :]
596
+ v1, v2 = v[..., 0, :], v[..., 1, :]
597
+
598
+ q1, q2, k1, k2, v1, v2 = map(lambda x: self._restore_shape(x, bsz, q_len if x is q1 or x is q2 else kv_seq_len),
599
+ (q1, q2, k1, k2, v1, v2))
600
+
601
+ q1, q2 = q1.contiguous(), q2.contiguous()
602
+ k1, k2 = k1.contiguous(), k2.contiguous()
603
+ v1, v2 = v1.contiguous(), v2.contiguous()
604
+
605
+ attn11, attn12 = self._compute_attention(q1, k1, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window), \
606
+ self._compute_attention(q1, k1, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window)
607
+ attn21, attn22 = self._compute_attention(q2, k2, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window), \
608
+ self._compute_attention(q2, k2, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window)
609
+
610
+ attn1, attn2 = torch.cat([attn11, attn12], dim=-1), torch.cat([attn21, attn22], dim=-1)
611
+
612
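+ # Combine the two attention maps following the differential-attention formulation:
+ # lambda_full = exp(lambda_q1 . lambda_k1) - exp(lambda_q2 . lambda_k2) + lambda_init,
+ # attn_output = attn1 - lambda_full * attn2, followed by the `subln` normalization
+ # and the (1 - lambda_init) rescaling.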
+ lambda_q1 = self.lambda_q1.unsqueeze(0).expand([bsz, self.lambda_q1.shape[0]]) # bsz, num_head
613
+ lambda_q2 = self.lambda_q2.unsqueeze(0).expand([bsz, self.lambda_q2.shape[0]]) # bsz, num_head
614
+
615
+ lambda_1 = torch.exp(torch.sum(lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(attn1) # bsz
616
+ lambda_2 = torch.exp(torch.sum(lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(attn2) # bsz
617
+
618
+ lambda_full = lambda_1 - lambda_2 + self.lambda_init
619
+
620
+ attn_output = attn1 - lambda_full.view([bsz, 1, 1, 1]) * attn2
621
+
622
+ attn_output = self.subln(attn_output)
623
+ attn_output = attn_output * (1 - self.lambda_init)
624
+
625
+ if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim * 2):
626
+ raise ValueError(f"`attn_output` should be of size {(bsz, q_len, self.num_heads, 2*self.head_dim)}, but is"
627
+ f" {attn_output.size()}")
628
+
629
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
630
+ attn_output = self.o_proj(attn_output)
631
+
632
+ return attn_output, None, past_key_value
633
+
634
+
635
+ class MotifSdpaAttention(MotifAttention):
636
+ """
637
+ Motif attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
638
+ `MotifAttention`, as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
639
+ SDPA API.
640
+ """
641
+
642
+ def forward(
643
+ self,
644
+ hidden_states: torch.Tensor,
645
+ attention_mask: Optional[torch.Tensor] = None,
646
+ position_ids: Optional[torch.LongTensor] = None,
647
+ past_key_value: Optional[Cache] = None,
648
+ output_attentions: bool = False,
649
+ use_cache: bool = False,
650
+ cache_position: Optional[torch.LongTensor] = None,
651
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
652
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
653
+ if output_attentions:
654
+ logger.warning_once(
655
+ "MotifModel is using MotifSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
656
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
657
+ )
658
+ return super().forward(
659
+ hidden_states=hidden_states,
660
+ attention_mask=attention_mask,
661
+ position_ids=position_ids,
662
+ past_key_value=past_key_value,
663
+ output_attentions=output_attentions,
664
+ use_cache=use_cache,
665
+ )
666
+
667
+ bsz, q_len, _ = hidden_states.size()
668
+
669
+ query_states = self.q_proj(hidden_states)
670
+ key_states = self.k_proj(hidden_states)
671
+ value_states = self.v_proj(hidden_states)
672
+
673
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
674
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
675
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
676
+ kv_seq_len = key_states.shape[-2]
677
+ if position_embeddings is None:
678
+ logger.warning_once(
679
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
680
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
681
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
682
+ "removed and `position_embeddings` will be mandatory.")
683
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
684
+ else:
685
+ cos, sin = position_embeddings
686
+ query_states, key_states = apply_rotary_pos_emb(query_states,
687
+ key_states,
688
+ cos,
689
+ sin)
690
+
691
+ if past_key_value is not None:
692
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
693
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
694
+
695
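+ # Grouped-query attention: keys/values carry `num_key_value_heads` heads, so their
+ # flattened width is `hidden_size // num_key_value_groups`; the grouping itself is
+ # expected to be handled inside `ScaledDotProductAttention` via the `num_kv_groups`
+ # argument passed below.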
+ query_states = query_states.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
696
+ key_states = key_states.transpose(1, 2).reshape(bsz, q_len, self.hidden_size // self.num_key_value_groups)
697
+ value_states = value_states.transpose(1, 2).reshape(bsz, q_len, self.hidden_size // self.num_key_value_groups)
698
+
699
+ batch, query_length, key_length = query_states.size(0), query_states.size(-2), key_states.size(-2)
700
+ masked_bias = attention_mask.expand(batch, self.num_heads, query_length, key_length)
701
+
702
+ # Standard attention scaling: 1 / sqrt(head_dim)
703
+ scale_factor = 1.0
704
+ scale_factor /= float(self.head_dim) ** 0.5
705
+
706
+ attn_output = ScaledDotProductAttention(query_states,
707
+ key_states,
708
+ value_states,
709
+ masked_bias,
710
+ dropout_rate=0.0,
711
+ training=self.training,
712
+ attn_weight_scale_factor=scale_factor,
713
+ num_kv_groups=self.num_key_value_groups,
714
+ recompute_mode=False)
715
+ attn_output = attn_output.to(hidden_states.dtype)
716
+
717
+ attn_output = self.o_proj(attn_output)
718
+
719
+ return attn_output, None, past_key_value
720
+
721
+
722
+ MOTIF_ATTENTION_CLASSES = {
723
+ "eager": MotifAttention,
724
+ "flash_attention_2": MotifFlashAttention2,
725
+ "sdpa": MotifAttention,
726
+ }
727
+
728
+
729
+ class MotifDecoderLayer(nn.Module):
730
+
731
+ def __init__(self, config: MotifConfig, layer_idx: int):
732
+ super().__init__()
733
+ self.hidden_size = config.hidden_size
734
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
735
+ logger.warning_once(
736
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
737
+ "unexpected results may be encountered.")
738
+ self.self_attn = MOTIF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
739
+ self.mlp = MotifMLP(config)
740
+
741
+ self.input_layernorm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
742
+ self.post_attention_layernorm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
743
+
744
+
745
+ def forward(
746
+ self,
747
+ hidden_states: torch.Tensor,
748
+ attention_mask: Optional[torch.Tensor] = None,
749
+ position_ids: Optional[torch.LongTensor] = None,
750
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
751
+ output_attentions: Optional[bool] = False,
752
+ use_cache: Optional[bool] = False,
753
+ cache_position: Optional[torch.LongTensor] = None,
754
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
755
+ **kwargs,
756
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
757
+ """
758
+ Args:
759
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
760
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
761
+ `(batch, sequence_length)` where padding elements are indicated by 0.
762
+ output_attentions (`bool`, *optional*):
763
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
764
+ returned tensors for more detail.
765
+ use_cache (`bool`, *optional*):
766
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
767
+ (see `past_key_values`).
768
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
769
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
770
+ Indices depicting the position of the input sequence tokens in the sequence.
771
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
772
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
773
+ with `head_dim` being the embedding dimension of each attention head.
774
+ kwargs (`dict`, *optional*):
775
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
776
+ into the model
777
+ """
778
+
779
+ residual = hidden_states
780
+
781
+ hidden_states = self.input_layernorm(hidden_states)
782
+
783
+ # Self Attention
784
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
785
+ hidden_states=hidden_states,
786
+ attention_mask=attention_mask,
787
+ position_ids=position_ids,
788
+ past_key_value=past_key_value,
789
+ output_attentions=output_attentions,
790
+ use_cache=use_cache,
791
+ cache_position=cache_position,
792
+ position_embeddings=position_embeddings,
793
+ )
794
+ hidden_states = residual + hidden_states
795
+
796
+ # Fully Connected
797
+ residual = hidden_states
798
+ hidden_states = self.post_attention_layernorm(hidden_states)
799
+ hidden_states = self.mlp(hidden_states)
800
+ hidden_states = residual + hidden_states
801
+
802
+ outputs = (hidden_states, )
803
+
804
+ if output_attentions:
805
+ outputs += (self_attn_weights, )
806
+
807
+ if use_cache:
808
+ outputs += (present_key_value, )
809
+
810
+ return outputs
811
+
812
+
813
+ MOTIF_START_DOCSTRING = r"""
814
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
815
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
816
+ etc.)
817
+
818
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
819
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
820
+ and behavior.
821
+
822
+ Parameters:
823
+ config ([`MotifConfig`]):
824
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
825
+ load the weights associated with the model, only the configuration. Check out the
826
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
827
+ """
828
+
829
+
830
+ @add_start_docstrings(
831
+ "The bare Motif Model outputting raw hidden-states without any specific head on top.",
832
+ MOTIF_START_DOCSTRING,
833
+ )
834
+ class MotifPreTrainedModel(PreTrainedModel):
835
+ config_class = MotifConfig
836
+ base_model_prefix = "model"
837
+ supports_gradient_checkpointing = True
838
+ _no_split_modules = ["MotifDecoderLayer"]
839
+ _skip_keys_device_placement = "past_key_values"
840
+ _supports_flash_attn_2 = True
841
+ _supports_sdpa = True
842
+ _supports_cache_class = True
843
+ _supports_quantized_cache = True
844
+ _supports_static_cache = True
845
+
846
+ def _init_weights(self, module):
847
+ module_std = self.config.initializer_range
848
+ if isinstance(module, nn.Linear):
849
+ module.weight.data.normal_(mean=0.0, std=module_std)
850
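+ # Truncated-normal-style initialization: entries drawn beyond 3 standard deviations
+ # are zeroed out (rather than re-sampled); the same rule is applied to Embedding weights below.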
+ module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
851
+ if module.bias is not None:
852
+ module.bias.data.zero_()
853
+
854
+ elif isinstance(module, nn.Embedding):
855
+ module.weight.data.normal_(mean=0.0, std=module_std)
856
+ module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
857
+ if module.padding_idx is not None:
858
+ module.weight.data[module.padding_idx].zero_()
859
+
860
+
861
+ @dataclass
862
+ class MotifModelOutputWithPast(ModelOutput):
863
+ """
864
+ This augments `BaseModelOutputWithPast` in `transformers.modeling_outputs` with new optional keys: `causal_mask`, `position_embeddings`.
865
+ The optional keys are currently used in the following ways:
866
+ - passing information to the token-wise last attention layers during multi-token training
867
+ """
868
+ last_hidden_state: torch.FloatTensor = None
869
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
870
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
871
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
872
+ causal_mask: Optional[torch.Tensor] = None
873
+ position_embeddings: Optional[torch.FloatTensor] = None
874
+
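+ # Illustrative usage sketch for the extra keys on `MotifModelOutputWithPast` above
+ # (`motif_model` is a placeholder for a `MotifModel` instance); they are populated only
+ # when explicitly requested from `MotifModel.forward`:
+ #     out = motif_model(input_ids,
+ #                       outputs_include_causal_mask=True,
+ #                       outputs_include_position_embeddings=True)
+ #     mask, rope_cos_sin = out.causal_mask, out.position_embeddings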
875
+
876
+ MOTIF_INPUTS_DOCSTRING = r"""
877
+ Args:
878
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
879
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
880
+ it.
881
+
882
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
883
+ [`PreTrainedTokenizer.__call__`] for details.
884
+
885
+ [What are input IDs?](../glossary#input-ids)
886
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
887
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
888
+
889
+ - 1 for tokens that are **not masked**,
890
+ - 0 for tokens that are **masked**.
891
+
892
+ [What are attention masks?](../glossary#attention-mask)
893
+
894
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
895
+ [`PreTrainedTokenizer.__call__`] for details.
896
+
897
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
898
+ `past_key_values`).
899
+
900
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
901
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
902
+ information on the default strategy.
903
+
904
+ - 1 indicates the head is **not masked**,
905
+ - 0 indicates the head is **masked**.
906
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
907
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
908
+ config.n_positions - 1]`.
909
+
910
+ [What are position IDs?](../glossary#position-ids)
911
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
912
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
913
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
914
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
915
+
916
+ Two formats are allowed:
917
+ - a [`~cache_utils.Cache`] instance, see our
918
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
919
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
920
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
921
+ cache format.
922
+
923
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
924
+ legacy cache format will be returned.
925
+
926
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
927
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
928
+ of shape `(batch_size, sequence_length)`.
929
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
930
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
931
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
932
+ model's internal embedding lookup matrix.
933
+ use_cache (`bool`, *optional*):
934
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
935
+ `past_key_values`).
936
+ output_attentions (`bool`, *optional*):
937
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
938
+ tensors for more detail.
939
+ output_hidden_states (`bool`, *optional*):
940
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
941
+ more detail.
942
+ return_dict (`bool`, *optional*):
943
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
944
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
945
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
946
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
947
+ the complete sequence length.
948
+ """
949
+
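+ # Illustrative sketch of the two `past_key_values` formats described above, assuming a recent
+ # `transformers` release that exposes `DynamicCache` at the top level (`model`, `input_ids` and
+ # `next_ids` are placeholder names):
+ #     from transformers import DynamicCache
+ #     cache = DynamicCache()                                            # Cache-class format
+ #     out = model(input_ids, use_cache=True, past_key_values=cache)
+ #     out = model(next_ids, use_cache=True, past_key_values=out.past_key_values)
+ # A legacy tuple-of-tuples is still accepted, but it is converted to a `DynamicCache` internally
+ # and triggers a deprecation warning (see `MotifModel.forward`).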
950
+
951
+ @add_start_docstrings(
952
+ "The bare Motif Model outputting raw hidden-states without any specific head on top.",
953
+ MOTIF_START_DOCSTRING,
954
+ )
955
+ class MotifModel(MotifPreTrainedModel):
956
+ """
957
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`]
958
+
959
+ Args:
960
+ config: MotifConfig
961
+ """
962
+
963
+ def __init__(self, config: MotifConfig):
964
+ super().__init__(config)
965
+ self.padding_idx = config.pad_token_id
966
+ self.vocab_size = config.vocab_size
967
+
968
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
969
+ num_hidden_layers = config.num_hidden_layers
970
+ self.layers = nn.ModuleList([MotifDecoderLayer(config=config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)])
971
+ self.norm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
972
+ self.hidden_size = config.hidden_size
973
+ self.num_heads = config.num_attention_heads
974
+ self.head_dim = self.hidden_size // self.num_heads
975
+ self.max_position_embeddings = config.max_position_embeddings
976
+ self.rope_theta = config.rope_theta
977
+ self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
978
+ max_position_embeddings=self.max_position_embeddings,
979
+ base=self.rope_theta)
980
+
981
+ self.gradient_checkpointing = False
982
+ self.post_init()
983
+
984
+ def get_input_embeddings(self):
985
+ return self.embed_tokens
986
+
987
+ def set_input_embeddings(self, value):
988
+ self.embed_tokens = value
989
+
990
+ @add_start_docstrings_to_model_forward(MOTIF_INPUTS_DOCSTRING)
991
+ def forward(
992
+ self,
993
+ input_ids: torch.LongTensor = None,
994
+ attention_mask: Optional[torch.Tensor] = None,
995
+ position_ids: Optional[torch.LongTensor] = None,
996
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
997
+ inputs_embeds: Optional[torch.FloatTensor] = None,
998
+ use_cache: Optional[bool] = None,
999
+ output_attentions: Optional[bool] = None,
1000
+ output_hidden_states: Optional[bool] = None,
1001
+ return_dict: Optional[bool] = None,
1002
+ cache_position: Optional[torch.LongTensor] = None,
1003
+ outputs_include_causal_mask: bool = False,
1004
+ outputs_include_position_embeddings: bool = False,
1005
+ ) -> Union[Tuple, MotifModelOutputWithPast]:
1006
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1007
+ output_hidden_states = (output_hidden_states
1008
+ if output_hidden_states is not None else self.config.output_hidden_states)
1009
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1010
+
1011
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1012
+
1013
+ if (input_ids is None) ^ (inputs_embeds is not None):
1014
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1015
+
1016
+ if self.gradient_checkpointing and self.training:
1017
+ if use_cache:
1018
+ logger.warning_once(
1019
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
1020
+ use_cache = False
1021
+
1022
+ return_legacy_cache = False
1023
+ if use_cache and not isinstance(past_key_values, Cache):
1024
+ return_legacy_cache = True
1025
+ if past_key_values is None:
1026
+ past_key_values = DynamicCache()
1027
+ else:
1028
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1029
+ logger.warning_once(
1030
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
1031
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
1032
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)")
1033
+
1034
+ if inputs_embeds is None:
1035
+ inputs_embeds = self.embed_tokens(input_ids)
1036
+
1037
+ if cache_position is None:
1038
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1039
+ cache_position = torch.arange(past_seen_tokens,
1040
+ past_seen_tokens + inputs_embeds.shape[1],
1041
+ device=inputs_embeds.device)
1042
+ if position_ids is None:
1043
+ position_ids = cache_position.unsqueeze(0)
1044
+
1045
+ causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values,
1046
+ output_attentions)
1047
+
1048
+ hidden_states = inputs_embeds
1049
+ bsz, q_len, _ = hidden_states.size()
1050
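+ # Rotary cos/sin are computed once per forward pass and shared by every decoder layer
+ # through the `position_embeddings` argument, instead of being recomputed in each attention module.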
+ position_embeddings = self.rotary_emb(hidden_states, seq_len=q_len)
1051
+
1052
+ all_hidden_states = () if output_hidden_states else None
1053
+ all_self_attns = () if output_attentions else None
1054
+ next_decoder_cache = None
1055
+
1056
+ for idx, decoder_layer in enumerate(self.layers):
1057
+ if output_hidden_states:
1058
+ all_hidden_states += (hidden_states, )
1059
+
1060
+ if self.gradient_checkpointing and self.training:
1061
+ layer_outputs = self._gradient_checkpointing_func(
1062
+ decoder_layer.__call__,
1063
+ hidden_states,
1064
+ causal_mask,
1065
+ position_ids,
1066
+ past_key_values,
1067
+ output_attentions,
1068
+ use_cache,
1069
+ cache_position,
1070
+ position_embeddings,
1071
+ )
1072
+ else:
1073
+ layer_outputs = decoder_layer(
1074
+ hidden_states,
1075
+ attention_mask=causal_mask,
1076
+ position_ids=position_ids,
1077
+ past_key_value=past_key_values,
1078
+ output_attentions=output_attentions,
1079
+ use_cache=use_cache,
1080
+ cache_position=cache_position,
1081
+ position_embeddings=position_embeddings,
1082
+ )
1083
+
1084
+ hidden_states = layer_outputs[0]
1085
+
1086
+ if use_cache:
1087
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1088
+
1089
+ if output_attentions:
1090
+ all_self_attns += (layer_outputs[1], )
1091
+
1092
+ hidden_states = self.norm(hidden_states)
1093
+
1094
+ if output_hidden_states:
1095
+ all_hidden_states += (hidden_states, )
1096
+
1097
+ next_cache = next_decoder_cache if use_cache else None
1098
+ if return_legacy_cache:
1099
+ next_cache = next_cache.to_legacy_cache()
1100
+
1101
+ causal_mask_output = causal_mask if outputs_include_causal_mask else None
1102
+ position_embeddings_output = position_embeddings if outputs_include_position_embeddings else None
1103
+ if not return_dict:
1104
+ return tuple(v for v in [
1105
+ hidden_states, next_cache, all_hidden_states, all_self_attns, causal_mask_output,
1106
+ position_embeddings_output
1107
+ ] if v is not None)
1108
+ return MotifModelOutputWithPast(last_hidden_state=hidden_states,
1109
+ past_key_values=next_cache,
1110
+ hidden_states=all_hidden_states,
1111
+ attentions=all_self_attns,
1112
+ causal_mask=causal_mask_output,
1113
+ position_embeddings=position_embeddings_output)
1114
+
1115
+ def _update_causal_mask(
1116
+ self,
1117
+ attention_mask: torch.Tensor,
1118
+ input_tensor: torch.Tensor,
1119
+ cache_position: torch.Tensor,
1120
+ past_key_values: Cache,
1121
+ output_attentions: bool,
1122
+ ):
1123
+ if self.config._attn_implementation == "flash_attention_2":
1124
+ if attention_mask is not None and 0.0 in attention_mask:
1125
+ return attention_mask
1126
+ return None
1127
+
1128
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1129
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1130
+ # to infer the attention mask.
1131
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1132
+ using_static_cache = isinstance(past_key_values, StaticCache)
1133
+ using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
1134
+
1135
+ # When `output_attentions` is True, the SDPA implementation's forward method calls the eager implementation's forward
1136
+ if (self.config._attn_implementation == "sdpa" and not (using_static_cache or using_sliding_window_cache)
1137
+ and not output_attentions):
1138
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1139
+ attention_mask,
1140
+ inputs_embeds=input_tensor,
1141
+ past_key_values_length=past_seen_tokens,
1142
+ sliding_window=self.config.sliding_window,
1143
+ is_training=self.training,
1144
+ ):
1145
+ return None
1146
+
1147
+ dtype, device = input_tensor.dtype, input_tensor.device
1148
+ min_dtype = torch.finfo(dtype).min
1149
+ sequence_length = input_tensor.shape[1]
1150
+
1151
+ # SlidingWindowCache or StaticCache
1152
+ if using_sliding_window_cache or using_static_cache:
1153
+ target_length = past_key_values.get_max_cache_shape()
1154
+ # DynamicCache or no cache
1155
+ else:
1156
+ target_length = (attention_mask.shape[-1]
1157
+ if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + sequence_length + 1)
1158
+
1159
+ # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
1160
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
1161
+ attention_mask,
1162
+ sequence_length=sequence_length,
1163
+ target_length=target_length,
1164
+ dtype=dtype,
1165
+ device=device,
1166
+ cache_position=cache_position,
1167
+ batch_size=input_tensor.shape[0],
1168
+ config=self.config,
1169
+ past_key_values=past_key_values,
1170
+ )
1171
+
1172
+ if (self.config._attn_implementation == "sdpa" and attention_mask is not None
1173
+ and attention_mask.device.type == "cuda" and not output_attentions):
1174
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1175
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1176
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1177
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1178
+
1179
+ return causal_mask
1180
+
1181
+ @staticmethod
1182
+ def _prepare_4d_causal_attention_mask_with_cache_position(
1183
+ attention_mask: torch.Tensor,
1184
+ sequence_length: int,
1185
+ target_length: int,
1186
+ dtype: torch.dtype,
1187
+ device: torch.device,
1188
+ cache_position: torch.Tensor,
1189
+ batch_size: int,
1190
+ config: MotifConfig,
1191
+ past_key_values: Cache,
1192
+ ):
1193
+ """
1194
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1195
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, leaves it unchanged.
1196
+
1197
+ Args:
1198
+ attention_mask (`torch.Tensor`):
1199
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
1200
+ sequence_length (`int`):
1201
+ The sequence length being processed.
1202
+ target_length (`int`):
1203
+ The target length: when generating with a static cache, the mask should be as long as the static cache to account for the zero padding (the part of the cache that is not filled yet).
1204
+ dtype (`torch.dtype`):
1205
+ The dtype to use for the 4D attention mask.
1206
+ device (`torch.device`):
1207
+ The device to place the 4D attention mask on.
1208
+ cache_position (`torch.Tensor`):
1209
+ Indices depicting the position of the input sequence tokens in the sequence.
1210
+ batch_size (`int`):
1211
+ Batch size.
1212
+ config (`MotifConfig`):
1213
+ The model's configuration class
1214
+ past_key_values (`Cache`):
1215
+ The cache class that is currently being used to generate
1216
+ """
1217
+ if attention_mask is not None and attention_mask.dim() == 4:
1218
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1219
+ causal_mask = attention_mask
1220
+ else:
1221
+ min_dtype = torch.finfo(dtype).min
1222
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
1223
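+ # Positions strictly after `cache_position` (future tokens) keep the `min_dtype` fill and stay
+ # masked; with a sliding window, positions more than `sliding_window` tokens behind the query
+ # are masked out as well (handled below).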
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1224
+ if config.sliding_window is not None:
1225
+ # if we have a sliding window, we should not attend to tokens beyond the sliding window length, so we also mask them out
1226
+ # the check is needed to verify whether the current checkpoint was trained with a sliding window or not
1227
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
1228
+ sliding_attend_mask = torch.arange(
1229
+ target_length, device=device) <= (cache_position.reshape(-1, 1) - config.sliding_window)
1230
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
1231
+ causal_mask *= diagonal_attend_mask
1232
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1233
+ if attention_mask is not None:
1234
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1235
+ if attention_mask.shape[-1] > target_length:
1236
+ attention_mask = attention_mask[:, :target_length]
1237
+ mask_length = attention_mask.shape[-1]
1238
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
1239
+ padding_mask = padding_mask == 0
1240
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1241
+ padding_mask, min_dtype)
1242
+ return causal_mask
1243
+
1244
+
1245
+ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1246
+ _tied_weights_keys = ["lm_head.weight"]
1247
+
1248
+ def __init__(self, config: MotifConfig):
1249
+ super().__init__(config)
1250
+ self.model = MotifModel(config)
1251
+ self.vocab_size = config.vocab_size
1252
+
1253
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1254
+
1255
+ # Initialize weights and apply final processing
1256
+ self.post_init()
1257
+
1258
+ if getattr(config, "tie_word_embeddings", True):
1259
+ self.tie_weights()
1260
+
1261
+ def get_input_embeddings(self):
1262
+ return self.model.embed_tokens
1263
+
1264
+ def set_input_embeddings(self, value):
1265
+ self.model.embed_tokens = value
1266
+
1267
+ def get_output_embeddings(self):
1268
+ return self.lm_head
1269
+
1270
+ def set_output_embeddings(self, new_embeddings):
1271
+ self.lm_head = new_embeddings
1272
+
1273
+ def set_decoder(self, decoder):
1274
+ self.model = decoder
1275
+
1276
+ def get_decoder(self):
1277
+ return self.model
1278
+
1279
+
1280
+ @add_start_docstrings_to_model_forward(MOTIF_INPUTS_DOCSTRING)
1281
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1282
+ def forward(
1283
+ self,
1284
+ input_ids: torch.LongTensor = None,
1285
+ attention_mask: Optional[torch.Tensor] = None,
1286
+ position_ids: Optional[torch.LongTensor] = None,
1287
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1288
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1289
+ labels: Optional[torch.LongTensor] = None,
1290
+ use_cache: Optional[bool] = None,
1291
+ output_attentions: Optional[bool] = None,
1292
+ output_hidden_states: Optional[bool] = None,
1293
+ return_dict: Optional[bool] = None,
1294
+ cache_position: Optional[torch.LongTensor] = None,
1295
+ num_logits_to_keep: int = 0,
1296
+ **loss_kwargs,
1297
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1298
+ r"""
1299
+ Args:
1300
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1301
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1302
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1303
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1304
+
1305
+ num_logits_to_keep (`int`, *optional*):
1306
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1307
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1308
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1309
+
1310
+ Returns:
1311
+
1312
+ Example:
1313
+
1314
+ ```python
1315
+ >>> from transformers import AutoTokenizer, MotifForCausalLM
1316
+
1317
+ >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1318
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1319
+
1320
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1321
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1322
+
1323
+ >>> # Generate
1324
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1325
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1326
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1327
+ ```"""
1328
+
1329
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1330
+ output_hidden_states = (output_hidden_states
1331
+ if output_hidden_states is not None else self.config.output_hidden_states)
1332
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1333
+
1334
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1335
+ outputs: MotifModelOutputWithPast = self.model(
1336
+ input_ids=input_ids,
1337
+ attention_mask=attention_mask,
1338
+ position_ids=position_ids,
1339
+ past_key_values=past_key_values,
1340
+ inputs_embeds=inputs_embeds,
1341
+ use_cache=use_cache,
1342
+ output_attentions=output_attentions,
1343
+ output_hidden_states=output_hidden_states,
1344
+ return_dict=return_dict,
1345
+ cache_position=cache_position,
1346
+ )
1347
+
1348
+ hidden_states = outputs[0]
1349
+
1350
+ # Only compute the necessary logits (the last `num_logits_to_keep` positions)
1351
+ hidden_states = hidden_states
1352
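+ # Note: `hidden_states[:, -0:, :]` selects every position, so the default
+ # `num_logits_to_keep=0` computes logits for the full sequence.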
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1353
+ logits = logits.float()
1354
+
1355
+ loss = None
1356
+ if labels is not None:
1357
+ logits = logits
1358
+ # Shift so that tokens < n predict n
1359
+ shift_logits = logits[..., :-1, :].contiguous()
1360
+ shift_labels = labels[..., 1:].contiguous()
1361
+ # Flatten the tokens
1362
+ loss_fct = CrossEntropyLoss()
1363
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1364
+ shift_labels = shift_labels.view(-1)
1365
+ shift_labels = shift_labels.to(shift_logits.device)
1366
+ loss = loss_fct(shift_logits, shift_labels)
1367
+
1368
+ if not return_dict:
1369
+ output = (logits, ) + outputs[1:]
1370
+ return (loss, ) + output if loss is not None else output
1371
+
1372
+ return CausalLMOutputWithPast(
1373
+ loss=loss,
1374
+ logits=logits,
1375
+ past_key_values=outputs.past_key_values,
1376
+ hidden_states=outputs.hidden_states,
1377
+ attentions=outputs.attentions,
1378
+ )
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|beginoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:275139d476909028da05d4ee035aba88f0ca0dbfd0d395f72b7fc80fd7782e19
3
+ size 17264873
tokenizer_config.json ADDED
@@ -0,0 +1,1027 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "219395": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "219396": {
13
+ "content": "<|beginoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "219397": {
21
+ "content": "<|fim_prefix|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "219398": {
29
+ "content": "<|fim_middle|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "219399": {
37
+ "content": "<|fim_suffix|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "219400": {
45
+ "content": "<|system|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "219401": {
53
+ "content": "<|user|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "219402": {
61
+ "content": "<|assistant|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "219403": {
69
+ "content": "<|startofturn|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "219404": {
77
+ "content": "<think>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "219405": {
85
+ "content": "<|endofturn|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "219406": {
93
+ "content": "</think>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "219407": {
101
+ "content": "<|dummy_id_3|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "219408": {
109
+ "content": "<|dummy_id_4|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "219409": {
117
+ "content": "<|dummy_id_5|>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "219410": {
125
+ "content": "<|dummy_id_6|>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "219411": {
133
+ "content": "<|dummy_id_7|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "219412": {
141
+ "content": "<|dummy_id_8|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "219413": {
149
+ "content": "<|dummy_id_9|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "219414": {
157
+ "content": "<|dummy_id_10|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "219415": {
165
+ "content": "<|dummy_id_11|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "219416": {
173
+ "content": "<|endofprompt|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "219417": {
181
+ "content": "<|dummy_id_12|>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "219418": {
189
+ "content": "<|dummy_id_13|>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "219419": {
197
+ "content": "<|dummy_id_14|>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "219420": {
205
+ "content": "<|dummy_id_15|>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "219421": {
213
+ "content": "<|dummy_id_16|>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "219422": {
221
+ "content": "<|dummy_id_17|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "219423": {
229
+ "content": "<|dummy_id_18|>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "219424": {
237
+ "content": "<|dummy_id_19|>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "219425": {
245
+ "content": "<|dummy_id_20|>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "219426": {
253
+ "content": "<|dummy_id_21|>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "219427": {
261
+ "content": "<|dummy_id_22|>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "219428": {
269
+ "content": "<|dummy_id_23|>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "219429": {
277
+ "content": "<|dummy_id_24|>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "219430": {
285
+ "content": "<|dummy_id_25|>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "219431": {
293
+ "content": "<|dummy_id_26|>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "219432": {
301
+ "content": "<|dummy_id_27|>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "219433": {
309
+ "content": "<|dummy_id_28|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "219434": {
317
+ "content": "<|dummy_id_29|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "219435": {
325
+ "content": "<|dummy_id_30|>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "219436": {
333
+ "content": "<|dummy_id_31|>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "219437": {
341
+ "content": "<|dummy_id_32|>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "219438": {
349
+ "content": "<|dummy_id_33|>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "219439": {
357
+ "content": "<|dummy_id_34|>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "219440": {
365
+ "content": "<|dummy_id_35|>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "219441": {
373
+ "content": "<|dummy_id_36|>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "219442": {
381
+ "content": "<|dummy_id_37|>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "219443": {
389
+ "content": "<|dummy_id_38|>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "219444": {
397
+ "content": "<|dummy_id_39|>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "219445": {
405
+ "content": "<|dummy_id_40|>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "219446": {
413
+ "content": "<|dummy_id_41|>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "219447": {
421
+ "content": "<|dummy_id_42|>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "219448": {
429
+ "content": "<|dummy_id_43|>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "219449": {
437
+ "content": "<|dummy_id_44|>",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "219450": {
445
+ "content": "<|dummy_id_45|>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "219451": {
453
+ "content": "<|dummy_id_46|>",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "219452": {
461
+ "content": "<|dummy_id_47|>",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "219453": {
469
+ "content": "<|dummy_id_48|>",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "219454": {
477
+ "content": "<|dummy_id_49|>",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "219455": {
485
+ "content": "<|dummy_id_50|>",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "219456": {
493
+ "content": "<|dummy_id_51|>",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "219457": {
501
+ "content": "<|dummy_id_52|>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "219458": {
509
+ "content": "<|dummy_id_53|>",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "219459": {
517
+ "content": "<|dummy_id_54|>",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "219460": {
525
+ "content": "<|dummy_id_55|>",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "219461": {
533
+ "content": "<|dummy_id_56|>",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "219462": {
541
+ "content": "<|dummy_id_57|>",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "219463": {
549
+ "content": "<|dummy_id_58|>",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "219464": {
557
+ "content": "<|dummy_id_59|>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "219465": {
565
+ "content": "<|dummy_id_60|>",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "219466": {
573
+ "content": "<|dummy_id_61|>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "219467": {
581
+ "content": "<|dummy_id_62|>",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "219468": {
589
+ "content": "<|dummy_id_63|>",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "219469": {
597
+ "content": "<|dummy_id_64|>",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "219470": {
605
+ "content": "<|dummy_id_65|>",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "219471": {
613
+ "content": "<|dummy_id_66|>",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "219472": {
621
+ "content": "<|dummy_id_67|>",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "219473": {
629
+ "content": "<|dummy_id_68|>",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "219474": {
637
+ "content": "<|dummy_id_69|>",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "219475": {
645
+ "content": "<|dummy_id_70|>",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "219476": {
653
+ "content": "<|dummy_id_71|>",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "219477": {
661
+ "content": "<|dummy_id_72|>",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "219478": {
669
+ "content": "<|dummy_id_73|>",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "219479": {
677
+ "content": "<|dummy_id_74|>",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "219480": {
685
+ "content": "<|dummy_id_75|>",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "219481": {
693
+ "content": "<|dummy_id_76|>",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "219482": {
701
+ "content": "<|dummy_id_77|>",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "219483": {
709
+ "content": "<|dummy_id_78|>",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "219484": {
717
+ "content": "<|dummy_id_79|>",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "219485": {
725
+ "content": "<|dummy_id_80|>",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "219486": {
733
+ "content": "<|dummy_id_81|>",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "219487": {
741
+ "content": "<|dummy_id_82|>",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "219488": {
749
+ "content": "<|dummy_id_83|>",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "219489": {
757
+ "content": "<|dummy_id_84|>",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "219490": {
765
+ "content": "<|dummy_id_85|>",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "219491": {
773
+ "content": "<|dummy_id_86|>",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "219492": {
781
+ "content": "<|dummy_id_87|>",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "219493": {
789
+ "content": "<|dummy_id_88|>",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "219494": {
797
+ "content": "<|dummy_id_89|>",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "219495": {
805
+ "content": "<|dummy_id_90|>",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "219496": {
813
+ "content": "<|dummy_id_91|>",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "219497": {
821
+ "content": "<|dummy_id_92|>",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "219498": {
829
+ "content": "<|dummy_id_93|>",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "219499": {
837
+ "content": "<|dummy_id_94|>",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "219500": {
845
+ "content": "<|dummy_id_95|>",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "219501": {
853
+ "content": "<|dummy_id_96|>",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "219502": {
861
+ "content": "<|dummy_id_97|>",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "219503": {
869
+ "content": "<|dummy_id_98|>",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "219504": {
877
+ "content": "<|dummy_id_99|>",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "219505": {
885
+ "content": "<|dummy_id_100|>",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "219506": {
893
+ "content": "<|dummy_id_101|>",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "219507": {
901
+ "content": "<|dummy_id_102|>",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "219508": {
909
+ "content": "<|dummy_id_103|>",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "219509": {
917
+ "content": "<|dummy_id_104|>",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "219510": {
925
+ "content": "<|dummy_id_105|>",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "219511": {
933
+ "content": "<|dummy_id_106|>",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "219512": {
941
+ "content": "<|dummy_id_107|>",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "219513": {
949
+ "content": "<|dummy_id_108|>",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "219514": {
957
+ "content": "<|dummy_id_109|>",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "219515": {
965
+ "content": "<|dummy_id_110|>",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "219516": {
973
+ "content": "<|dummy_id_111|>",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "219517": {
981
+ "content": "<|dummy_id_112|>",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "219518": {
989
+ "content": "<|dummy_id_113|>",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "219519": {
997
+ "content": "<|dummy_id_114|>",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ }
1004
+ },
1005
+ "block_size": 2048,
1006
+ "bos_token": "<|beginoftext|>",
1007
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'assistant' and '</think>' in content %}{% set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}{% set content = content.split('</think>')[-1].lstrip('\n') %}{{ '<|startofturn|><|assistant|>\n\n<think>\n' + reasoning_content + '\n</think>\n\n' + content + '<|endofturn|>' }}{% else %}{{ '<|startofturn|><|' + message['role'] + '|>\n\n' + content + '<|endofturn|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|startofturn|><|assistant|>\n\n' }}{% endif %}",
1008
+ "clean_up_tokenization_spaces": false,
1009
+ "corruption_rate": 0.15,
1010
+ "eos_token": "<|endoftext|>",
1011
+ "extra_ids": 0,
1012
+ "extra_special_tokens": {},
1013
+ "fixed_vocab": true,
1014
+ "merges_file_path": "./data/merges.txt",
1015
+ "model_max_length": 1000000000000000019884624838656,
1016
+ "pad_token": "<|endoftext|>",
1017
+ "padding_side": "left",
1018
+ "seq_length": 2048,
1019
+ "tokenizer_class": "GPT2Tokenizer",
1020
+ "tokenizer_name": "/nfs-ssd/motif_1/tokenizers/ver5",
1021
+ "tokens": -1,
1022
+ "unk_token": "<|endoftext|>",
1023
+ "update_tokenizer": false,
1024
+ "use_moreh_tokenizer": false,
1025
+ "vocab_file_path": "./data/vocab.json",
1026
+ "vocab_size": 219395
1027
+ }
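For context, the `chat_template` added above wraps every turn as `<|startofturn|><|role|>\n\n…<|endofturn|>`, and for assistant turns that already contain a `</think>` tag it splits out the reasoning span and re-emits it between `<think>` and `</think>` ahead of the visible reply. A minimal usage sketch through the standard `transformers` API follows; the repo id is a placeholder, not something defined in this PR:

```python
from transformers import AutoTokenizer

# Placeholder repo id -- substitute the actual Motif-2.6B checkpoint location.
tokenizer = AutoTokenizer.from_pretrained("Motif-Technologies/Motif-2.6B")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# With add_generation_prompt=True the template renders
#   <|beginoftext|><|startofturn|><|system|>\n\n...<|endofturn|>
#   <|startofturn|><|user|>\n\n...<|endofturn|><|startofturn|><|assistant|>\n\n
# so the model continues generation from the assistant header.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```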
vocab.json ADDED
The diff for this file is too large to render.
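Since the vocab.json diff is not rendered, a quick local sanity check is to load the tokenizer from a checkout of this branch and confirm that the reserved `<|dummy_id_*|>` entries and the special tokens line up with the config above; the local path is a placeholder:

```python
from transformers import AutoTokenizer

# Placeholder path -- point this at a local checkout of the branch under review.
tok = AutoTokenizer.from_pretrained("./Motif-2.6B")

# added_tokens_decoder above maps ids 219474..219519 to <|dummy_id_69|>..<|dummy_id_114|>.
assert tok.convert_ids_to_tokens(219474) == "<|dummy_id_69|>"
assert tok.convert_ids_to_tokens(219519) == "<|dummy_id_114|>"

# bos/eos/pad/unk as declared in tokenizer_config.json.
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)

# Base vocab_size is 219395; len(tok) also counts the added special tokens.
print(len(tok))
```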