kaitos255 committed
Commit e29924d · 1 Parent(s): 2dfc504

initial commit

LICENSE.txt ADDED
@@ -0,0 +1,412 @@
1
+ Copyright 2025- Preferred Networks, Inc. All rights reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
204
+
205
+ ---
206
+
207
+ This software contains modified code from the Hugging Face Transformers library, which is released under the Apache License 2.0.
208
+
209
+ ---
210
+ Copyright 2018- The Hugging Face team. All rights reserved.
211
+
212
+ Apache License
213
+ Version 2.0, January 2004
214
+ http://www.apache.org/licenses/
215
+
216
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
217
+
218
+ 1. Definitions.
219
+
220
+ "License" shall mean the terms and conditions for use, reproduction,
221
+ and distribution as defined by Sections 1 through 9 of this document.
222
+
223
+ "Licensor" shall mean the copyright owner or entity authorized by
224
+ the copyright owner that is granting the License.
225
+
226
+ "Legal Entity" shall mean the union of the acting entity and all
227
+ other entities that control, are controlled by, or are under common
228
+ control with that entity. For the purposes of this definition,
229
+ "control" means (i) the power, direct or indirect, to cause the
230
+ direction or management of such entity, whether by contract or
231
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
232
+ outstanding shares, or (iii) beneficial ownership of such entity.
233
+
234
+ "You" (or "Your") shall mean an individual or Legal Entity
235
+ exercising permissions granted by this License.
236
+
237
+ "Source" form shall mean the preferred form for making modifications,
238
+ including but not limited to software source code, documentation
239
+ source, and configuration files.
240
+
241
+ "Object" form shall mean any form resulting from mechanical
242
+ transformation or translation of a Source form, including but
243
+ not limited to compiled object code, generated documentation,
244
+ and conversions to other media types.
245
+
246
+ "Work" shall mean the work of authorship, whether in Source or
247
+ Object form, made available under the License, as indicated by a
248
+ copyright notice that is included in or attached to the work
249
+ (an example is provided in the Appendix below).
250
+
251
+ "Derivative Works" shall mean any work, whether in Source or Object
252
+ form, that is based on (or derived from) the Work and for which the
253
+ editorial revisions, annotations, elaborations, or other modifications
254
+ represent, as a whole, an original work of authorship. For the purposes
255
+ of this License, Derivative Works shall not include works that remain
256
+ separable from, or merely link (or bind by name) to the interfaces of,
257
+ the Work and Derivative Works thereof.
258
+
259
+ "Contribution" shall mean any work of authorship, including
260
+ the original version of the Work and any modifications or additions
261
+ to that Work or Derivative Works thereof, that is intentionally
262
+ submitted to Licensor for inclusion in the Work by the copyright owner
263
+ or by an individual or Legal Entity authorized to submit on behalf of
264
+ the copyright owner. For the purposes of this definition, "submitted"
265
+ means any form of electronic, verbal, or written communication sent
266
+ to the Licensor or its representatives, including but not limited to
267
+ communication on electronic mailing lists, source code control systems,
268
+ and issue tracking systems that are managed by, or on behalf of, the
269
+ Licensor for the purpose of discussing and improving the Work, but
270
+ excluding communication that is conspicuously marked or otherwise
271
+ designated in writing by the copyright owner as "Not a Contribution."
272
+
273
+ "Contributor" shall mean Licensor and any individual or Legal Entity
274
+ on behalf of whom a Contribution has been received by Licensor and
275
+ subsequently incorporated within the Work.
276
+
277
+ 2. Grant of Copyright License. Subject to the terms and conditions of
278
+ this License, each Contributor hereby grants to You a perpetual,
279
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
280
+ copyright license to reproduce, prepare Derivative Works of,
281
+ publicly display, publicly perform, sublicense, and distribute the
282
+ Work and such Derivative Works in Source or Object form.
283
+
284
+ 3. Grant of Patent License. Subject to the terms and conditions of
285
+ this License, each Contributor hereby grants to You a perpetual,
286
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
287
+ (except as stated in this section) patent license to make, have made,
288
+ use, offer to sell, sell, import, and otherwise transfer the Work,
289
+ where such license applies only to those patent claims licensable
290
+ by such Contributor that are necessarily infringed by their
291
+ Contribution(s) alone or by combination of their Contribution(s)
292
+ with the Work to which such Contribution(s) was submitted. If You
293
+ institute patent litigation against any entity (including a
294
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
295
+ or a Contribution incorporated within the Work constitutes direct
296
+ or contributory patent infringement, then any patent licenses
297
+ granted to You under this License for that Work shall terminate
298
+ as of the date such litigation is filed.
299
+
300
+ 4. Redistribution. You may reproduce and distribute copies of the
301
+ Work or Derivative Works thereof in any medium, with or without
302
+ modifications, and in Source or Object form, provided that You
303
+ meet the following conditions:
304
+
305
+ (a) You must give any other recipients of the Work or
306
+ Derivative Works a copy of this License; and
307
+
308
+ (b) You must cause any modified files to carry prominent notices
309
+ stating that You changed the files; and
310
+
311
+ (c) You must retain, in the Source form of any Derivative Works
312
+ that You distribute, all copyright, patent, trademark, and
313
+ attribution notices from the Source form of the Work,
314
+ excluding those notices that do not pertain to any part of
315
+ the Derivative Works; and
316
+
317
+ (d) If the Work includes a "NOTICE" text file as part of its
318
+ distribution, then any Derivative Works that You distribute must
319
+ include a readable copy of the attribution notices contained
320
+ within such NOTICE file, excluding those notices that do not
321
+ pertain to any part of the Derivative Works, in at least one
322
+ of the following places: within a NOTICE text file distributed
323
+ as part of the Derivative Works; within the Source form or
324
+ documentation, if provided along with the Derivative Works; or,
325
+ within a display generated by the Derivative Works, if and
326
+ wherever such third-party notices normally appear. The contents
327
+ of the NOTICE file are for informational purposes only and
328
+ do not modify the License. You may add Your own attribution
329
+ notices within Derivative Works that You distribute, alongside
330
+ or as an addendum to the NOTICE text from the Work, provided
331
+ that such additional attribution notices cannot be construed
332
+ as modifying the License.
333
+
334
+ You may add Your own copyright statement to Your modifications and
335
+ may provide additional or different license terms and conditions
336
+ for use, reproduction, or distribution of Your modifications, or
337
+ for any such Derivative Works as a whole, provided Your use,
338
+ reproduction, and distribution of the Work otherwise complies with
339
+ the conditions stated in this License.
340
+
341
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
342
+ any Contribution intentionally submitted for inclusion in the Work
343
+ by You to the Licensor shall be under the terms and conditions of
344
+ this License, without any additional terms or conditions.
345
+ Notwithstanding the above, nothing herein shall supersede or modify
346
+ the terms of any separate license agreement you may have executed
347
+ with Licensor regarding such Contributions.
348
+
349
+ 6. Trademarks. This License does not grant permission to use the trade
350
+ names, trademarks, service marks, or product names of the Licensor,
351
+ except as required for reasonable and customary use in describing the
352
+ origin of the Work and reproducing the content of the NOTICE file.
353
+
354
+ 7. Disclaimer of Warranty. Unless required by applicable law or
355
+ agreed to in writing, Licensor provides the Work (and each
356
+ Contributor provides its Contributions) on an "AS IS" BASIS,
357
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
358
+ implied, including, without limitation, any warranties or conditions
359
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
360
+ PARTICULAR PURPOSE. You are solely responsible for determining the
361
+ appropriateness of using or redistributing the Work and assume any
362
+ risks associated with Your exercise of permissions under this License.
363
+
364
+ 8. Limitation of Liability. In no event and under no legal theory,
365
+ whether in tort (including negligence), contract, or otherwise,
366
+ unless required by applicable law (such as deliberate and grossly
367
+ negligent acts) or agreed to in writing, shall any Contributor be
368
+ liable to You for damages, including any direct, indirect, special,
369
+ incidental, or consequential damages of any character arising as a
370
+ result of this License or out of the use or inability to use the
371
+ Work (including but not limited to damages for loss of goodwill,
372
+ work stoppage, computer failure or malfunction, or any and all
373
+ other commercial damages or losses), even if such Contributor
374
+ has been advised of the possibility of such damages.
375
+
376
+ 9. Accepting Warranty or Additional Liability. While redistributing
377
+ the Work or Derivative Works thereof, You may choose to offer,
378
+ and charge a fee for, acceptance of support, warranty, indemnity,
379
+ or other liability obligations and/or rights consistent with this
380
+ License. However, in accepting such obligations, You may act only
381
+ on Your own behalf and on Your sole responsibility, not on behalf
382
+ of any other Contributor, and only if You agree to indemnify,
383
+ defend, and hold each Contributor harmless for any liability
384
+ incurred by, or claims asserted against, such Contributor by reason
385
+ of your accepting any such warranty or additional liability.
386
+
387
+ END OF TERMS AND CONDITIONS
388
+
389
+ APPENDIX: How to apply the Apache License to your work.
390
+
391
+ To apply the Apache License to your work, attach the following
392
+ boilerplate notice, with the fields enclosed by brackets "[]"
393
+ replaced with your own identifying information. (Don't include
394
+ the brackets!) The text should be enclosed in the appropriate
395
+ comment syntax for the file format. We also recommend that a
396
+ file or class name and description of purpose be included on the
397
+ same "printed page" as the copyright notice for easier
398
+ identification within third-party archives.
399
+
400
+ Copyright [yyyy] [name of copyright owner]
401
+
402
+ Licensed under the Apache License, Version 2.0 (the "License");
403
+ you may not use this file except in compliance with the License.
404
+ You may obtain a copy of the License at
405
+
406
+ http://www.apache.org/licenses/LICENSE-2.0
407
+
408
+ Unless required by applicable law or agreed to in writing, software
409
+ distributed under the License is distributed on an "AS IS" BASIS,
410
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
411
+ See the License for the specific language governing permissions and
412
+ limitations under the License.
config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "architectures": [
3
+ "PlamoBiModel"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "modeling_plamo.PlamoConfig",
8
+ "AutoModel": "modeling_plamo.PlamoBiModel"
9
+ },
10
+ "bos_token_id": 1,
11
+ "capacity_factor": 1.0,
12
+ "eos_token_id": 1,
13
+ "eval_attention_n_bit": null,
14
+ "eval_mlp_n_bit": null,
15
+ "eval_offload_moe": false,
16
+ "expert_dropout": 0.0,
17
+ "fp8_accum_dtype": "bfloat16",
18
+ "group_size": 1024,
19
+ "hidden_size": 2048,
20
+ "hidden_size_per_head": 128,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 8192,
23
+ "k_expert": null,
24
+ "linear_type": "fp8",
25
+ "max_length": 4096,
26
+ "max_position_embeddings": 4096,
27
+ "model_type": "plamo",
28
+ "n_expert": null,
29
+ "num_attention_heads": 16,
30
+ "num_hidden_layers": 16,
31
+ "num_key_value_heads": 1,
32
+ "pad_token_id": 3,
33
+ "rms_norm_eps": 1e-06,
34
+ "shared_intermediate_size": null,
35
+ "sparse_intermediate_size": null,
36
+ "sparse_step": null,
37
+ "tie_word_embeddings": false,
38
+ "tokenizer_class": "PlamoTokenizer",
39
+ "torch_dtype": "bfloat16",
40
+ "transformers_version": "4.47.0",
41
+ "use_cache": false,
42
+ "vocab_size": 50112
43
+ }
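
Note: config.json wires in the custom code through "auto_map" ("AutoConfig" → modeling_plamo.PlamoConfig, "AutoModel" → modeling_plamo.PlamoBiModel), so the checkpoint is meant to be loaded with remote code enabled. A minimal loading sketch, assuming a placeholder repository id (the actual repo id is not shown in this commit):

import torch
from transformers import AutoConfig, AutoModel

repo_id = "org/plamo-bi-checkpoint"  # hypothetical placeholder; substitute the real repository id

# auto_map in config.json resolves these to PlamoConfig / PlamoBiModel in modeling_plamo.py
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    config=config,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,
)
model.eval()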
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e9e26cd19d9a90dc79d1c0e8d4755d881fe984a8582d7dcd776b9adb2dcf9f1
3
+ size 2101303432
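
Note: model.safetensors is committed as a Git LFS pointer, so the sha256 oid and byte size above can be used to check a downloaded copy. A small verification sketch, assuming the weights have already been fetched to a local path (the path is an assumption; the expected digest and size come from the pointer file):

import hashlib
from pathlib import Path

path = Path("model.safetensors")  # hypothetical local path after fetching the LFS object
expected_oid = "0e9e26cd19d9a90dc79d1c0e8d4755d881fe984a8582d7dcd776b9adb2dcf9f1"
expected_size = 2101303432

h = hashlib.sha256()
size = 0
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        h.update(chunk)
        size += len(chunk)

assert size == expected_size, "size does not match the LFS pointer"
assert h.hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("model.safetensors matches the LFS pointer")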
modeling_plamo.py ADDED
@@ -0,0 +1,1089 @@
1
+ import enum
2
+ from typing import Any, List, NamedTuple, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel
8
+ from transformers.modeling_attn_mask_utils import (
9
+ _prepare_4d_causal_attention_mask,
10
+ _prepare_4d_causal_attention_mask_for_sdpa,
11
+ )
12
+ from transformers.modeling_outputs import BaseModelOutputWithPast
13
+ from transformers.tokenization_utils_base import BatchEncoding
14
+
15
+
16
+ def _swiglu(h: torch.Tensor) -> torch.Tensor:
17
+ h0, h1 = h.chunk(2, dim=-1)
18
+ return torch.nn.functional.silu(h0) * h1
19
+
20
+
21
+ class PlamoAttentionCache:
22
+ def __init__(self, key: torch.Tensor, value: torch.Tensor) -> None:
23
+ B, nh, L, c = key.shape
24
+ assert len(value.shape) == 4
25
+ assert value.shape[0] == B
26
+ assert value.shape[2] == L
27
+ self.key = key
28
+ self.value = value
29
+
30
+ def _validate(self, cache: torch.Tensor, new_tensor: torch.Tensor) -> None:
31
+ assert len(cache.shape) == 4
32
+ assert len(new_tensor.shape) == 4
33
+ assert cache.shape[0] == new_tensor.shape[0]
34
+ assert cache.shape[1] == new_tensor.shape[1]
35
+ assert cache.shape[3] == new_tensor.shape[3]
36
+
37
+ def append_cache(self, k: torch.Tensor, v: torch.Tensor) -> None:
38
+ self._validate(self.key, k)
39
+ self._validate(self.value, v)
40
+ assert k.shape[2] == v.shape[2]
41
+ self.key = torch.cat([self.key, k], dim=2)
42
+ self.value = torch.cat([self.value, v], dim=2)
43
+
44
+ def sequence_length(self) -> int:
45
+ return self.key.shape[2]
46
+
47
+
48
+ PlamoLayerCache = PlamoAttentionCache
49
+
50
+ PlamoCache = list[PlamoLayerCache]
51
+
52
+
53
+ class DecoderInput(NamedTuple):
54
+ hidden_states: torch.Tensor
55
+ position_ids: torch.Tensor
56
+ attention_mask: Optional[torch.Tensor] = None
57
+ past_key_values: Optional[PlamoCache] = None
58
+ output_hidden_states: Optional[bool] = False
59
+ output_attentions: Optional[bool] = False
60
+ use_cache: Optional[bool] = False
61
+ gradient_checkpointing: bool = False
62
+ input_ids: Optional[torch.Tensor] = None
63
+
64
+
65
+ class DecoderOutput(NamedTuple):
66
+ hidden_states: torch.Tensor
67
+ all_hidden_states: Optional[Tuple[torch.Tensor, ...]]
68
+ all_self_attns: Optional[Tuple[torch.Tensor, ...]]
69
+ next_decoder_cache: Optional[PlamoCache]
70
+
71
+
72
+ class LinearType(str, enum.Enum):
73
+ Normal = "normal"
74
+ Fp8 = "fp8"
75
+ Fp8Retain = "fp8-retain"
76
+
77
+
78
+ class PlamoConfig(PretrainedConfig): # type: ignore
79
+ model_type: str = "plamo"
80
+
81
+ def __init__(
82
+ self,
83
+ vocab_size: int = 32000,
84
+ hidden_size: int = 4096,
85
+ intermediate_size: int = 13312,
86
+ num_hidden_layers: int = 32,
87
+ num_attention_heads: int = 32,
88
+ num_key_value_heads: int = 4,
89
+ hidden_size_per_head: int = 128,
90
+ max_position_embeddings: int = 2048,
91
+ initializer_range: float = 0.02,
92
+ rms_norm_eps: float = 1e-6,
93
+ use_cache: bool = True,
94
+ tokenizer_class: str = "PlamoTokenizer",
95
+ pad_token_id: Optional[int] = None,
96
+ bos_token_id: int = 1,
97
+ eos_token_id: int = 2,
98
+ tie_word_embeddings: bool = False,
99
+ n_expert: Optional[int] = None,
100
+ k_expert: Optional[int] = None,
101
+ expert_dropout: float = 0.0,
102
+ capacity_factor: float = 1.0,
103
+ group_size: int = 1024,
104
+ sparse_step: Optional[int] = None,
105
+ sparse_intermediate_size: Optional[int] = None,
106
+ shared_intermediate_size: Optional[int] = None,
107
+ linear_type: LinearType = LinearType.Normal,
108
+ fp8_accum_dtype: Optional[str] = None,
109
+ eval_attention_n_bit: Optional[int] = None,
110
+ eval_mlp_n_bit: Optional[int] = None,
111
+ eval_offload_moe: bool = False,
112
+ attention_dropout: float = 0.0,
113
+ **kwargs: Any,
114
+ ) -> None:
115
+ self.vocab_size = vocab_size
116
+ self.max_position_embeddings = max_position_embeddings
117
+ self.hidden_size = hidden_size
118
+ self.intermediate_size = intermediate_size
119
+ self.num_hidden_layers = num_hidden_layers
120
+ self.num_attention_heads = num_attention_heads
121
+ self.hidden_size_per_head = hidden_size_per_head
122
+
123
+ self.initializer_range = initializer_range
124
+ self.rms_norm_eps = rms_norm_eps
125
+ self.use_cache = use_cache
126
+
127
+ self.num_key_value_heads = num_key_value_heads
128
+
129
+ self.n_expert = n_expert
130
+ self.k_expert = k_expert
131
+ self.sparse_intermediate_size = sparse_intermediate_size
132
+ self.shared_intermediate_size = shared_intermediate_size
133
+ self.expert_dropout = expert_dropout
134
+ self.capacity_factor = capacity_factor
135
+ self.group_size = group_size
136
+ self.sparse_step = sparse_step
137
+
138
+ self.linear_type = linear_type
139
+ self.fp8_accum_dtype = fp8_accum_dtype
140
+
141
+ self.eval_attention_n_bit = eval_attention_n_bit
142
+ self.eval_mlp_n_bit = eval_mlp_n_bit
143
+ self.eval_offload_moe = eval_offload_moe
144
+
145
+ self.attention_dropout = attention_dropout
146
+
147
+ super().__init__(
148
+ tokenizer_class=tokenizer_class,
149
+ pad_token_id=pad_token_id,
150
+ bos_token_id=bos_token_id,
151
+ eos_token_id=eos_token_id,
152
+ tie_word_embeddings=tie_word_embeddings,
153
+ **kwargs,
154
+ )
155
+
156
+
157
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
158
+ def _make_causal_mask(
159
+ input_ids_shape: Tuple[int, int],
160
+ dtype: torch.dtype,
161
+ device: torch.device,
162
+ past_key_values_length: int = 0,
163
+ ) -> torch.Tensor:
164
+ """
165
+ Make causal mask used for bi-directional self-attention.
166
+ """
167
+ bsz, tgt_len = input_ids_shape
168
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
169
+ mask_cond = torch.arange(mask.size(-1), device=device)
170
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
171
+ mask = mask.to(dtype)
172
+
173
+ if past_key_values_length > 0:
174
+ mask = torch.cat(
175
+ [
176
+ torch.zeros(
177
+ tgt_len, past_key_values_length, dtype=dtype, device=device
178
+ ),
179
+ mask,
180
+ ],
181
+ dim=-1,
182
+ )
183
+ return mask[None, None, :, :].expand(
184
+ bsz, 1, tgt_len, tgt_len + past_key_values_length
185
+ )
186
+
187
+
188
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
189
+ def _expand_mask(
190
+ mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
191
+ ) -> torch.Tensor:
192
+ """
193
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
194
+ """
195
+ bsz, src_len = mask.size()
196
+ tgt_len = tgt_len if tgt_len is not None else src_len
197
+
198
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
199
+
200
+ inverted_mask = 1.0 - expanded_mask
201
+
202
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) # type: ignore
203
+
204
+
205
+ class RotaryEmbedding(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ dim: int,
209
+ max_position_embeddings: int = 2048,
210
+ base: int = 10000,
211
+ device: Optional[torch.device] = None,
212
+ ) -> None:
213
+ super().__init__()
214
+
215
+ self.dim = dim
216
+ self.max_position_embeddings = max_position_embeddings
217
+ self.base = base
218
+ inv_freq = 1.0 / (
219
+ self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
220
+ )
221
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
222
+
223
+ # Build here to make `torch.jit.trace` work.
224
+ self._set_cos_sin_cache(
225
+ seq_len=max_position_embeddings,
226
+ device=self.inv_freq.device,
227
+ dtype=torch.get_default_dtype(),
228
+ )
229
+
230
+ def _set_cos_sin_cache(self, seq_len: int, device: Any, dtype: Any) -> None:
231
+ self.max_seq_len_cached = seq_len
232
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) # type: ignore
233
+
234
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
235
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
236
+ emb = torch.cat((freqs, freqs), dim=-1)
237
+ self.register_buffer(
238
+ "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False
239
+ )
240
+ self.register_buffer(
241
+ "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False
242
+ )
243
+
244
+ def forward(
245
+ self, x: torch.Tensor, seq_len: int
246
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
247
+ # x: [bs, num_attention_heads, seq_len, head_size]
248
+ if seq_len > self.max_seq_len_cached:
249
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
250
+
251
+ return (
252
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), # type: ignore
253
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), # type: ignore
254
+ )
255
+
256
+
257
+ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
258
+ """Rotates half the hidden dims of the input."""
259
+ x1 = x[..., : x.shape[-1] // 2]
260
+ x2 = x[..., x.shape[-1] // 2 :]
261
+ return torch.cat((-x2, x1), dim=-1)
262
+
263
+
264
+ def _rotary_pos_emb(
265
+ x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, position_ids: torch.Tensor
266
+ ) -> torch.Tensor:
267
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
268
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
269
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
270
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
271
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
272
+ x_embed = (x * cos) + (_rotate_half(x) * sin)
273
+ return x_embed
274
+
275
+
276
+ def _rms_norm(
277
+ hidden_states: torch.Tensor, weight: Optional[torch.Tensor], eps: float
278
+ ) -> torch.Tensor:
279
+ input_dtype = hidden_states.dtype
280
+ hidden_states = hidden_states.to(torch.float32)
281
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
282
+ hidden_states = hidden_states * torch.rsqrt(variance + eps)
283
+ hidden_states = hidden_states.to(input_dtype)
284
+ if weight is not None:
285
+ hidden_states = weight * hidden_states
286
+ return hidden_states
287
+
288
+
289
+ class RMSNorm(nn.Module):
290
+ def __init__(
291
+ self,
292
+ hidden_size: int,
293
+ eps: float = 1e-6,
294
+ device: Optional[Union[torch.device, str]] = None,
295
+ ) -> None:
296
+ super().__init__()
297
+ self.weight = nn.Parameter(torch.ones(hidden_size, device=device))
298
+ self.variance_epsilon = eps
299
+
300
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
301
+ return _rms_norm(hidden_states, self.weight, self.variance_epsilon)
302
+
303
+
304
+ class Attention(torch.nn.Module):
305
+ def __init__(self, config: PlamoConfig) -> None:
306
+ super().__init__()
307
+ self.config = config
308
+ self.hidden_size = config.hidden_size
309
+ head_dim = config.hidden_size_per_head
310
+ self.max_position_embeddings = config.max_position_embeddings
311
+
312
+ self.q_num_heads = config.num_attention_heads
313
+ self.qk_dim = self.v_dim = head_dim
314
+ self.k_num_heads = self.v_num_heads = config.num_key_value_heads
315
+ assert self.q_num_heads % self.k_num_heads == 0
316
+ self.n_group = self.q_num_heads // self.k_num_heads
317
+
318
+ self.q_proj_dim = self.q_num_heads * self.qk_dim
319
+ self.k_proj_dim = self.k_num_heads * self.qk_dim
320
+ self.v_proj_dim = self.k_num_heads * self.v_dim
321
+ self.qkv_proj = nn.Linear(
322
+ self.hidden_size,
323
+ self.q_proj_dim + self.k_proj_dim + self.v_proj_dim,
324
+ bias=False,
325
+ )
326
+ self.o_proj = nn.Linear(
327
+ self.q_num_heads * self.v_dim, self.hidden_size, bias=False
328
+ )
329
+ self.rotary_emb = RotaryEmbedding(
330
+ self.qk_dim, max_position_embeddings=self.max_position_embeddings
331
+ )
332
+
333
+ self.q_weight = torch.nn.Parameter(torch.ones((self.q_num_heads, self.qk_dim)))
334
+ self.k_weight = torch.nn.Parameter(torch.ones((self.k_num_heads, self.qk_dim)))
335
+ self.is_causal = True
336
+ self.attention_dropout = config.attention_dropout
337
+
338
+ def forward(
339
+ self,
340
+ hidden_states: torch.Tensor,
341
+ attention_mask: Optional[torch.Tensor] = None,
342
+ position_ids: Optional[torch.Tensor] = None,
343
+ past_key_value: Optional[PlamoLayerCache] = None,
344
+ output_attentions: bool = False,
345
+ use_cache: bool = False,
346
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[PlamoLayerCache]]:
347
+ bsz, q_len, _ = hidden_states.size()
348
+
349
+ qkv = self.qkv_proj(hidden_states)
350
+ query_states, key_states, value_states = torch.split(
351
+ qkv, [self.q_proj_dim, self.k_proj_dim, self.v_proj_dim], dim=-1
352
+ )
353
+ query_states = query_states.view(
354
+ bsz, q_len, self.q_num_heads, self.qk_dim
355
+ ).transpose(1, 2)
356
+ key_states = key_states.view(
357
+ bsz, q_len, self.k_num_heads, self.qk_dim
358
+ ).transpose(1, 2)
359
+ value_states = value_states.view(
360
+ bsz, q_len, self.v_num_heads, self.v_dim
361
+ ).transpose(1, 2)
362
+
363
+ attn_dtype = query_states.dtype
364
+
365
+ query_states = (
366
+ _rms_norm(query_states, None, 1e-6) * self.q_weight[None, :, None]
367
+ )
368
+ key_states = _rms_norm(key_states, None, 1e-6) * self.k_weight[None, :, None]
369
+
370
+ if use_cache and past_key_value is None:
371
+ bsz, nhead_k, _, c_k = key_states.shape
372
+ _, nhead_v, _, c_v = value_states.shape
373
+ past_key_value = PlamoAttentionCache(
374
+ torch.zeros(
375
+ (bsz, nhead_k, 0, c_k),
376
+ dtype=key_states.dtype,
377
+ device=key_states.device,
378
+ ),
379
+ torch.zeros(
380
+ (bsz, nhead_v, 0, c_v),
381
+ dtype=value_states.dtype,
382
+ device=value_states.device,
383
+ ),
384
+ )
385
+
386
+ kv_seq_len = key_states.shape[-2]
387
+ if past_key_value is not None:
388
+ kv_seq_len += past_key_value.sequence_length()
389
+
390
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
391
+ assert position_ids is not None
392
+ query_states = _rotary_pos_emb(query_states, cos, sin, position_ids)
393
+ key_states = _rotary_pos_emb(key_states, cos, sin, position_ids)
394
+ # [bsz, nh, t, hd]
395
+
396
+ if past_key_value is not None:
397
+ # reuse k, v, self_attention
398
+ past_key_value.append_cache(key_states, value_states)
399
+ key_states = past_key_value.key
400
+ value_states = past_key_value.value
401
+
402
+ def _expand_kv(t: torch.Tensor, repeat: int, target: int) -> torch.Tensor:
403
+ t = torch.repeat_interleave(t, repeat, dim=1)
404
+ return t[:, :target]
405
+
406
+ # expand shared kv
407
+ assert self.k_num_heads == self.v_num_heads
408
+ key_states = _expand_kv(key_states, self.n_group, self.q_num_heads)
409
+ value_states = _expand_kv(value_states, self.n_group, self.q_num_heads)
410
+
411
+ query_states = query_states.to(attn_dtype)
412
+ key_states = key_states.to(attn_dtype)
413
+ value_states = value_states.to(attn_dtype)
414
+
415
+ if attention_mask is not None and attention_mask.dtype != torch.bool:
416
+ attention_mask = attention_mask.to(attn_dtype)
417
+
418
+ attn_output = F.scaled_dot_product_attention(
419
+ query_states,
420
+ key_states,
421
+ value_states,
422
+ attn_mask=attention_mask,
423
+ is_causal=self.is_causal,
424
+ dropout_p=self.attention_dropout if self.training else 0.0,
425
+ )
426
+ attn_output = attn_output.transpose(1, 2)
427
+
428
+ attn_output = attn_output.reshape(bsz, q_len, self.q_num_heads * self.v_dim)
429
+ attn_output = self.o_proj(attn_output)
430
+
431
+ if not output_attentions:
432
+ attn_weights = None
433
+
434
+ return attn_output, attn_weights, past_key_value
435
+
436
+
437
+ class DenseMLP(nn.Module):
438
+ def __init__(self, config: PlamoConfig) -> None:
439
+ super().__init__()
440
+ self.config = config
441
+ self.hidden_size = config.hidden_size
442
+ self.intermediate_size = config.intermediate_size
443
+ self.gate_up_proj = torch.nn.Linear(
444
+ self.hidden_size, self.intermediate_size * 2, bias=False
445
+ )
446
+ self.down_proj = torch.nn.Linear(
447
+ self.intermediate_size, self.hidden_size, bias=False
448
+ )
449
+
450
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
451
+ h = self.gate_up_proj(x)
452
+ h = _swiglu(h)
453
+ return self.down_proj(h) # type: ignore
454
+
455
+
456
+ def MLP(config: PlamoConfig, is_sparse: bool) -> torch.nn.Module:
457
+ return DenseMLP(config)
458
+
459
+
460
+ class PlamoDecoderLayer(torch.nn.Module):
461
+ def __init__(self, config: PlamoConfig, is_sparse: bool) -> None:
462
+ super().__init__()
463
+ self.config = config
464
+ self.hidden_size = config.hidden_size
465
+ self.self_attn = Attention(config)
466
+ self.mlp = MLP(config, is_sparse=is_sparse)
467
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
468
+ self.norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
469
+
470
+ def forward(
471
+ self,
472
+ hidden_states: torch.Tensor,
473
+ attention_mask: Optional[torch.Tensor] = None,
474
+ position_ids: Optional[torch.LongTensor] = None,
475
+ past_key_value: Optional[PlamoLayerCache] = None,
476
+ output_attentions: Optional[bool] = False,
477
+ use_cache: Optional[bool] = False,
478
+ ) -> Tuple[Any, ...]:
479
+ # from LlamaDecoder
480
+ residual = hidden_states
481
+ hidden_states = self.norm(hidden_states)
482
+
483
+ # Self Attention
484
+ hidden_states_sa, self_attn_weights, present_key_value = self.self_attn(
485
+ hidden_states=hidden_states,
486
+ attention_mask=attention_mask,
487
+ position_ids=position_ids,
488
+ past_key_value=past_key_value,
489
+ output_attentions=output_attentions,
490
+ use_cache=use_cache,
491
+ )
492
+
493
+ hidden_states = residual + hidden_states_sa
494
+
495
+ residual = hidden_states
496
+ hidden_states = self.norm2(hidden_states)
497
+
498
+ # Fully Connected
499
+ hidden_states_mlp = self.mlp(hidden_states)
500
+
501
+ # Residual
502
+ hidden_states = residual + hidden_states_mlp
503
+
504
+ outputs: Any = (hidden_states,)
505
+
506
+ if output_attentions:
507
+ outputs += (self_attn_weights,)
508
+
509
+ if use_cache:
510
+ outputs += (present_key_value,)
511
+
512
+ return outputs # type: ignore
513
+
514
+
515
+ def is_sparse(config: PlamoConfig, i: int) -> bool:
516
+ if config.sparse_step is None:
517
+ return False
518
+ if config.sparse_step == 1:
519
+ return True
520
+ return (i % config.sparse_step) == 1
521
+
522
+
523
+ class PlamoDecoder(torch.nn.Module):
524
+ def __init__(self, config: PlamoConfig) -> None:
525
+ super().__init__()
526
+
527
+ self.layers = torch.nn.ModuleList(
528
+ [
529
+ PlamoDecoderLayer(config, is_sparse=is_sparse(config, i))
530
+ for i in range(config.num_hidden_layers)
531
+ ]
532
+ )
533
+
534
+ def forward(self, x: DecoderInput) -> DecoderOutput:
535
+ all_hidden_states: Optional[Tuple[torch.Tensor, ...]] = (
536
+ () if x.output_hidden_states else None
537
+ )
538
+ all_self_attns: Optional[Tuple[torch.Tensor, ...]] = (
539
+ () if x.output_attentions else None
540
+ )
541
+ next_decoder_cache: Optional[PlamoCache] = [] if x.use_cache else None
542
+ hidden_states = x.hidden_states
543
+ for idx, decoder_layer in enumerate(self.layers):
544
+ if x.output_hidden_states:
545
+ assert all_hidden_states is not None
546
+ all_hidden_states += (hidden_states,)
547
+
548
+ past_key_value = (
549
+ x.past_key_values[idx] if x.past_key_values is not None else None
550
+ )
551
+
552
+ if self.training and x.gradient_checkpointing:
553
+
554
+ def create_custom_forward(module): # type: ignore
555
+ def custom_forward(*inputs): # type: ignore
556
+ # None for past_key_value
557
+ return module(*inputs, x.output_attentions, None)
558
+
559
+ return custom_forward
560
+
561
+ layer_outputs = torch.utils.checkpoint.checkpoint(
562
+ create_custom_forward(decoder_layer), # type: ignore
563
+ hidden_states,
564
+ x.attention_mask,
565
+ x.position_ids,
566
+ None,
567
+ use_reentrant=False,
568
+ )
569
+ else:
570
+ layer_outputs = decoder_layer(
571
+ hidden_states,
572
+ attention_mask=x.attention_mask,
573
+ position_ids=x.position_ids,
574
+ past_key_value=past_key_value,
575
+ output_attentions=x.output_attentions,
576
+ use_cache=x.use_cache,
577
+ )
578
+
579
+ hidden_states = layer_outputs[0]
580
+ if x.use_cache:
581
+ cache = layer_outputs[2 if x.output_attentions else 1]
582
+ assert cache is not None
583
+ assert next_decoder_cache is not None
584
+ next_decoder_cache += (cache,)
585
+
586
+ if x.output_attentions:
587
+ assert layer_outputs[1] is not None
588
+ assert all_self_attns is not None
589
+ all_self_attns += (layer_outputs[1],)
590
+ return DecoderOutput(
591
+ hidden_states, all_hidden_states, all_self_attns, next_decoder_cache
592
+ )
593
+
594
+
595
+ class PlamoPreTrainedModel(PreTrainedModel): # type: ignore
596
+ config_class = PlamoConfig
597
+ _no_split_modules: List[str]
598
+ base_model_prefix = "model"
599
+ supports_gradient_checkpointing = True
600
+ _supports_sdpa = True
601
+ _no_split_modules = ["PlamoDecoderLayer"]
602
+ _skip_keys_device_placement = "past_key_values"
603
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
604
+
605
+ def _init_weights(self, module: torch.nn.Module) -> None:
606
+ std = self.config.initializer_range
607
+ if isinstance(module, nn.Linear):
608
+ module.weight.data.normal_(mean=0.0, std=std)
609
+ if module.bias is not None:
610
+ module.bias.data.zero_()
611
+ elif isinstance(module, nn.Embedding):
612
+ module.weight.data.normal_(mean=0.0, std=std)
613
+ if module.padding_idx is not None:
614
+ module.weight.data[module.padding_idx].zero_()
615
+
616
+ def _set_gradient_checkpointing(
617
+ self, module: torch.nn.Module, value: bool = False
618
+ ) -> None:
619
+ module.gradient_checkpointing = value # type: ignore
620
+
621
+
622
+ class PlamoModel(PlamoPreTrainedModel):
623
+ def __init__(self, config: PlamoConfig):
624
+ super().__init__(config)
625
+ assert config.eval_attention_n_bit is None
626
+ assert config.eval_mlp_n_bit is None
627
+ assert not config.eval_offload_moe
628
+
629
+ self.padding_idx = config.pad_token_id
630
+ self.vocab_size = config.vocab_size
631
+
632
+ self.embed_tokens = nn.Embedding(
633
+ config.vocab_size, config.hidden_size, self.padding_idx
634
+ )
635
+ self.layers = PlamoDecoder(config) # type: ignore
636
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
637
+
638
+ self.gradient_checkpointing = False
639
+ # Initialize weights and apply final processing
640
+ self.post_init()
641
+
642
+ def get_input_embeddings(self) -> torch.nn.Embedding:
643
+ return self.embed_tokens
644
+
645
+ def set_input_embeddings(self, value: torch.nn.Embedding) -> None:
646
+ self.embed_tokens = value
647
+
648
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
649
+ def _prepare_decoder_attention_mask(
650
+ self,
651
+ attention_mask: torch.Tensor,
652
+ input_shape: Tuple[int, int],
653
+ inputs_embeds: Optional[torch.Tensor],
654
+ past_key_values_length: int,
655
+ ) -> Optional[torch.Tensor]:
656
+ # create causal mask
657
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
658
+ combined_attention_mask: Optional[torch.Tensor] = None
659
+ if input_shape[-1] > 1:
660
+ assert inputs_embeds is not None
661
+ combined_attention_mask = _make_causal_mask(
662
+ input_shape,
663
+ inputs_embeds.dtype,
664
+ device=inputs_embeds.device,
665
+ past_key_values_length=past_key_values_length,
666
+ )
667
+
668
+ if attention_mask is not None:
669
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
670
+ assert inputs_embeds is not None
671
+ expanded_attn_mask = _expand_mask(
672
+ attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
673
+ ).to(inputs_embeds.device)
674
+ combined_attention_mask = (
675
+ expanded_attn_mask
676
+ if combined_attention_mask is None
677
+ else expanded_attn_mask + combined_attention_mask
678
+ )
679
+
680
+ return combined_attention_mask
681
+
682
+ def forward(
683
+ self,
684
+ input_ids: Optional[torch.LongTensor] = None,
685
+ attention_mask: Optional[torch.Tensor] = None,
686
+ position_ids: Optional[torch.Tensor] = None,
687
+ past_key_values: Optional[PlamoCache] = None,
688
+ inputs_embeds: Optional[torch.Tensor] = None,
689
+ use_cache: Optional[bool] = None,
690
+ output_attentions: Optional[bool] = None,
691
+ output_hidden_states: Optional[bool] = None,
692
+ return_dict: Optional[bool] = None,
693
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
694
+ assert input_ids is not None
695
+ output_attentions = (
696
+ output_attentions
697
+ if output_attentions is not None
698
+ else self.config.output_attentions
699
+ )
700
+ output_hidden_states = (
701
+ output_hidden_states
702
+ if output_hidden_states is not None
703
+ else self.config.output_hidden_states
704
+ )
705
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
706
+
707
+ return_dict = (
708
+ return_dict if return_dict is not None else self.config.use_return_dict
709
+ )
710
+
711
+ # retrieve input_ids and inputs_embeds
712
+ if input_ids is not None and inputs_embeds is not None:
713
+ raise ValueError(
714
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
715
+ )
716
+ elif input_ids is not None:
717
+ batch_size, seq_length = input_ids.shape
718
+ else:
719
+ raise ValueError(
720
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
721
+ )
722
+
723
+ seq_length_with_past = seq_length
724
+ past_key_values_length = 0
725
+
726
+ if past_key_values is not None:
727
+ past_key_values_length = past_key_values[0].sequence_length()
728
+ seq_length_with_past = seq_length_with_past + past_key_values_length
729
+
730
+ if position_ids is None:
731
+ device = input_ids.device
732
+ position_ids = torch.arange(
733
+ past_key_values_length,
734
+ seq_length + past_key_values_length,
735
+ dtype=torch.long,
736
+ device=device,
737
+ )
738
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
739
+ else:
740
+ position_ids = position_ids.view(-1, seq_length).long()
741
+
742
+ if inputs_embeds is None:
743
+ inputs_embeds = self.embed_tokens(input_ids)
744
+ # embed positions
745
+ if (
746
+ attention_mask is not None
747
+ or not self.training
748
+ or past_key_values is not None
749
+ ):
750
+ if attention_mask is None:
751
+ attention_mask = torch.ones(
752
+ (batch_size, seq_length_with_past),
753
+ dtype=torch.bool,
754
+ device=inputs_embeds.device,
755
+ )
756
+ # attention_mask = self._prepare_decoder_attention_mask(
757
+ # attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
758
+ # )
759
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
760
+ attention_mask,
761
+ (batch_size, seq_length),
762
+ inputs_embeds,
763
+ past_key_values_length,
764
+ )
765
+
766
+ hidden_states = inputs_embeds
767
+
768
+ if self.gradient_checkpointing and self.training:
769
+ if use_cache:
770
+ use_cache = False
771
+
772
+ # decoder layers
773
+ out = self.layers(
774
+ DecoderInput(
775
+ hidden_states,
776
+ position_ids,
777
+ attention_mask,
778
+ past_key_values,
779
+ output_hidden_states,
780
+ output_attentions,
781
+ use_cache,
782
+ self.gradient_checkpointing,
783
+ )
784
+ )
785
+ assert isinstance(out, DecoderOutput)
786
+ hidden_states = out.hidden_states
787
+ all_hidden_states = out.all_hidden_states
788
+ all_self_attns = out.all_self_attns
789
+ next_decoder_cache = out.next_decoder_cache
790
+
791
+ hidden_states = self.norm(hidden_states)
792
+
793
+ # add hidden states from the last decoder layer
794
+ if output_hidden_states:
795
+ assert all_hidden_states is not None
796
+ all_hidden_states += (hidden_states,)
797
+
798
+ next_cache = next_decoder_cache if use_cache else None
799
+ if not return_dict:
800
+ return tuple(
801
+ v
802
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
803
+ if v is not None
804
+ )
805
+ return BaseModelOutputWithPast(
806
+ last_hidden_state=hidden_states,
807
+ past_key_values=next_cache,
808
+ hidden_states=all_hidden_states,
809
+ attentions=all_self_attns,
810
+ )
811
+
812
+
813
+ class ModifiedAttention(Attention):
814
+ def __init__(self, config: PlamoConfig, **kwargs):
815
+ super().__init__(config, **kwargs)
816
+ self.is_causal = False
817
+
818
+
819
+ PLAMO_ATTENTION_CLASSES = {
820
+ "sdpa": ModifiedAttention,
821
+ }
822
+
823
+
824
+ class ModifiedPlamoDecoderLayer(PlamoDecoderLayer):
825
+ def __init__(self, config: PlamoConfig, is_sparse: bool):
826
+ nn.Module.__init__(self)
827
+ self.config = config
828
+ self.hidden_size = config.hidden_size
829
+
830
+ self.self_attn = PLAMO_ATTENTION_CLASSES[config._attn_implementation](
831
+ config=config
832
+ )
833
+
834
+ self.mlp = MLP(config, is_sparse=is_sparse)
835
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
836
+ self.norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
837
+
838
+
839
+ class ModifiedPlamoDecoder(PlamoDecoder):
840
+ def __init__(self, config: PlamoConfig) -> None:
841
+ nn.Module.__init__(self)
842
+ self.layers = nn.ModuleList(
843
+ [
844
+ ModifiedPlamoDecoderLayer(
845
+ config, is_sparse=is_sparse(config, layer_idx)
846
+ )
847
+ for layer_idx in range(config.num_hidden_layers)
848
+ ]
849
+ )
850
+
851
+
852
+ class PlamoBiModel(PlamoModel):
853
+ _no_split_modules = ["ModifiedPlamoDecoderLayer"]
854
+
855
+ def __init__(self, config: PlamoConfig):
856
+ PlamoPreTrainedModel.__init__(self, config)
857
+ self.padding_idx = config.pad_token_id
858
+ self.vocab_size = config.vocab_size
859
+
860
+ self.embed_tokens = nn.Embedding(
861
+ config.vocab_size, config.hidden_size, self.padding_idx
862
+ )
863
+
864
+ self.layers = ModifiedPlamoDecoder(config)
865
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
866
+ self.gradient_checkpointing = False
867
+ self._attn_implementation = config._attn_implementation
868
+ self.post_init()
869
+
870
+ def forward(
871
+ self,
872
+ input_ids: Optional[torch.LongTensor] = None,
873
+ attention_mask: Optional[torch.Tensor] = None,
874
+ position_ids: Optional[torch.Tensor] = None,
875
+ past_key_values: Optional[PlamoCache] = None,
876
+ inputs_embeds: Optional[torch.Tensor] = None,
877
+ use_cache: Optional[bool] = None,
878
+ output_attentions: Optional[bool] = None,
879
+ output_hidden_states: Optional[bool] = None,
880
+ return_dict: Optional[bool] = None,
881
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
882
+ assert input_ids is not None
883
+ output_attentions = (
884
+ output_attentions
885
+ if output_attentions is not None
886
+ else self.config.output_attentions
887
+ )
888
+ output_hidden_states = (
889
+ output_hidden_states
890
+ if output_hidden_states is not None
891
+ else self.config.output_hidden_states
892
+ )
893
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
894
+
895
+ return_dict = (
896
+ return_dict if return_dict is not None else self.config.use_return_dict
897
+ )
898
+
899
+ if input_ids is not None and inputs_embeds is not None:
900
+ raise ValueError(
901
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
902
+ )
903
+ elif input_ids is not None:
904
+ batch_size, seq_length = input_ids.shape
905
+ else:
906
+ raise ValueError(
907
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
908
+ )
909
+
910
+ seq_length_with_past = seq_length
911
+ past_key_values_length = 0
912
+
913
+ if past_key_values is not None:
914
+ past_key_values_length = past_key_values[0].sequence_length()
915
+ seq_length_with_past = seq_length_with_past + past_key_values_length
916
+
917
+ if position_ids is None:
918
+ device = input_ids.device
919
+ position_ids = torch.arange(
920
+ past_key_values_length,
921
+ seq_length + past_key_values_length,
922
+ dtype=torch.long,
923
+ device=device,
924
+ )
925
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
926
+ else:
927
+ position_ids = position_ids.view(-1, seq_length).long()
928
+
929
+ if inputs_embeds is None:
930
+ inputs_embeds = self.embed_tokens(input_ids)
931
+
932
+ if self._attn_implementation == "sdpa" and not output_attentions:
933
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
934
+ attention_mask,
935
+ (batch_size, seq_length),
936
+ inputs_embeds,
937
+ past_key_values_length,
938
+ )
939
+ else:
940
+ attention_mask = _prepare_4d_causal_attention_mask(
941
+ attention_mask,
942
+ (batch_size, seq_length),
943
+ inputs_embeds,
944
+ past_key_values_length,
945
+ sliding_window=self.config.sliding_window,
946
+ )
947
+ hidden_states = inputs_embeds
948
+
949
+ if self.gradient_checkpointing and self.training:
950
+ if use_cache:
951
+ use_cache = False
952
+
953
+ out = self.layers(
954
+ DecoderInput(
955
+ hidden_states,
956
+ position_ids,
957
+ attention_mask,
958
+ past_key_values,
959
+ output_hidden_states,
960
+ output_attentions,
961
+ use_cache,
962
+ self.gradient_checkpointing,
963
+ )
964
+ )
965
+
966
+ assert isinstance(out, DecoderOutput)
967
+ hidden_states = out.hidden_states
968
+ all_hidden_states = out.all_hidden_states
969
+ all_self_attns = out.all_self_attns
970
+ next_decoder_cache = out.next_decoder_cache
971
+
972
+ hidden_states = self.norm(hidden_states)
973
+
974
+ if output_hidden_states:
975
+ assert all_hidden_states is not None
976
+ all_hidden_states += (hidden_states,)
977
+
978
+ next_cache = next_decoder_cache if use_cache else None
979
+ if not return_dict:
980
+ return tuple(
981
+ v
982
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
983
+ if v is not None
984
+ )
985
+ return BaseModelOutputWithPast(
986
+ last_hidden_state=hidden_states,
987
+ past_key_values=next_cache,
988
+ hidden_states=all_hidden_states,
989
+ attentions=all_self_attns,
990
+ )
991
+
992
+ def _tokenize(
993
+ self,
994
+ texts: List[str],
995
+ tokenizer: AutoTokenizer,
996
+ add_special_tokens: bool = True,
997
+ ) -> BatchEncoding:
998
+ tokenizer.pad_token = tokenizer.eos_token
999
+ tokenizer.padding_side = "left"
1000
+
1001
+ return tokenizer(
1002
+ texts,
1003
+ return_tensors="pt",
1004
+ truncation=True,
1005
+ padding=True,
1006
+ max_length=self.config.max_length,
1007
+ add_special_tokens=add_special_tokens,
1008
+ )
1009
+
1010
+ def _tokenize_with_instruction(
1011
+ self,
1012
+ sentences: List[str],
1013
+ tokenizer: AutoTokenizer,
1014
+ instruction: str,
1015
+ add_special_tokens: bool = True,
1016
+ ) -> Tuple[BatchEncoding, torch.Tensor]:
1017
+ sentence_features = self._tokenize(
1018
+ sentences, tokenizer, add_special_tokens=False
1019
+ )
1020
+
1021
+ sentences_with_instruction = [instruction + sentence for sentence in sentences]
1022
+ sentence_features_with_instruction = self._tokenize(
1023
+ sentences_with_instruction, tokenizer, add_special_tokens
1024
+ )
1025
+
1026
+ embed_mask_list = []
1027
+ for i in range(len(sentences)):
1028
+ n_tokens = int(sentence_features["attention_mask"][i].sum().item())
1029
+ mask = torch.zeros_like(
1030
+ sentence_features_with_instruction["attention_mask"][i]
1031
+ )
1032
+ if n_tokens > 0:
1033
+ mask[-n_tokens:] = torch.ones(n_tokens, dtype=mask.dtype)
1034
+ embed_mask_list.append(mask.unsqueeze(0))
1035
+ embed_mask = torch.cat(embed_mask_list, dim=0)
1036
+
1037
+ return sentence_features_with_instruction, embed_mask
1038
+
1039
+ def _mean_pooling(
1040
+ self,
1041
+ sentence_features: BatchEncoding,
1042
+ last_hidden_state: torch.Tensor,
1043
+ embed_mask: Optional[torch.Tensor] = None,
1044
+ ) -> torch.Tensor:
1045
+ if embed_mask is None:
1046
+ mask = sentence_features["attention_mask"]
1047
+ else:
1048
+ mask = embed_mask
1049
+ sum_hidden = (
1050
+ last_hidden_state * mask.unsqueeze(-1).type_as(last_hidden_state)
1051
+ ).sum(dim=1)
1052
+ lengths = mask.sum(dim=1, keepdim=True).clamp(min=1)
1053
+ return sum_hidden / lengths
1054
+
1055
+ def encode(
1056
+ self,
1057
+ sentences: Union[str, List[str]],
1058
+ tokenizer: AutoTokenizer,
1059
+ instruction: str,
1060
+ ) -> torch.Tensor:
1061
+ if isinstance(sentences, str):
1062
+ sentences = [sentences]
1063
+
1064
+ sentence_features, embed_mask = self._tokenize_with_instruction(
1065
+ sentences,
1066
+ tokenizer,
1067
+ instruction=instruction,
1068
+ )
1069
+ sentence_features = sentence_features.to(self.device)
1070
+ embed_mask = embed_mask.to(self.device)
1071
+
1072
+ reps = self(**sentence_features)
1073
+ return self._mean_pooling(sentence_features, reps.last_hidden_state, embed_mask)
1074
+
1075
+ def encode_document(
1076
+ self,
1077
+ sentences: Union[str, List[str]],
1078
+ tokenizer: AutoTokenizer,
1079
+ ) -> torch.Tensor:
1080
+ default_document_instruction = ""
1081
+ return self.encode(sentences, tokenizer, default_document_instruction)
1082
+
1083
+ def encode_query(
1084
+ self,
1085
+ sentences: Union[str, List[str]],
1086
+ tokenizer: AutoTokenizer,
1087
+ ) -> torch.Tensor:
1088
+ default_query_instruction = "次の文章に対して、関連する文章を検索してください: "
1089
+ return self.encode(sentences, tokenizer, default_query_instruction)
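Usage note (not part of the committed files): a minimal sketch of how the bi-directional encoder above might be called. The repository id and the AutoModel mapping are assumptions for illustration, not taken from this commit; trust_remote_code=True is assumed so that PlamoBiModel and PlamoTokenizer are resolved from the repository code. encode_query prepends the Japanese retrieval instruction and, via embed_mask, mean-pools only over the original sentence tokens, while encode_document encodes the text with an empty instruction.

    # Hypothetical usage sketch; the repo id is a placeholder and the AutoModel mapping is assumed.
    import torch
    import torch.nn.functional as F
    from transformers import AutoModel, AutoTokenizer

    repo = "pfnet/plamo-embedding-1b"  # placeholder repository id
    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    model = AutoModel.from_pretrained(repo, trust_remote_code=True).eval()

    query = "PLaMoのトークナイザについて教えてください"
    docs = ["PLaMoのトークナイザはsentencepieceのtokenizer.modelを読み込みます。"]

    with torch.inference_mode():
        q_emb = model.encode_query(query, tokenizer)    # (1, hidden_size), instruction prepended internally
        d_emb = model.encode_document(docs, tokenizer)  # (len(docs), hidden_size), no instruction
        scores = F.cosine_similarity(q_emb, d_emb)      # one similarity score per document
    print(scores)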
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|startoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|startoftext|>",
17
+ "unk_token": {
18
+ "content": "<|unknown|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
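For reference, the map above assigns the same <|startoftext|> token to bos, eos, and pad. A hedged sanity check (the repository id is a placeholder and trust_remote_code=True is assumed):

    # Hypothetical check of the special token mapping; repo id is a placeholder.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("pfnet/plamo-embedding-1b", trust_remote_code=True)
    assert tok.bos_token == tok.eos_token == tok.pad_token == "<|startoftext|>"
    assert tok.unk_token == "<|unknown|>"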
tokenization_plamo.py ADDED
@@ -0,0 +1,191 @@
1
+ import os
2
+ import warnings
3
+ from shutil import copyfile
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import sentencepiece as spm
7
+ from transformers.tokenization_utils import PreTrainedTokenizer
8
+ from transformers.utils import logging
9
+
10
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ def _get_tokenizer_threads(default: int = -1) -> int:
15
+ env_names = [
16
+ "PLAMO_TOKENIZER_NUM_THREADS",
17
+ "RAYON_NUM_THREADS",
18
+ ]
19
+ for name in env_names:
20
+ v = os.environ.get(name, None)
21
+ if v:
22
+ try:
23
+ return int(v)
24
+ except ValueError:
25
+ warnings.warn(
26
+ f"Value assigned to env `{name}` is not an integer. Current value is {v}",
27
+ category=RuntimeWarning,
28
+ stacklevel=2,
29
+ )
30
+ return default
31
+
32
+
33
+ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
34
+ vocab_files_names = VOCAB_FILES_NAMES
35
+ model_input_names = ["input_ids", "attention_mask"]
36
+
37
+ def __init__(
38
+ self,
39
+ vocab_file: str,
40
+ unk_token: str = "<unk>",
41
+ bos_token: str = "<s>",
42
+ eos_token: str = "</s>",
43
+ pad_token: str = "<pad>",
44
+ cls_token: str = "<cls>",
45
+ sep_token: str = "<sep>",
46
+ mask_token: str = "<mask>",
47
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
48
+ clean_up_tokenization_spaces: bool = False,
49
+ num_threads: int = -1,
50
+ **kwargs: Any,
51
+ ) -> None:
52
+ """Tokenizer for PLaMo.
53
+
54
+ Args:
55
+ vocab_file (str): Vocabulary file path.
56
+ unk_token (str): Unknown token.
57
+ bos_token (str): Beginning of sentence token.
58
+ eos_token (str): End of sentence token.
59
+ pad_token (str): Padding token.
60
+ cls_token (str):
61
+ Classification token, to extract a summary of an input sequence leveraging self-attention along the
62
+ full depth of the model.
63
+ sep_token (str): Separation token, to separate context and query in an input sequence.
64
+ mask_token (str): Mask token, to use when training a model with masked-language modeling.
65
+ sp_model_kwargs (Dict[str, Any] or None): kwargs for the sentencepiece model.
66
+ clean_up_tokenization_spaces (bool): Whether or not to clean up the tokenization spaces.
67
+ num_threads (int):
68
+ Number of threads. This value will be ignored if one of `PLAMO_TOKENIZER_NUM_THREADS` or
69
+ `RAYON_NUM_THREADS` is set as an environment variable.
70
+ """
71
+ if "add_bos_token" not in kwargs:
72
+ kwargs["add_bos_token"] = False
73
+ if "add_eos_token" not in kwargs:
74
+ kwargs["add_eos_token"] = False
75
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
76
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
77
+ self.sp_model.Init(model_file=vocab_file, num_threads=_get_tokenizer_threads(num_threads))
78
+ self.vocab_file = vocab_file
79
+ self.add_bos_token = kwargs["add_bos_token"]
80
+ self.add_eos_token = kwargs["add_eos_token"]
81
+
82
+ super().__init__(
83
+ vocab_file=vocab_file,
84
+ unk_token=unk_token,
85
+ bos_token=bos_token,
86
+ eos_token=eos_token,
87
+ pad_token=pad_token,
88
+ cls_token=cls_token,
89
+ sep_token=sep_token,
90
+ mask_token=mask_token,
91
+ sp_model_kwargs=sp_model_kwargs,
92
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
93
+ **kwargs,
94
+ )
95
+
96
+ # the functions below are copied from hf transformers LlamaTokenizer's implementation to fix the behaviour of the tokenizer
97
+ # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/llama/tokenization_llama.py
98
+
99
+ def __getstate__(self) -> dict[str, Any]:
100
+ state = self.__dict__.copy()
101
+ state["sp_model"] = None
102
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
103
+ return state
104
+
105
+ def __setstate__(self, d: dict[str, Any]) -> None:
106
+ self.__dict__ = d
107
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
108
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
109
+
110
+ @property
111
+ def vocab_size(self) -> Any:
112
+ """Returns vocab size"""
113
+ return self.sp_model.get_piece_size()
114
+
115
+ def get_vocab(self) -> dict[str, int]:
116
+ """Returns vocab as a dict"""
117
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
118
+ vocab.update(self.added_tokens_encoder)
119
+ return vocab
120
+
121
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
123
+ """Converts a sequence of tokens (strings) into a single string."""
124
+ current_sub_tokens: List[str] = []
124
+ out_string = ""
125
+ prev_is_special = False
126
+ for i, token in enumerate(tokens):
127
+ # make sure that special tokens are not decoded using sentencepiece model
128
+ if token in self.all_special_tokens:
129
+ if not prev_is_special and i != 0:
130
+ out_string += " "
131
+ out_string += self.sp_model.decode(current_sub_tokens) + token
132
+ prev_is_special = True
133
+ current_sub_tokens = []
134
+ else:
135
+ current_sub_tokens.append(token)
136
+ prev_is_special = False
137
+ out_string += self.sp_model.decode(current_sub_tokens)
138
+ return out_string
139
+
140
+ def _tokenize(self, text: str) -> Any:
141
+ """Returns a tokenized string."""
142
+ return self.sp_model.encode(text, out_type=str)
143
+
144
+ def _convert_token_to_id(self, token: str) -> Any:
145
+ """Converts a token (str) in an id using the vocab."""
146
+ return self.sp_model.piece_to_id(token)
147
+
148
+ def _convert_id_to_token(self, index: int) -> Any:
149
+ """Converts an index (integer) in a token (str) using the vocab."""
150
+ token = self.sp_model.IdToPiece(index)
151
+ return token
152
+
153
+ def build_inputs_with_special_tokens(
154
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
155
+ ) -> List[int]:
156
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
157
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
158
+
159
+ output = bos_token_id + token_ids_0 + eos_token_id
160
+
161
+ if token_ids_1 is not None:
162
+ output = output + bos_token_id + token_ids_1 + eos_token_id
163
+
164
+ return output
165
+
166
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
167
+ """
168
+ Save the vocabulary and special tokens file to a directory.
169
+
170
+ Args:
171
+ save_directory (`str`):
172
+ The directory in which to save the vocabulary.
173
+
174
+ Returns:
175
+ `Tuple(str)`: Paths to the files saved.
176
+ """
177
+ if not os.path.isdir(save_directory):
178
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
179
+ return ("",)
180
+ out_vocab_file = os.path.join(
181
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
182
+ )
183
+
184
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
185
+ copyfile(self.vocab_file, out_vocab_file)
186
+ elif not os.path.isfile(self.vocab_file):
187
+ with open(out_vocab_file, "wb") as fi:
188
+ content_spiece_model = self.sp_model.serialized_model_proto()
189
+ fi.write(content_spiece_model)
190
+
191
+ return (out_vocab_file,)
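A small sketch of using PlamoTokenizer directly rather than through AutoTokenizer (hedged: the tokenizer.model path is a placeholder and the module is assumed to be importable from the working directory). It illustrates two behaviours visible above: _get_tokenizer_threads lets PLAMO_TOKENIZER_NUM_THREADS or RAYON_NUM_THREADS override the num_threads argument, and add_bos_token/add_eos_token default to False unless passed explicitly (the checked-in tokenizer_config.json turns add_bos_token on).

    # Hypothetical direct construction; the vocab path is a placeholder.
    import os
    os.environ["PLAMO_TOKENIZER_NUM_THREADS"] = "4"   # read by _get_tokenizer_threads at init time

    from tokenization_plamo import PlamoTokenizer

    tok = PlamoTokenizer(vocab_file="tokenizer.model")   # add_bos_token/add_eos_token default to False here
    print(tok.tokenize("埋め込みモデルのテストです。"))       # sentencepiece pieces via _tokenize
    print(tok("埋め込みモデルのテストです。")["input_ids"])   # ids via _convert_token_to_id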
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9603895be773fe5807f5183bf9279da4df3a81ce5941a1a9521e8b496201c69a
3
+ size 805457
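The entry above is a Git LFS pointer, not the sentencepiece model itself; the actual ~805 kB file is resolved on checkout or download. A hedged sketch of retrieving it programmatically (the repository id is a placeholder):

    # Hypothetical download of the file behind the LFS pointer; repo id is a placeholder.
    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(repo_id="pfnet/plamo-embedding-1b", filename="tokenizer.model")
    print(local_path)  # local cache path of the resolved sentencepiece model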
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|unknown|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<|pad|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ }
37
+ },
38
+ "auto_map": {
39
+ "AutoTokenizer": [
40
+ "tokenization_plamo.PlamoTokenizer",
41
+ null
42
+ ]
43
+ },
44
+ "bos_token": "<|startoftext|>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": null,
47
+ "eos_token": "<|startoftext|>",
48
+ "extra_special_tokens": {},
49
+ "local_file_only": true,
50
+ "mask_token": null,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "<|startoftext|>",
53
+ "sep_token": null,
54
+ "sp_model_kwargs": {},
55
+ "tokenizer_class": "PlamoTokenizer",
56
+ "unk_token": "<|unknown|>"
57
+ }
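For context, a hedged sketch of how this config is consumed at load time (the repository id is a placeholder): the auto_map entry points AutoTokenizer at the slow tokenization_plamo.PlamoTokenizer (the null second slot means no fast tokenizer is provided), so trust_remote_code=True is required; add_bos_token is true, and bos/eos/pad all resolve to <|startoftext|> (id 1).

    # Hypothetical load through auto_map; repo id is a placeholder.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "pfnet/plamo-embedding-1b",  # placeholder
        trust_remote_code=True,      # required to import tokenization_plamo.PlamoTokenizer
    )
    print(type(tok).__name__)        # expected: PlamoTokenizer
    ids = tok("テスト")["input_ids"]
    print(ids[0], tok.bos_token_id)  # add_bos_token=true -> first id should be the <|startoftext|> id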