Update configuration_phi3.py
configuration_phi3.py (+14 -7)
```diff
@@ -16,8 +16,9 @@
 """ Phi-3 model configuration"""
 
 
-from …
-from …
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
 
 logger = logging.get_logger(__name__)
 
```
```diff
@@ -72,8 +73,8 @@ class Phi3Config(PretrainedConfig):
             original RoPE embeddings when using long scaling.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        …
-            The epsilon used …
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
```
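The new `rms_norm_eps` entry documents the epsilon used by the model's RMSNorm layers. As a minimal sketch of where that value enters (an illustration, not the library's exact implementation):

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Root-mean-square normalization: rescale each feature vector by 1/RMS(x).
    # `eps` (the documented `rms_norm_eps`) keeps the rsqrt finite when the
    # mean square is close to zero.
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * x * torch.rsqrt(variance + eps)
```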
```diff
@@ -81,11 +82,17 @@ class Phi3Config(PretrainedConfig):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        rope_scaling (`dict`, *optional…
+        rope_scaling (`dict`, *optional*):
             The scaling factor for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
             contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
             the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
             divided by the number of attention heads divided by 2.
+        eos_token_id (`int`, *optional*, defaults to 32000):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 32000):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
 
     Example:
 
```
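To make the documented length rule concrete: with this config's defaults of `hidden_size=3072` and `num_attention_heads=32`, each factor list needs `3072 // 32 // 2 = 48` entries, one per rotary frequency. A hedged usage sketch follows; the factor values are placeholders, not the released model's calibrated factors, and the accepted `rope_scaling` keys can differ across `transformers` versions:

```python
from transformers import Phi3Config

base = Phi3Config()  # defaults: hidden_size=3072, num_attention_heads=32
n_factors = base.hidden_size // base.num_attention_heads // 2  # 96 // 2 = 48

config = Phi3Config(
    rope_scaling={
        "type": "longrope",
        "short_factor": [1.0] * n_factors,  # placeholder per-frequency factors
        "long_factor": [1.5] * n_factors,   # placeholder per-frequency factors
    },
    eos_token_id=32000,
    pad_token_id=32000,
    sliding_window=2047,  # or None to disable sliding-window attention
)
```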
```diff
@@ -178,7 +185,7 @@ class Phi3Config(PretrainedConfig):
 
         short_factor = self.rope_scaling["short_factor"]
         assert isinstance(short_factor, list) and all(
-            …
+            isinstance(x, (int, float)) for x in short_factor
         ), f"RoPE scaling factor must be a list of numbers, got {short_factor}."
         assert (
             len(short_factor) == self.hidden_size // self.num_attention_heads // 2
```
```diff
@@ -186,7 +193,7 @@ class Phi3Config(PretrainedConfig):
 
         long_factor = self.rope_scaling["long_factor"]
         assert isinstance(long_factor, list) and all(
-            …
+            isinstance(x, (int, float)) for x in long_factor
         ), f"RoPE scaling factor must be a list of numbers, got {long_factor}."
         assert (
             len(long_factor) == self.hidden_size // self.num_attention_heads // 2
```
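Both asserts now check element types as well as list length: each factor list must hold one number per rotary frequency, i.e. per pair of head dimensions (`head_dim // 2`). The same checks, pulled out into a standalone helper for illustration (`validate_rope_factor` is a hypothetical name, not part of the file):

```python
def validate_rope_factor(factor, hidden_size: int, num_attention_heads: int) -> None:
    # Same two checks as the asserts above.
    assert isinstance(factor, list) and all(
        isinstance(x, (int, float)) for x in factor
    ), f"RoPE scaling factor must be a list of numbers, got {factor}."
    expected = hidden_size // num_attention_heads // 2
    assert len(factor) == expected, f"Expected {expected} factors, got {len(factor)}."

validate_rope_factor([1.0] * 48, hidden_size=3072, num_attention_heads=32)  # passes
```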