Update configuration_phi3.py
configuration_phi3.py (+14 -7)
```diff
@@ -16,8 +16,9 @@
 """ Phi-3 model configuration"""
 
 
-from …
-from …
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
 
 logger = logging.get_logger(__name__)
 
```
```diff
@@ -72,8 +73,8 @@ class Phi3Config(PretrainedConfig):
             original RoPE embeddings when using long scaling.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        …
-            The epsilon used …
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
```
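The new `rms_norm_eps` entry documents the epsilon used by the model's RMSNorm layers. As a minimal sketch of where that value enters (an illustration, not the library's exact implementation):

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Root-mean-square normalization: rescale each feature vector by 1/RMS(x).
    # `eps` (the documented `rms_norm_eps`) keeps the rsqrt finite when the
    # mean square is close to zero.
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * x * torch.rsqrt(variance + eps)
```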
```diff
@@ -81,11 +82,17 @@ class Phi3Config(PretrainedConfig):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        rope_scaling (`dict`, *optional…
+        rope_scaling (`dict`, *optional*):
             The scaling factor for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
             contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
             the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
             divided by the number of attention heads divided by 2.
+        eos_token_id (`int`, *optional*, defaults to 32000):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 32000):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
 
     Example:
 
```
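To make the documented length rule concrete: with this config's defaults of `hidden_size=3072` and `num_attention_heads=32`, each factor list needs `3072 // 32 // 2 = 48` entries, one per rotary frequency. A hedged usage sketch follows; the factor values are placeholders, not the released model's calibrated factors, and the accepted `rope_scaling` keys can differ across `transformers` versions:

```python
from transformers import Phi3Config

base = Phi3Config()  # defaults: hidden_size=3072, num_attention_heads=32
n_factors = base.hidden_size // base.num_attention_heads // 2  # 96 // 2 = 48

config = Phi3Config(
    rope_scaling={
        "type": "longrope",
        "short_factor": [1.0] * n_factors,  # placeholder per-frequency factors
        "long_factor": [1.5] * n_factors,   # placeholder per-frequency factors
    },
    eos_token_id=32000,
    pad_token_id=32000,
    sliding_window=2047,  # or None to disable sliding-window attention
)
```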
```diff
@@ -178,7 +185,7 @@ class Phi3Config(PretrainedConfig):
 
         short_factor = self.rope_scaling["short_factor"]
         assert isinstance(short_factor, list) and all(
-            …
+            isinstance(x, (int, float)) for x in short_factor
         ), f"RoPE scaling factor must be a list of numbers, got {short_factor}."
         assert (
             len(short_factor) == self.hidden_size // self.num_attention_heads // 2
```
```diff
@@ -186,7 +193,7 @@ class Phi3Config(PretrainedConfig):
 
         long_factor = self.rope_scaling["long_factor"]
         assert isinstance(long_factor, list) and all(
-            …
+            isinstance(x, (int, float)) for x in long_factor
         ), f"RoPE scaling factor must be a list of numbers, got {long_factor}."
         assert (
             len(long_factor) == self.hidden_size // self.num_attention_heads // 2
```
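Both asserts now check element types as well as list length: each factor list must hold one number per rotary frequency, i.e. per pair of head dimensions (`head_dim // 2`). The same checks, pulled out into a standalone helper for illustration (`validate_rope_factor` is a hypothetical name, not part of the file):

```python
def validate_rope_factor(factor, hidden_size: int, num_attention_heads: int) -> None:
    # Same two checks as the asserts above.
    assert isinstance(factor, list) and all(
        isinstance(x, (int, float)) for x in factor
    ), f"RoPE scaling factor must be a list of numbers, got {factor}."
    expected = hidden_size // num_attention_heads // 2
    assert len(factor) == expected, f"Expected {expected} factors, got {len(factor)}."

validate_rope_factor([1.0] * 48, hidden_size=3072, num_attention_heads=32)  # passes
```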