jupyterjazz committed
Commit 79c3c93 • 1 Parent(s): c380b5a

feat: initialize models with or without adapters

Signed-off-by: jupyterjazz <[email protected]>
- configuration_xlm_roberta.py +2 -0
- modeling_lora.py +13 -15
configuration_xlm_roberta.py CHANGED
@@ -22,6 +22,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         use_cache=True,
         classifier_dropout=None,
         num_loras=5,
+        load_trained_adapters=False,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -42,3 +43,4 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.classifier_dropout = classifier_dropout
         self.num_loras = num_loras
+        self.load_trained_adapters = load_trained_adapters
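For orientation, here is a minimal sketch of the configuration side of this change. It is not part of the commit; it assumes `configuration_xlm_roberta.py` is importable locally and that the constructor arguments not shown in the hunk have sensible defaults.

```python
# Usage sketch (assumption: configuration_xlm_roberta.py is on the Python path
# and the remaining __init__ arguments have defaults).
from configuration_xlm_roberta import XLMRobertaFlashConfig

# Default behaviour: the checkpoint is not expected to contain trained adapters,
# so XLMRobertaLoRA will wrap a plain backbone and register fresh LoRAs.
config = XLMRobertaFlashConfig()
assert config.load_trained_adapters is False
assert config.num_loras == 5

# Checkpoints that ship trained LoRA weights flip the flag so that
# XLMRobertaLoRA.from_pretrained() restores the adapters instead.
config_with_adapters = XLMRobertaFlashConfig(load_trained_adapters=True)
assert config_with_adapters.load_trained_adapters is True
```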
modeling_lora.py CHANGED
@@ -61,8 +61,6 @@ class LoRAParametrization(nn.Module):
         fan_in_fan_out = layer_type == "embedding"
         self.swap = (lambda x: (x[1], x[0])) if fan_in_fan_out else (lambda x: x)
 
-        # For the officially "correct" LoRA initialization, check here: https://github.com/microsoft/LoRA
-        # TODO: Ensure that the initialization here is correct
         if layer_type == "linear":
             self.lora_A = nn.Parameter(
                 initialized_weights((rank, fan_in), num_adaptions, init="kaiming")
@@ -207,13 +205,16 @@ class LoRAParametrization(nn.Module):
 class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
     def __init__(self, config: XLMRobertaFlashConfig, roberta: Optional[XLMRobertaModel] = None, add_pooling_layer=True):
         super().__init__(config)
+
         if roberta is None:
             self.roberta = XLMRobertaModel(config, add_pooling_layer=add_pooling_layer)
         else:
             self.roberta = roberta
+
         self._is_merged = False
         self._num_adaptions = config.num_loras
         self._register_lora(self._num_adaptions)
+
         self.main_params_trainable = False
         self._task_idx = None
         # By default, we select the first LoRA
@@ -236,12 +237,6 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
             if "lora" not in name:
                 param.requires_grad_(val)
 
-    @classmethod
-    def from_roberta(cls, *args, **kwargs):
-        roberta = XLMRobertaModel.from_pretrained(*args, **kwargs)
-        config = XLMRobertaFlashConfig.from_pretrained(*args, **kwargs)
-        return cls(config, roberta=roberta)
-
     def merge_lora(self):
         """Merges currently selected LoRA into main weights."""
         if self._is_merged:
@@ -264,13 +259,16 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         use_safetensors: bool = None,
         **kwargs,
     ):
-
-
-
-
-
-
-
+        config = XLMRobertaFlashConfig.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        if config.load_trained_adapters:
+            return super().from_pretrained(
+                pretrained_model_name_or_path,
+                *model_args,
+                **kwargs
+            )
+        else:
+            roberta = XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            return cls(config, roberta=roberta)
 
     def _register_lora(self, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
         self.apply(