jupyterjazz committed
Commit 79c3c93 • 1 Parent(s): c380b5a

feat: initialize models with or without adapters

Signed-off-by: jupyterjazz <[email protected]>
- configuration_xlm_roberta.py +2 -0
- modeling_lora.py +13 -15
configuration_xlm_roberta.py CHANGED
@@ -22,6 +22,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         use_cache=True,
         classifier_dropout=None,
         num_loras=5,
+        load_trained_adapters=False,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -42,3 +43,4 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.classifier_dropout = classifier_dropout
         self.num_loras = num_loras
+        self.load_trained_adapters = load_trained_adapters
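For orientation, here is a minimal sketch of the configuration side of this change. It is not part of the commit; it assumes `configuration_xlm_roberta.py` is importable locally and that the constructor arguments not shown in the hunk have sensible defaults.

```python
# Usage sketch (assumption: configuration_xlm_roberta.py is on the Python path
# and the remaining __init__ arguments have defaults).
from configuration_xlm_roberta import XLMRobertaFlashConfig

# Default behaviour: the checkpoint is not expected to contain trained adapters,
# so XLMRobertaLoRA will wrap a plain backbone and register fresh LoRAs.
config = XLMRobertaFlashConfig()
assert config.load_trained_adapters is False
assert config.num_loras == 5

# Checkpoints that ship trained LoRA weights flip the flag so that
# XLMRobertaLoRA.from_pretrained() restores the adapters instead.
config_with_adapters = XLMRobertaFlashConfig(load_trained_adapters=True)
assert config_with_adapters.load_trained_adapters is True
```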
modeling_lora.py CHANGED
@@ -61,8 +61,6 @@ class LoRAParametrization(nn.Module):
         fan_in_fan_out = layer_type == "embedding"
         self.swap = (lambda x: (x[1], x[0])) if fan_in_fan_out else (lambda x: x)
 
-        # For the officially "correct" LoRA initialization, check here: https://github.com/microsoft/LoRA
-        # TODO: Ensure that the initialization here is correct
         if layer_type == "linear":
             self.lora_A = nn.Parameter(
                 initialized_weights((rank, fan_in), num_adaptions, init="kaiming")
@@ -207,13 +205,16 @@ class LoRAParametrization(nn.Module):
 class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
     def __init__(self, config: XLMRobertaFlashConfig, roberta: Optional[XLMRobertaModel] = None, add_pooling_layer=True):
         super().__init__(config)
+
         if roberta is None:
             self.roberta = XLMRobertaModel(config, add_pooling_layer=add_pooling_layer)
         else:
             self.roberta = roberta
+
         self._is_merged = False
         self._num_adaptions = config.num_loras
         self._register_lora(self._num_adaptions)
+
         self.main_params_trainable = False
         self._task_idx = None
         # By default, we select the first LoRA
@@ -236,12 +237,6 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
             if "lora" not in name:
                 param.requires_grad_(val)
 
-    @classmethod
-    def from_roberta(cls, *args, **kwargs):
-        roberta = XLMRobertaModel.from_pretrained(*args, **kwargs)
-        config = XLMRobertaFlashConfig.from_pretrained(*args, **kwargs)
-        return cls(config, roberta=roberta)
-
     def merge_lora(self):
         """Merges currently selected LoRA into main weights."""
         if self._is_merged:
@@ -264,13 +259,16 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         use_safetensors: bool = None,
         **kwargs,
     ):
-
-
-
-
-
-
-
+        config = XLMRobertaFlashConfig.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        if config.load_trained_adapters:
+            return super().from_pretrained(
+                pretrained_model_name_or_path,
+                *model_args,
+                **kwargs
+            )
+        else:
+            roberta = XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            return cls(config, roberta=roberta)
 
     def _register_lora(self, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
         self.apply(