2-adapter-tuning-initial-impl (#30)

- 2 adapter tuning (3fd28cf83a7aeb3b39b4da99337ae29c84f1b424)

Co-authored-by: Jack Min Ong <[email protected]>

Files changed:
- block.py                  +11 -1
- embedding.py              +26 -4
- mha.py                    +37 -5
- mlp.py                    +21 -3
- modeling_lora.py           +0 -1
- modeling_xlm_roberta.py   +18 -5
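
This commit threads the `task_type` argument through the flash XLM-RoBERTa stack (embeddings, attention, MLP, pooler) so that two different LoRA adapters can be applied within a single forward pass. Whenever `task_type` is a 2-tuple, the batch is assumed to be packed so that the first ninth of the rows goes through the first adapter and the remaining eight ninths through the second; the `% 9 == 0` (row count) and `% 9 == 1` (`cu_seqlens` length) assertions enforce that 1:8 packing. What the two halves represent (presumably queries vs. documents) is not stated in the diff. Below is a minimal sketch of the split-and-merge pattern every patched module repeats; `run_with_two_adapters` is a hypothetical helper, not part of the repo, and the only assumption taken from the diff is that the patched layers accept a `task_type` keyword:

import torch

def run_with_two_adapters(layer, x, task_type, split=None):
    """Sketch of the split-and-merge routing used throughout this commit.

    `layer` is assumed to be a LoRA-aware module whose forward accepts a
    `task_type` keyword (as the patched Linear/Embedding layers do here).
    When `task_type` is a 2-tuple, rows [:split] are routed through the first
    adapter and rows [split:] through the second; otherwise the single-adapter
    (or no-adapter) path is taken.
    """
    if isinstance(task_type, tuple):
        if split is None:
            # Padded (batch, seqlen, ...) layout: the diff assumes the first
            # 1/9 of the rows belongs to the first adapter.
            assert x.shape[0] % 9 == 0
            split = x.shape[0] // 9
        out1 = layer(x[:split], task_type=task_type[0])
        out2 = layer(x[split:], task_type=task_type[1])
        return torch.cat((out1, out2), dim=0)
    lora_kwargs = {'task_type': task_type} if task_type is not None else {}
    return layer(x, **lora_kwargs)
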
block.py
CHANGED

@@ -233,7 +233,17 @@ class Block(nn.Module):
                     is_rms_norm=isinstance(self.norm1, RMSNorm),
                 )
             if not isinstance(self.mlp, nn.Identity):
-                mlp_out = self.mlp(hidden_states)
+                task_type = mixer_kwargs.get('task_type')
+                if task_type:
+                    if isinstance(task_type, tuple):
+                        assert mixer_kwargs['cu_seqlens'].shape[0] % 9 == 1
+                        split_index = int((mixer_kwargs['cu_seqlens'].shape[0] - 1) / 9)
+                        split = mixer_kwargs['cu_seqlens'][split_index]
+                        mlp_out = self.mlp(hidden_states, task_type=mixer_kwargs.get('task_type'), split=split)
+                    else:
+                        mlp_out = self.mlp(hidden_states, task_type=task_type)
+                else:
+                    mlp_out = self.mlp(hidden_states)
                 if self.return_residual:  # mlp out is actually a pair here
                     mlp_out, hidden_states = mlp_out
                 if not self.fused_dropout_add_ln:

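In the unpadded flash-attention path there is no (batch, seqlen) layout: sequences are concatenated along a single token dimension, so the split point cannot be a row count and is instead read off `cu_seqlens` (the cumulative sequence-length offsets), both here and in mha.py. With N packed sequences `cu_seqlens` has N + 1 entries, which is what the `% 9 == 1` assertion checks; the token offset after the first N / 9 sequences is `cu_seqlens[N // 9]`. A small self-contained example of that arithmetic (the sequence lengths below are made up):

import torch

# Nine sequences packed into one token dimension; cu_seqlens holds the
# cumulative offsets, so it has N + 1 entries.
seqlens = torch.tensor([3, 5, 4, 2, 6, 1, 3, 2, 4])
cu_seqlens = torch.nn.functional.pad(seqlens.cumsum(0), (1, 0))  # [0, 3, 8, 12, ...]

assert cu_seqlens.shape[0] % 9 == 1            # N sequences -> N + 1 offsets
split_index = (cu_seqlens.shape[0] - 1) // 9   # first 1/9 of the sequences
split = cu_seqlens[split_index]                # token offset of that boundary

tokens_first_adapter = int(split)                     # tokens routed to task_type[0]
tokens_second_adapter = int(cu_seqlens[-1] - split)   # tokens routed to task_type[1]
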
embedding.py
CHANGED

@@ -47,8 +47,18 @@ class XLMRobertaEmbeddings(nn.Module):
             token_type_ids: (batch, seqlen)
         """
         batch_size, seqlen = input_ids.shape
-        lora_kwargs = {'task_type': task_type} if task_type is not None else {}
-        embeddings = self.word_embeddings(input_ids, **lora_kwargs)
+        if isinstance(task_type, tuple):
+            assert input_ids.shape[0] % 9 == 0
+            split = int(input_ids.shape[0] / 9)
+            tensor1 = input_ids[:split, :]
+            tensor2 = input_ids[split:, :]
+            emb1 = self.word_embeddings(tensor1, task_type=task_type[0])
+            emb2 = self.word_embeddings(tensor2, task_type=task_type[1])
+            embeddings = torch.cat((emb1, emb2), dim=0)
+        else:
+            lora_kwargs = {'task_type': task_type} if task_type is not None else {}
+            embeddings = self.word_embeddings(input_ids, **lora_kwargs)
+
         if self.max_position_embeddings > 0:
             if position_ids is None:
                 position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=self.word_embeddings.padding_idx).to(input_ids.device)

@@ -58,6 +68,18 @@ class XLMRobertaEmbeddings(nn.Module):
         if self.type_vocab_size > 0:
             if token_type_ids is None:
                 token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
-            token_type_embeddings = self.token_type_embeddings(token_type_ids, **lora_kwargs)
-            embeddings = embeddings + token_type_embeddings
+            if isinstance(task_type, tuple):
+                assert embeddings.shape[0] % 9 == 0
+                split = int(embeddings.shape[0] / 9)
+                emb1 = embeddings[:split, :, :]
+                emb2 = embeddings[split:, :, :]
+                token_type_embs1 = self.token_type_embeddings(token_type_ids, task_type=task_type[0])
+                token_type_embs2 = self.token_type_embeddings(token_type_ids, task_type=task_type[1])
+                emb1 = emb1 + token_type_embs1
+                emb2 = emb2 + token_type_embs2
+                embeddings = torch.cat((emb1, emb2), dim=0)
+            else:
+                lora_kwargs = {'task_type': task_type} if task_type is not None else {}
+                token_type_embeddings = self.token_type_embeddings(token_type_ids, **lora_kwargs)
+                embeddings = embeddings + token_type_embeddings
         return embeddings

mha.py
CHANGED

@@ -643,15 +643,39 @@ class MHA(nn.Module):
             inference_params.max_sequence_len if inference_params is not None else max_seqlen
         )
         batch, seqlen = x.shape[:2]
+        lora_kwargs = {}
         if not self.cross_attn and self.num_heads_kv == self.num_heads:
             assert x_kv is None and mixer_subset is None
+
+            split = None
+            if isinstance(task_type, tuple):
+                assert cu_seqlens.shape[0] % 9 == 1
+                split_index = int((cu_seqlens.shape[0] - 1) / 9)
+                split = cu_seqlens[split_index]
+
             lora_kwargs = {'task_type': task_type} if task_type is not None else {}
+
             if not self.return_residual:
-                qkv = self.Wqkv(x, **lora_kwargs)
+                if isinstance(task_type, tuple):
+                    tensor1 = x[:split, :]
+                    tensor2 = x[split:, :]
+                    qkv1 = self.Wqkv(tensor1, task_type=task_type[0])
+                    qkv2 = self.Wqkv(tensor2, task_type=task_type[1])
+                    qkv = torch.cat((qkv1, qkv2), dim=0)
+                else:
+                    qkv = self.Wqkv(x, **lora_kwargs)
             else:
-                if lora_kwargs:
-                    lora_kwargs['residual'] = True
-                qkv, x = self.Wqkv(x, **lora_kwargs)
+                if isinstance(task_type, tuple):
+                    tensor1 = x[:split, :]
+                    tensor2 = x[split:, :]
+                    qkv1, tensor1 = self.Wqkv(tensor1, task_type=task_type[0], residual=True)
+                    qkv2, tensor2 = self.Wqkv(tensor2, task_type=task_type[1], residual=True)
+                    qkv = torch.cat((qkv1, qkv2), dim=0)
+                    x = torch.cat((tensor1, tensor2), dim=0)
+                else:
+                    if lora_kwargs:
+                        lora_kwargs['residual'] = True
+                    qkv, x = self.Wqkv(x, **lora_kwargs)

             if self.dwconv:
                 qkv = rearrange(

@@ -739,5 +763,13 @@
             context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)

         lora_kwargs.pop('residual', None)
-        out = self.out_proj(rearrange(context, "... h d -> ... (h d)"), **lora_kwargs)
+        inp = rearrange(context, "... h d -> ... (h d)")
+        if isinstance(task_type, tuple):
+            tensor1 = inp[:split, :]
+            tensor2 = inp[split:, :]
+            out1 = self.out_proj(tensor1, task_type=task_type[0])
+            out2 = self.out_proj(tensor2, task_type=task_type[1])
+            out = torch.cat((out1, out2), dim=0)
+        else:
+            out = self.out_proj(inp, **lora_kwargs)
         return out if not self.return_residual else (out, x)

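The calls to `self.Wqkv(..., task_type=..., residual=True)` and `self.out_proj(..., task_type=...)` assume that the projections are LoRA-wrapped linears whose forward takes a `task_type` keyword to select the adapter and an optional `residual` flag that also returns the layer input (mirroring flash-attention's `LinearResidual`). That wrapper lives in modeling_lora.py and is not shown in this diff; the sketch below is only a rough stand-in for the assumed interface, with hypothetical names (`MultiTaskLoRALinear`, `lora_a`, `lora_b`), not the actual implementation:

import torch
import torch.nn as nn

class MultiTaskLoRALinear(nn.Linear):
    """Hypothetical stand-in for the LoRA-wrapped Linear assumed by mha.py.

    Holds one low-rank (A, B) pair per task and adds the selected pair's
    update to the frozen base projection.
    """

    def __init__(self, in_features, out_features, task_names, rank=4, **kwargs):
        super().__init__(in_features, out_features, **kwargs)
        self.lora_a = nn.ParameterDict(
            {t: nn.Parameter(torch.zeros(rank, in_features)) for t in task_names}
        )
        self.lora_b = nn.ParameterDict(
            {t: nn.Parameter(torch.zeros(out_features, rank)) for t in task_names}
        )

    def forward(self, x, task_type=None, residual=False):
        out = super().forward(x)
        if task_type is not None:
            a, b = self.lora_a[task_type], self.lora_b[task_type]
            out = out + x @ a.t() @ b.t()
        # residual=True mirrors flash-attention's LinearResidual: the caller
        # gets the input back alongside the projection, as (out, x).
        return (out, x) if residual else out
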
mlp.py
CHANGED

@@ -47,11 +47,29 @@ class Mlp(nn.Module):
         self.activation = activation
         self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

-    def forward(self, x, task_type=None):
+    def forward(self, x, task_type=None, split=None):
         lora_kwargs = {'task_type': task_type} if task_type is not None else {}
-        y = self.fc1(x, **lora_kwargs)
+        if split:
+            assert isinstance(task_type, tuple)
+            tensor1 = x[:split, :]
+            tensor2 = x[split:, :]
+            y1 = self.fc1(tensor1, task_type=task_type[0])
+            y2 = self.fc1(tensor2, task_type=task_type[1])
+            y = torch.cat((y1, y2), dim=0)
+        else:
+            y = self.fc1(x, **lora_kwargs)
+
         y = self.activation(y)
-        y = self.fc2(y, **lora_kwargs)
+
+        if split:
+            assert isinstance(task_type, tuple)
+            tensor1 = y[:split, :]
+            tensor2 = y[split:, :]
+            y1 = self.fc2(tensor1, task_type=task_type[0])
+            y2 = self.fc2(tensor2, task_type=task_type[1])
+            y = torch.cat((y1, y2), dim=0)
+        else:
+            y = self.fc2(y, **lora_kwargs)
         return y if not self.return_residual else (y, x)

modeling_lora.py
CHANGED

@@ -227,7 +227,6 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         roberta: Optional[XLMRobertaModel] = None
     ):
         super().__init__(config)
-
         if roberta is None:
             self.roberta = XLMRobertaModel(config)
         else:

modeling_xlm_roberta.py
CHANGED

@@ -210,10 +210,12 @@ class XLMRobertaEncoder(nn.Module):
             subset_mask: (batch, seqlen), dtype=torch.bool
         """
         if key_padding_mask is None or not self.use_flash_attn:
-            mixer_kwargs = (
-                {"key_padding_mask": key_padding_mask.bool()}
-                if key_padding_mask is not None else None
-            )
+            mixer_kwargs = (
+                {"key_padding_mask": key_padding_mask.bool()}
+                if key_padding_mask is not None
+                else None
+            )
+            mixer_kwargs['task_type'] = task_type
             for layer in self.layers:
                 if self._grad_checkpointing:
                     hidden_states = torch.utils.checkpoint.checkpoint(

@@ -314,7 +316,18 @@ class XLMRobertaPooler(nn.Module):
         lora_kwargs = {'task_type': task_type} if task_type is not None else {}

         first_token_tensor = hidden_states[:, 0] if pool else hidden_states
-        pooled_output = self.dense(first_token_tensor, **lora_kwargs)
+
+        if isinstance(task_type, tuple):
+            assert first_token_tensor.shape[0] % 9 == 0
+            split = int(first_token_tensor.shape[0] / 9)
+            tensor1 = first_token_tensor[:split, :]
+            tensor2 = first_token_tensor[split:, :]
+            pooled_out1 = self.dense(tensor1, task_type=task_type[0])
+            pooled_out2 = self.dense(tensor2, task_type=task_type[1])
+            pooled_output = torch.cat((pooled_out1, pooled_out2), dim=0)
+        else:
+            pooled_output = self.dense(first_token_tensor, **lora_kwargs)
+
         pooled_output = self.activation(pooled_output)
         return pooled_output

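Taken together, a caller that wants both adapters active in one pass packs the batch with a 1:8 row ratio and hands the model a 2-tuple. The snippet below only illustrates that packing arithmetic; the adapter names and the final model call are placeholders, since neither appears in this diff:

import torch

# Hypothetical packing: the first 1/9 of the rows is routed to task_type[0],
# the remaining 8/9 to task_type[1], matching the % 9 checks in this commit.
queries = torch.randint(0, 250_002, (2, 128))       # rows for the first adapter
passages = torch.randint(0, 250_002, (16, 128))     # rows for the second adapter
input_ids = torch.cat((queries, passages), dim=0)   # shape (18, 128); 18 % 9 == 0

split = input_ids.shape[0] // 9                     # = 2, as computed in embedding.py
print(f"rows [:{split}] -> first adapter, rows [{split}:] -> second adapter")

# With a model built from these patched modules, the call would look roughly like:
#   outputs = model(input_ids, task_type=('adapter_a', 'adapter_b'))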