Commit: Fix wrong tuple count issue after reapply

Files changed: modeling_mpt.py (+6, −8)
--- a/modeling_mpt.py
+++ b/modeling_mpt.py
@@ -248,7 +248,7 @@ class MPTModel(MPTPreTrainedModel):
 
                         return custom_forward
 
-                    (x,
+                    (x, attn_weights, present) = torch.utils.checkpoint.checkpoint(
                         create_custom_forward(block),
                         x,
                         past_key_value,
@@ -256,15 +256,13 @@ class MPTModel(MPTPreTrainedModel):
                         attention_mask,
                         self.is_causal,
                     )
-                    if past_key_values is not None:
-                        past_key_values[b_idx] = past_key_value
                 else:
                     (x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions))
-
-
-
-
-
+                if presents is not None:
+                    presents += (present,)
+                if output_attentions:
+                    assert all_self_attns is not None
+                    all_self_attns = all_self_attns + (attn_weights,)
 
 
         x = self.norm_f(x)