Update modeling_chatglm.py
modeling_chatglm.py  CHANGED  (+23 -10)
@@ -416,7 +416,10 @@ class SelfAttention(torch.nn.Module):
             key_layer = torch.cat((cache_k, key_layer), dim=0)
             value_layer = torch.cat((cache_v, value_layer), dim=0)
         if use_cache:
-            kv_cache = (key_layer, value_layer)
+            if kv_cache is None:
+                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1)
+            else:
+                kv_cache = (key_layer, value_layer)
         else:
             kv_cache = None
 
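In the prefill pass (kv_cache is None) each attention layer now returns its cache as one stacked tensor rather than a (key, value) tuple. A minimal standalone sketch of that packing, with toy shapes standing in for ChatGLM's [seq_len, batch, num_heads, head_dim] layout:

import torch

# Toy prefill shapes; stand-ins for ChatGLM's [seq_len, batch, num_heads, head_dim] layout.
key_layer = torch.randn(8, 1, 2, 128)
value_layer = torch.randn(8, 1, 2, 128)

# Prefill branch: pack key and value into a single tensor instead of a (key, value) tuple.
packed = torch.cat(
    (key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
    dim=1,
)
print(packed.shape)  # torch.Size([1, 2, 8, 1, 2, 128])

# Decode steps (kv_cache already populated) keep the original tuple format.
kv_cache = (key_layer, value_layer)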
@@ -627,12 +630,8 @@ class GLMTransformer(torch.nn.Module):
         if not kv_caches:
             kv_caches = [None for _ in range(self.num_layers)]
         presents = () if use_cache else None
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
+        if self.training:
+            use_cache = False
 
         all_self_attentions = None
         all_hidden_states = () if output_hidden_states else None
@@ -660,7 +659,15 @@
             )
             hidden_states, kv_cache = layer_ret
             if use_cache:
-                presents = presents + (kv_cache,)
+                # token by token decoding, use tuple format
+                if kv_caches[0] is not None:
+                    presents = presents + (kv_cache,)
+                # prefilling in decoding, use tensor format to save cuda memory
+                else:
+                    if len(presents) == 0:
+                        presents = kv_cache
+                    else:
+                        presents = torch.cat((presents, kv_cache), dim=0)
 
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
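To illustrate the two branches above: during token-by-token decoding presents stays a tuple of per-layer caches, while during prefill the per-layer tensors are concatenated along dim 0 into one cache tensor for the whole stack. A rough standalone sketch with invented shapes (num_layers and the per-layer tensors are placeholders for what SelfAttention returns during prefill):

import torch

num_layers = 4
seq, batch, heads, head_dim = 8, 1, 2, 128

presents = ()
for _ in range(num_layers):
    # Stand-in for one layer's prefill cache: [1, 2, seq, batch, heads, head_dim]
    # (key and value stacked on dim 1).
    kv_cache = torch.randn(1, 2, seq, batch, heads, head_dim)
    if len(presents) == 0:
        presents = kv_cache
    else:
        presents = torch.cat((presents, kv_cache), dim=0)

print(presents.shape)  # torch.Size([4, 2, 8, 1, 2, 128]): one tensor for all layers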
@@ -845,6 +852,12 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
             kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
         )
+        if presents is not None and type(presents) is torch.Tensor:
+            presents = presents.split(1, dim=0)
+            presents = list(presents)
+            presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents]
+            presents = [tuple([x.squeeze(0) for x in y]) for y in presents]
+            presents = tuple(presents)
 
         if not return_dict:
             return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
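The block added here converts the stacked prefill cache back into the per-layer (key, value) tuples the rest of the pipeline expects. A self-contained sketch of that round trip, with made-up shapes:

import torch

num_layers, seq, batch, heads, head_dim = 4, 8, 1, 2, 128

# Stand-in for the tensor-format cache produced during prefill.
presents = torch.randn(num_layers, 2, seq, batch, heads, head_dim)

unpacked = presents.split(1, dim=0)                                # one [1, 2, ...] chunk per layer
unpacked = [list(x.squeeze(0).split(1, dim=0)) for x in unpacked]  # split each layer into key / value
unpacked = tuple(tuple(x.squeeze(0) for x in y) for y in unpacked)

assert len(unpacked) == num_layers
assert torch.equal(unpacked[0][0], presents[0, 0])  # layer-0 key survives the round trip
assert torch.equal(unpacked[0][1], presents[0, 1])  # layer-0 value survives the round trip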
@@ -1036,7 +1049,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
 
     @torch.inference_mode()
     def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
-             max_length: int = 131072, num_beams=1, do_sample=True, top_p=0.…
+             max_length: int = 131072, num_beams=1, do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None,
              **kwargs):
         if history is None:
             history = []
@@ -1058,7 +1071,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
 
     @torch.inference_mode()
     def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
-                    past_key_values=None,max_length: int = 131072, do_sample=True, top_p=0.…
+                    past_key_values=None,max_length: int = 131072, do_sample=True, top_p=0.7, temperature=0.95,
                     logits_processor=None, return_past_key_values=False, **kwargs):
         if history is None:
             history = []
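The two signature hunks appear to adjust only the sampling defaults for chat and stream_chat (top_p=0.7, temperature=0.95); the calling convention itself is unchanged. A hedged usage sketch, with a placeholder repo id (substitute the checkpoint this modeling_chatglm.py ships with):

from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "THUDM/glm-4-9b-chat"  # placeholder; use the repository this modeling file belongs to
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True, device_map="auto").eval()

# The new defaults apply automatically; they can still be overridden per call.
response, history = model.chat(tokenizer, "Hello", history=[], role="user")
response, history = model.chat(tokenizer, "Tell me more", history=history, top_p=0.7, temperature=0.95)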