joaogante (HF Staff) and cyrilvallez (HF Staff) committed
Commit bbf36bb · verified · 1 parent: 245ec5a

Update cache format (#3)


- Update custom_generate/generate.py (18740b73b7543c98b8961bc5489661f181889a2f)
- Update custom_generate/generate.py (9b4bf516f80bb86996ecf6c8e8c99ed1a1afdead)
- Update custom_generate/generate.py (def1b87d83f9a9d7a52c517e21e517946ffcf67b)
- Update custom_generate/generate.py (525fd175fe5561603d20af6cd032c33f5f91b52c)


Co-authored-by: Cyril Vallez <[email protected]>

Files changed (1)
  1. custom_generate/generate.py +60 -176
custom_generate/generate.py CHANGED
@@ -1,18 +1,22 @@
-from typing import Union, Optional, TYPE_CHECKING
+import logging
+from typing import TYPE_CHECKING, Optional, Union
+
 import torch
-from transformers import LogitsProcessorList, StoppingCriteriaList, GenerationConfig
+import torch.nn as nn
+
+from transformers import GenerationConfig, LogitsProcessorList, StoppingCriteriaList
+from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from transformers.configuration_utils import PretrainedConfig
 from transformers.generation.utils import (
-    GenerationMixin,
-    GenerateNonBeamOutput,
+    ALL_CACHE_NAMES,
     GenerateDecoderOnlyOutput,
+    GenerateEncoderDecoderOutput,
+    GenerateNonBeamOutput,
+    GenerationMixin,
 )
-from transformers.cache_utils import Cache, EncoderDecoderCache, DynamicCache
 from transformers.modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
-from transformers.generation.utils import GenerateEncoderDecoderOutput, ALL_CACHE_NAMES
 from transformers.utils import ModelOutput
-from transformers.configuration_utils import PretrainedConfig
-import torch.nn as nn
-import logging
+
 
 if TYPE_CHECKING:
     from transformers.generation.streamers import BaseStreamer
@@ -20,9 +24,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
-def stack_model_outputs(
-    model_outputs: list[ModelOutput], config: PretrainedConfig
-) -> ModelOutput:
+def stack_model_outputs(model_outputs: list[ModelOutput], config: PretrainedConfig) -> ModelOutput:
     """
     Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
     specific ModelOutput subclass from the list provided.
@@ -50,17 +52,11 @@ def stack_model_outputs(
             # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
             if isinstance(data[0][0], tuple):
                 return tuple(
-                    tuple(
-                        torch.cat([attr[i][j] for attr in data], dim=0)
-                        for j in range(len(data[0][0]))
-                    )
+                    tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0])))
                     for i in range(len(data[0]))
                 )
             else:
-                return tuple(
-                    torch.cat([attr[i] for attr in data], dim=0)
-                    for i in range(len(data[0]))
-                )
+                return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0])))
         elif isinstance(data[0], (int, float)):
             # If the elements are integers or floats, return a tensor
             return torch.tensor(data)
@@ -92,9 +88,7 @@ def _ranking_fast(
     """
     norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True)
     norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True)
-    cosine_matrix = torch.matmul(
-        norm_context_hidden, norm_next_hidden.transpose(1, 2)
-    ).squeeze(-1)  # [B*K, S]
+    cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1, 2)).squeeze(-1)  # [B*K, S]
 
     # Penalize cosine_matrix based on the cosine_matrix_mask (ignore padding positions)
     # Using a large negative value for masked positions
@@ -105,9 +99,7 @@ def _ranking_fast(
     degeneration_penalty, _ = torch.max(cosine_matrix, dim=-1)  # [B*K]
     next_top_k_probs = next_top_k_probs.view(-1)  # [B*K]
     contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty
-    contrastive_score = torch.stack(
-        torch.split(contrastive_score, beam_width)
-    )  # [B, K]
+    contrastive_score = torch.stack(torch.split(contrastive_score, beam_width))  # [B, K]
     _, selected_idx = contrastive_score.max(dim=-1)  # [B]
     return selected_idx
 
@@ -163,9 +155,7 @@ def _contrastive_search(
             f"contrastive search is not supported with stateful models, such as {model.__class__.__name__}"
         )
     # init values
-    has_eos_stopping_criteria = any(
-        hasattr(criteria, "eos_token_id") for criteria in stopping_criteria
-    )
+    has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
     top_k = generation_config.top_k
     penalty_alpha = generation_config.penalty_alpha
     pad_token_id = generation_config._pad_token_tensor
@@ -181,39 +171,22 @@ def _contrastive_search(
     scores = () if (return_dict_in_generate and output_scores) else None
     decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
     cross_attentions = () if (return_dict_in_generate and output_attentions) else None
-    decoder_hidden_states = (
-        () if (return_dict_in_generate and output_hidden_states) else None
-    )
+    decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
 
     # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
     if return_dict_in_generate and model.config.is_encoder_decoder:
-        encoder_attentions = (
-            model_kwargs["encoder_outputs"].get("attentions")
-            if output_attentions
-            else None
-        )
-        encoder_hidden_states = (
-            model_kwargs["encoder_outputs"].get("hidden_states")
-            if output_hidden_states
-            else None
-        )
+        encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+        encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
 
     # keep track of which sequences are already finished
     batch_size, cur_len = input_ids.shape[:2]
-    unfinished_sequences = torch.ones(
-        batch_size, dtype=torch.long, device=input_ids.device
-    )
-    model_kwargs = model._get_initial_cache_position(
-        cur_len, input_ids.device, model_kwargs
-    )
+    unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+    model_kwargs = model._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
 
     # Create cosine_matrix_mask based on the attention_mask
     cosine_matrix_mask = torch.ones_like(input_ids, dtype=torch.long)
     if model.config.is_encoder_decoder:
-        if (
-            "decoder_attention_mask" in model_kwargs
-            and model_kwargs["decoder_attention_mask"] is not None
-        ):
+        if "decoder_attention_mask" in model_kwargs and model_kwargs["decoder_attention_mask"] is not None:
            cosine_matrix_mask = model_kwargs["decoder_attention_mask"]
     else:
         cosine_matrix_mask = model_kwargs["attention_mask"]
@@ -221,9 +194,7 @@ def _contrastive_search(
 
     this_peer_finished = False
 
-    while model._has_unfinished_sequences(
-        this_peer_finished, synced_gpus, device=input_ids.device
-    ):
+    while model._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
         # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values;
         # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step
         if model_kwargs.get("past_key_values") is None or (
@@ -232,9 +203,7 @@ def _contrastive_search(
         ):
             # prepare inputs
             model_kwargs["use_cache"] = True
-            model_inputs = model.prepare_inputs_for_generation(
-                input_ids, **model_kwargs
-            )
+            model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
 
             # encode the given prefix and prepare model inputs; encoder-decoder model process the prefix and save
             # the `encoder_outputs`
@@ -256,9 +225,7 @@ def _contrastive_search(
             # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for this first iteration
            # (the clone itmodel is always small)
             # torch.float32 is needed to retain precision for later logits manipulations
-            logit_for_next_step = outputs.logits[:, -1, :].to(
-                copy=True, dtype=torch.float32, device=input_ids.device
-            )
+            logit_for_next_step = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=input_ids.device)
 
             model_kwargs = model._update_model_kwargs_for_generation(
                 outputs,
@@ -282,13 +249,17 @@ def _contrastive_search(
                     f"{model.__class__.__name__} does not support caching and therefore **can't** be used "
                     "for contrastive search."
                 )
-            elif (
-                not isinstance(past_key_values[0], (tuple, torch.Tensor))
-                or past_key_values[0][0].shape[0] != batch_size
+            # Only those caches have the necesary methods
+            elif not (
+                isinstance(past_key_values, DynamicCache)
+                or (
+                    isinstance(past_key_values, EncoderDecoderCache)
+                    and isinstance(past_key_values.self_attention_cache, DynamicCache)
+                )
             ):
                 raise ValueError(
-                    f"{model.__class__.__name__} does not have a standard cache format and therefore **can't** be "
-                    "used for contrastive search without further modifications."
+                    f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
+                    "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
                 )
 
             # contrastive_search main logic start:
@@ -307,18 +278,14 @@ def _contrastive_search(
                 scores += (processed_logit_for_next_step,)
             if output_attentions:
                 decoder_attentions += (
-                    (outputs.decoder_attentions,)
-                    if model.config.is_encoder_decoder
-                    else (outputs.attentions,)
+                    (outputs.decoder_attentions,) if model.config.is_encoder_decoder else (outputs.attentions,)
                 )
                 if model.config.is_encoder_decoder:
                     cross_attentions += (outputs.cross_attentions,)
 
             if output_hidden_states:
                 decoder_hidden_states += (
-                    (outputs.decoder_hidden_states,)
-                    if model.config.is_encoder_decoder
-                    else (outputs.hidden_states,)
+                    (outputs.decoder_hidden_states,) if model.config.is_encoder_decoder else (outputs.hidden_states,)
                 )
 
         # This is needed to properly delete outputs.logits which may be very large for this first iteration
@@ -327,33 +294,13 @@ def _contrastive_search(
 
         if not sequential:
             # Replicates the new past_key_values to match the `top_k` candidates
-            past = model_kwargs["past_key_values"]
-            # If it is a static cache, modify it in-place layer after layer to save memory
-            if isinstance(past, DynamicCache) or (
-                isinstance(past, EncoderDecoderCache)
-                and isinstance(past.self_attention_cache, DynamicCache)
-            ):
-                past.batch_repeat_interleave(top_k)
-            else:
-                new_key_values = []
-                for layer in past:
-                    items = []
-                    # item is either the key or the value matrix
-                    for item in layer:
-                        items.append(item.repeat_interleave(top_k, dim=0))
-                    new_key_values.append(tuple(items))
-
-                past = tuple(new_key_values)
-
-            model_kwargs["past_key_values"] = past
+            model_kwargs["past_key_values"].batch_repeat_interleave(top_k)
 
         if sequential:
             all_outputs = []
             for i in range(top_k):
                 # compute the candidate tokens by the language model and collect their hidden_states
-                next_model_inputs = model.prepare_inputs_for_generation(
-                    top_k_ids[:, i].view(-1, 1), **model_kwargs
-                )
+                next_model_inputs = model.prepare_inputs_for_generation(top_k_ids[:, i].view(-1, 1), **model_kwargs)
 
                 outputs = model(
                     **next_model_inputs,
@@ -361,21 +308,10 @@ def _contrastive_search(
                     output_hidden_states=True,
                     output_attentions=output_attentions,
                 )
-                if isinstance(outputs["past_key_values"], DynamicCache) or (
-                    isinstance(outputs["past_key_values"], EncoderDecoderCache)
-                    and isinstance(
-                        outputs["past_key_values"].self_attention_cache, DynamicCache
-                    )
-                ):
-                    # Remove past K-V from output since we don't need to stack later
-                    outputs["past_key_values"] = None
-                    # Remove last token from past K-V since we don't want to append it at this point
-                    model_kwargs["past_key_values"].crop(-1)
-                else:
-                    raise ValueError(
-                        f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
-                        "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
-                    )
+                # Remove past K-V from output since we don't need to stack later
+                outputs["past_key_values"] = None
+                # Remove last token from past K-V since we don't want to append it at this point
+                model_kwargs["past_key_values"].crop(-1)
 
                 all_outputs.append(outputs)
             outputs = stack_model_outputs(all_outputs, model.config.get_text_config())
@@ -383,9 +319,7 @@ def _contrastive_search(
         else:
             # compute the candidate tokens by the language model and collect their hidden_states
             # assembles top_k_ids into batch of size k
-            next_model_inputs = model.prepare_inputs_for_generation(
-                top_k_ids.view(-1, 1), **model_kwargs
-            )
+            next_model_inputs = model.prepare_inputs_for_generation(top_k_ids.view(-1, 1), **model_kwargs)
 
             outputs = model(
                 **next_model_inputs,
@@ -431,9 +365,7 @@ def _contrastive_search(
         selected_idx = selected_idx.to("cpu")
 
         # This will be used instead of the previous inneficient torch.stack(torch.split())
-        augmented_idx = torch.tensor(
-            [x + i * top_k for i, x in enumerate(selected_idx)]
-        )
+        augmented_idx = torch.tensor([x + i * top_k for i, x in enumerate(selected_idx)])
 
         # prepare for the next step: (1) next token_id; (2) past_key_values; (3) last_hidden_states for computing
         # the degeneration penalty; (4) logits for selecting next top-k candidates; (5) selected tokens scores
@@ -441,15 +373,11 @@ def _contrastive_search(
         next_tokens = top_k_ids[range(len(top_k_ids)), selected_idx]
         next_hidden = torch.stack(torch.split(next_hidden.squeeze(dim=1), top_k))
         next_hidden = next_hidden[range(batch_size), selected_idx, :]
-        last_hidden_states = torch.cat(
-            [last_hidden_states, next_hidden.unsqueeze(1)], dim=1
-        )
+        last_hidden_states = torch.cat([last_hidden_states, next_hidden.unsqueeze(1)], dim=1)
 
         next_decoder_hidden_states = ()
         for layer in full_hidden_states:
-            layer = torch.stack(torch.split(layer, top_k))[
-                range(batch_size), selected_idx, :
-            ]
+            layer = torch.stack(torch.split(layer, top_k))[range(batch_size), selected_idx, :]
             next_decoder_hidden_states += (layer,)
 
         # generate past_key_values cache of only the selected token
@@ -469,29 +397,10 @@ def _contrastive_search(
         else:
             next_past_key_values = None
             for possible_cache_name in ALL_CACHE_NAMES:
-                next_past_key_values = next_past_key_values or getattr(
-                    outputs, possible_cache_name, None
-                )
-            # Do it in-place layer per layer to save memory
-            if isinstance(next_past_key_values, DynamicCache) or (
-                isinstance(next_past_key_values, EncoderDecoderCache)
-                and isinstance(next_past_key_values.self_attention_cache, DynamicCache)
-            ):
-                next_past_key_values.batch_select_indices(augmented_idx)
-            else:
-                new_key_values = []
-                for layer in next_past_key_values:
-                    items = []
-                    # item is either the key or the value matrix
-                    for item in layer:
-                        items.append(item[augmented_idx, ...])
-                    new_key_values.append(tuple(items))
-
-                next_past_key_values = tuple(new_key_values)
-
-        logit_for_next_step = torch.stack(torch.split(logits, top_k))[
-            range(batch_size), selected_idx, :
-        ]
+                next_past_key_values = next_past_key_values or getattr(outputs, possible_cache_name, None)
+            next_past_key_values.batch_select_indices(augmented_idx)
+
+        logit_for_next_step = torch.stack(torch.split(logits, top_k))[range(batch_size), selected_idx, :]
         logit_for_next_step = logit_for_next_step.to(input_ids.device)
 
         # Rebuilds the relevant parts of the model output for the selected token, for use in the next iteration
@@ -500,14 +409,10 @@ def _contrastive_search(
             next_step_decoder_attentions = ()
             if output_attentions:
                 for layer in outputs.cross_attentions:
-                    layer = torch.stack(torch.split(layer, top_k, dim=0))[
-                        range(batch_size), selected_idx, ...
-                    ]
+                    layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...]
                     next_step_cross_attentions += (layer,)
                 for layer in outputs.decoder_attentions:
-                    layer = torch.stack(torch.split(layer, top_k, dim=0))[
-                        range(batch_size), selected_idx, ...
-                    ]
+                    layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...]
                     next_step_decoder_attentions += (layer,)
             outputs = Seq2SeqLMOutput(
                 past_key_values=next_past_key_values,
@@ -519,9 +424,7 @@ def _contrastive_search(
             next_step_attentions = ()
             if output_attentions:
                 for layer in outputs.attentions:
-                    layer = torch.stack(torch.split(layer, top_k, dim=0))[
-                        range(batch_size), selected_idx, ...
-                    ]
+                    layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...]
                     next_step_attentions += (layer,)
             outputs = CausalLMOutputWithPast(
                 past_key_values=next_past_key_values,
@@ -541,9 +444,7 @@ def _contrastive_search(
 
         # finished sentences should have their next token be a padding token
         if has_eos_stopping_criteria:
-            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
-                1 - unfinished_sequences
-            )
+            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
 
         # update generated ids, model inputs, and length for next step
         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
@@ -551,9 +452,7 @@ def _contrastive_search(
             streamer.put(next_tokens.cpu())
 
         # stop when each sentence is finished
-        unfinished_sequences = unfinished_sequences & ~stopping_criteria(
-            input_ids, scores
-        )
+        unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
         this_peer_finished = unfinished_sequences.max() == 0
 
     if streamer is not None:
@@ -563,21 +462,7 @@ def _contrastive_search(
         # Contrastive search works by forward looking at the next token, so we need to exclude it from
        # `past_key_values` to be consistent with the other decoding methods
         if model_kwargs.get("past_key_values") is not None:
-            if isinstance(model_kwargs["past_key_values"], DynamicCache) or (
-                isinstance(model_kwargs["past_key_values"], EncoderDecoderCache)
-                and isinstance(
-                    model_kwargs["past_key_values"].self_attention_cache, DynamicCache
-                )
-            ):
-                model_kwargs["past_key_values"].crop(-1)
-            else:
-                past_key_values = []
-                for layer in model_kwargs["past_key_values"]:
-                    layer_past_key_values = []
-                    for item in layer:
-                        layer_past_key_values.append(item[..., :-1, :])
-                    past_key_values.append(tuple(layer_past_key_values))
-                model_kwargs["past_key_values"] = tuple(past_key_values)
+            model_kwargs["past_key_values"].crop(-1)
 
         if model.config.is_encoder_decoder:
             return GenerateEncoderDecoderOutput(
@@ -614,8 +499,7 @@ def generate(model, *args, **kwargs):
     """
     cache_implementation = kwargs.pop("cache_implementation", "dynamic_full")
     if cache_implementation != "dynamic_full" and (
-        "sliding_attention"
-        in getattr(model.config.get_text_config(), "layer_types", [])
+        "sliding_attention" in getattr(model.config.get_text_config(), "layer_types", [])
         or getattr(model.config.get_text_config(), "sliding_window", 0) > 0
     ):
         logger.warning_once(
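
For orientation, a minimal usage sketch of how a Hub repository shipping a `custom_generate/generate.py` like the one above is typically invoked (this assumes a recent `transformers` release with Hub `custom_generate` support; the model name, repo id, and generation settings below are illustrative placeholders, not part of this commit):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")

# `custom_generate` points at a Hub repo containing custom_generate/generate.py; transformers
# downloads it (with trust_remote_code=True) and calls its `generate(model, ...)` entry point.
# `top_k` and `penalty_alpha` are the knobs read by `_contrastive_search` above.
output_ids = model.generate(
    **inputs,
    custom_generate="<hub-repo-id-with-this-file>",  # placeholder repo id
    trust_remote_code=True,
    top_k=4,
    penalty_alpha=0.6,
    max_new_tokens=32,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))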