manueldeprada (HF Staff) committed
Commit 598979a · verified · 1 Parent(s): c4c7e92

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,126 @@
+ ---
+
+ library_name: transformers
+ tags:
+ - custom_generate
+ ---
+
+ ## Description
+
+ Implementation of [Decoding by Contrasting Layers (DoLa)](https://huggingface.co/papers/2309.03883),
+ a contrastive decoding strategy for improving factuality and reducing hallucinations in language model outputs.
+
+ DoLa works by **contrasting the logits** from the final layer with those from earlier layers of the model,
+ amplifying factual knowledge localized in specific layers and suppressing spurious information.
+
+ This can be useful for:
+
+ * **Short-answer tasks** (e.g., TruthfulQA) — using higher layers (`dola_layers="high"`)
+ * **Long-answer reasoning tasks** (e.g., GSM8K, StrategyQA, FACTOR, VicunaQA) — using lower layers (`dola_layers="low"`)
+
+ DoLa is **not recommended for smaller models** such as GPT-2, as the improvement may be negligible.
+
+ This implementation matches the `DoLa` functionality present in `transformers<4.53.0`.
+
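For reference, the core contrast step can be sketched in a few lines: among the candidate ("premature") layers, DoLa picks the one whose next-token distribution diverges most from the final layer (measured with Jensen-Shannon divergence) and subtracts its log-probabilities from the final layer's. The snippet below is a simplified, illustrative sketch (the `dola_contrast` helper name is ours, and the relative-top filtering step is omitted); the complete logic is in `custom_generate/generate.py` further down.

```python
import torch
import torch.nn.functional as F

def dola_contrast(final_logits: torch.Tensor, premature_logits: dict[int, torch.Tensor]) -> torch.Tensor:
    """final_logits: (batch, vocab); premature_logits: {layer_index: (batch, vocab)}."""
    layers = list(premature_logits)
    stacked = torch.stack([premature_logits[i] for i in layers], dim=0)  # (L, batch, vocab)

    # Jensen-Shannon divergence between the final layer and every candidate layer
    p_final, p_early = F.softmax(final_logits, dim=-1), F.softmax(stacked, dim=-1)
    avg = 0.5 * (p_final[None] + p_early)
    kl1 = F.kl_div(F.log_softmax(final_logits, dim=-1)[None], avg, reduction="none").mean(-1)
    kl2 = F.kl_div(F.log_softmax(stacked, dim=-1), avg, reduction="none").mean(-1)
    js = (0.5 * (kl1 + kl2)).mean(-1)  # (L,)

    # contrast against the most-divergent candidate layer (log-prob difference)
    best = layers[int(js.argmax())]
    return F.log_softmax(final_logits, dim=-1) - F.log_softmax(premature_logits[best], dim=-1)
```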
+ ---
+
+ ## Base model
+
+ * [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B)
+
+ ---
+
+ ## Model compatibility
+
+ * Decoder-only transformer models
+
+ ---
+
+ ## Additional Arguments
+
+ * **`dola_layers`** (*str* or *List\[int]*, optional):
+   Which earlier layers to contrast with the final layer. Can be:
+
+   * `"low"` — lower half of layers (recommended for long answers)
+   * `"high"` — upper half of layers (recommended for short answers)
+   * List of integer indices (e.g., `[18, 20]`)
+
+   **Note:**
+
+   * Layer 0 is the word embedding; layer 1 is the first transformer block.
+   * If the model has tied word embeddings, layer 0 is skipped and candidate layers start at layer 2.
+   * Typical defaults (a sketch mapping these presets to concrete indices follows this list):
+
+     | # Layers | `"low"` range       | `"high"` range        |
+     | -------- | ------------------- | --------------------- |
+     | > 40     | `range(0, 20, 2)`   | `range(N - 20, N, 2)` |
+     | ≤ 40     | `range(0, N//2, 2)` | `range(N//2, N, 2)`   |
+
+ * **`repetition_penalty`** (*float*, optional, defaults to `None`):
+   Helps reduce repetition. A value of `1.2` is recommended.
+
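To make the presets concrete, here is a small hypothetical helper that approximates how `custom_generate/generate.py` turns `dola_layers` into candidate layer indices (edge cases for very shallow models are omitted):

```python
def candidate_layers(num_layers: int, dola_layers, tied_embeddings: bool = True) -> list[int]:
    # with tied word embeddings, the embedding layer (and layer 1) is skipped
    start = 2 if tied_embeddings and num_layers > 2 else 0
    if dola_layers == "low":
        return list(range(start, num_layers // 2, 2)) if num_layers <= 40 else list(range(start, 20, 2))
    if dola_layers == "high":
        return list(range(num_layers // 2, num_layers, 2)) if num_layers <= 40 else list(range(num_layers - 20, num_layers, 2))
    return [i for i in dola_layers if i < num_layers]  # explicit list of indices

# Qwen3-0.6B has 28 layers and tied embeddings:
print(candidate_layers(28, "low"))   # [2, 4, 6, 8, 10, 12]
print(candidate_layers(28, "high"))  # [14, 16, 18, 20, 22, 24, 26]
```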
+ ---
+
+ ## Output Type changes
+
+ * The `generate` method output remains the same as default `transformers` generation,
+   but logits are post-processed using the DoLa contrastive scoring before token selection (see the short sketch below).
+
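As a minimal sketch (assuming `model`, `tokenizer`, and `inputs` are set up as in the examples below), the usual structured-output flags work unchanged:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False,
    custom_generate="transformers-community/dola",
    trust_remote_code=True,
    dola_layers="high",
    return_dict_in_generate=True,
    output_scores=True,
)
print(outputs.sequences.shape)  # generated token ids, as with standard generation
print(len(outputs.scores))      # one tensor of DoLa-processed scores per generated token
```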
+ ---
+
+ ## Example usage
+
+ ### Using higher layers (short-answer tasks)
+
+ ```python
+ # requires `transformers>=4.56.0`; in earlier versions, DoLa was built into the library itself
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
+ model = AutoModelForCausalLM.from_pretrained(
+     "Qwen/Qwen3-0.6B", torch_dtype=torch.float16
+ ).to("cuda")
+
+ inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
+
+ outputs = model.generate(
+     **inputs,
+     max_new_tokens=50,
+     do_sample=False,
+     custom_generate="transformers-community/dola",
+     trust_remote_code=True,
+     dola_layers="high"
+ )
+
+ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```
+
+ ---
+
+ ### Contrasting specific layers
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
+ model = AutoModelForCausalLM.from_pretrained(
+     "Qwen/Qwen3-0.6B", torch_dtype=torch.float16
+ ).to("cuda")
+
+ inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
+
+ outputs = model.generate(
+     **inputs,
+     max_new_tokens=50,
+     do_sample=False,
+     repetition_penalty=1.2,
+     custom_generate="transformers-community/dola",
+     trust_remote_code=True,
+     dola_layers=[18, 20]
+ )
+
+ # Only decode the newly generated tokens
+ print(tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True))
+ ```
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 40960,
+   "max_window_layers": 28,
+   "model_type": "qwen3",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.56.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
custom_generate/generate.py ADDED
@@ -0,0 +1,336 @@
+ from typing import Union
+ import torch
+ from transformers import LogitsProcessorList, StoppingCriteriaList, GenerationConfig
+ from transformers.generation.utils import GenerateNonBeamOutput, GenerateDecoderOnlyOutput
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ def _relative_top_filter(
+     scores: torch.FloatTensor,
+     baseline_scores: torch.FloatTensor,
+     relative_top: float = 0.1,
+     filter_value: float = -float("Inf"),
+     base_filter_value=-1e-3,
+     min_tokens_to_keep: int = 1,
+ ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+     """
+     Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235
+     Apply filtering to only keep tokens with a probability above a certain threshold. The threshold is defined as `relative_top` * max probability in the distribution.
+     """
+     scores_normalized = scores.log_softmax(dim=-1)
+     baseline_scores_normalized = baseline_scores.log_softmax(dim=-1)
+     sorted_logits, sorted_indices = torch.sort(scores_normalized, descending=True)
+     min_thresh = sorted_logits[..., min_tokens_to_keep - 1]
+     probs_max = torch.max(scores_normalized, dim=-1).values
+     probs_thresh = probs_max + np.log(relative_top)
+     probs_thresh = torch.min(min_thresh, probs_thresh)
+     probs_thresh = probs_thresh.unsqueeze(-1)
+     baseline_scores_normalized[scores_normalized < probs_thresh] = base_filter_value
+     scores_normalized[scores_normalized < probs_thresh] = filter_value
+     return scores_normalized, baseline_scores_normalized
+
+
+
+ def _dola_select_contrast(
+     candidate_premature_layers: list[int],
+     candidate_premature_logits: dict[int, torch.FloatTensor],
+     final_logits: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+     if len(candidate_premature_layers) == 1:
+         base_logits = candidate_premature_logits[candidate_premature_layers[0]]
+         final_logits, base_logits = _relative_top_filter(final_logits, base_logits)
+         logits = final_logits - base_logits
+         return logits
+
+     # 1. Stacking all premature_layers into a new dimension
+     stacked_premature_layers = torch.stack([candidate_premature_logits[i] for i in candidate_premature_layers], dim=0)
+
+     # 2. Calculate the softmax values for mature_layer and all premature_layers
+     # shape: (batch_size, vocab_size)
+     softmax_mature_layer = F.softmax(final_logits, dim=-1)
+     # shape: (num_premature_layers, batch_size, vocab_size)
+     softmax_premature_layers = F.softmax(stacked_premature_layers, dim=-1)
+
+     # 3. Calculate the average distribution
+     # shape: (num_premature_layers, batch_size, vocab_size)
+     avg_dist = 0.5 * (softmax_mature_layer[None, :, :] + softmax_premature_layers)
+
+     # 4. Calculate log-softmax for the KL divergence
+     # shape: (batch_size, vocab_size)
+     log_softmax_mature_layer = F.log_softmax(final_logits, dim=-1)
+     # shape: (num_premature_layers, batch_size, vocab_size)
+     log_softmax_premature_layers = F.log_softmax(stacked_premature_layers, dim=-1)
+
+     # 5. Calculate the KL divergences and then the JS divergences
+     # shape: (num_premature_layers, batch_size)
+     kl1 = F.kl_div(log_softmax_mature_layer[None, :, :], avg_dist, reduction="none").mean(-1)
+     # shape: (num_premature_layers, batch_size)
+     kl2 = F.kl_div(log_softmax_premature_layers, avg_dist, reduction="none").mean(-1)
+     js_divs = 0.5 * (kl1 + kl2)  # shape: (num_premature_layers, batch_size)
+
+     # 6. Reduce the batchmean
+     js_divs = js_divs.mean(-1)  # shape: (num_premature_layers,)
+     premature_layer = candidate_premature_layers[int(js_divs.argmax().item())]
+
+     base_logits = candidate_premature_logits[premature_layer]
+     final_logits, base_logits = _relative_top_filter(final_logits, base_logits)
+     logits = final_logits - base_logits
+     return logits
+
+ def _dola_decoding(
+     model,
+     input_ids: torch.LongTensor,
+     logits_processor: LogitsProcessorList,
+     stopping_criteria: StoppingCriteriaList,
+     generation_config: GenerationConfig,
+     synced_gpus: bool,
+     streamer: "BaseStreamer",
+     **model_kwargs,
+ ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+     r"""
+     Generates sequences of token ids for models with a language modeling head using **dola decoding** and can be
+     used for decoder-only text models.
+     The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language
+     Models" (https://huggingface.co/papers/2309.03883) in ICLR 2024.
+
+     Parameters:
+         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+             The sequence used as a prompt for the generation.
+         dola_layers (`Union[str, list[int]]`):
+             The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
+             means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
+             to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
+         logits_processor (`LogitsProcessorList`):
+             An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+             used to modify the prediction scores of the language modeling head applied at each generation step.
+         stopping_criteria (`StoppingCriteriaList`, *optional*):
+             An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+             used to tell if the generation loop should stop.
+         generation_config ([`~generation.GenerationConfig`]):
+             The generation configuration to be used as parametrization of the decoding method.
+         synced_gpus (`bool`):
+             Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+             `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+         streamer (`BaseStreamer`, *optional*):
+             Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+             through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+         model_kwargs:
+             Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+             If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+     Return:
+         [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
+         or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+         [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+         `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
+         `model.config.is_encoder_decoder=True`.
+     """
+     dola_layers: Union[str, list[int]] = generation_config.dola_layers
+
+
+     # 1. General sanity checks
+     # A few arguments are not allowed, especially arguments that control caches.
+     assert dola_layers is not None, "dola_layers must be set to use DoLa decoding"
+
+     # DoLa generation needs num_beams == 1
+     if getattr(generation_config, "num_beams", 1) != 1:
+         raise ValueError("DoLa generation needs num_beams == 1")
+
+     if model.config.is_encoder_decoder:
+         raise ValueError("DoLa decoding is only available for decoder-only models.")
+
+     if generation_config.repetition_penalty < 1.2:
+         logger.warning(
+             f"`repetition_penalty` is set to a value of {generation_config.repetition_penalty}, which could induce unwanted repetition. "
+             "The recommended value for DoLa decoding is `repetition_penalty>=1.2`.",
+         )
+
+     if getattr(model, "_is_stateful", False):
+         # DoLa decoding was not designed for stateful models, and would require some changes
+         raise ValueError(
+             f"DoLa decoding is not supported with stateful models, such as {model.__class__.__name__}"
+         )
+
+     if model.config.is_encoder_decoder:
+         raise ValueError("DoLa decoding is only available for decoder-only models.")
+
+     # init values
+     pad_token_id = generation_config._pad_token_tensor
+     output_attentions = generation_config.output_attentions
+     output_hidden_states = generation_config.output_hidden_states
+     output_scores = generation_config.output_scores
+     output_logits = generation_config.output_logits
+     return_dict_in_generate = generation_config.return_dict_in_generate
+     has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+     do_sample = generation_config.do_sample
+
+     # init attention / hidden states / scores tuples
+     scores = () if (return_dict_in_generate and output_scores) else None
+     raw_logits = () if (return_dict_in_generate and output_logits) else None
+     decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+     cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+     decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+     # keep track of which sequences are already finished
+     batch_size, cur_length = input_ids.shape[:2]
+     unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+     model_kwargs = model._get_initial_cache_position(cur_length, input_ids.device, model_kwargs)
+
+     this_peer_finished = False
+
+     # prepare layers for DoLa decoding
+     final_layer = model.config.get_text_config().num_hidden_layers
+     # if the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer,
+     # as the early exit from word embeddings will become identity function
+     # if the model is really shallow (<=2 layers), we use the 1st layer if it's not the final layer and the 0-th
+     # layer otherwise. Notice that DoLa does not help shallow models much.
+     if not model.config.tie_word_embeddings:
+         start_layer = 0
+     elif final_layer > 2:
+         start_layer = 2
+     elif final_layer == 2:
+         start_layer = 1
+     else:
+         start_layer = 0
+
+     # For `N`-layer models with `N <= 40` layers, the layers of `range(0, N // 2, 2)` and `range(N // 2, N, 2)`
+     # are used for `'low'` and `'high'` layers, respectively.
+     # For models with `N > 40` layers, the layers of `range(0, 20, 2)` and `range(N - 20, N, 2)` are used for
+     # `'low'` and `'high'` layers, respectively.
+     if isinstance(dola_layers, str) and dola_layers == "low":
+         if start_layer == final_layer // 2:
+             candidate_premature_layers = [start_layer]
+         else:
+             candidate_premature_layers = (
+                 list(range(start_layer, final_layer // 2, 2))
+                 if final_layer <= 40
+                 else list(range(start_layer, 20, 2))
+             )
+     elif isinstance(dola_layers, str) and dola_layers == "high":
+         candidate_premature_layers = (
+             list(range(final_layer // 2, final_layer, 2))
+             if final_layer <= 40
+             else list(range(final_layer - 20, final_layer, 2))
+         )
+     # Set the `dola_layers` to a list of integers for layer indices to contrast manually specified layers.
+     elif isinstance(dola_layers, list):
+         candidate_premature_layers = [i for i in dola_layers if i < final_layer]
+     else:
+         raise ValueError("dola_layers must be either 'low', 'high' or a list of integers.")
+
+     lm_head = model.get_output_embeddings()
+     if lm_head is None:
+         raise ValueError("DoLa is not supported for models that don't have output embeddings.")
+
+     while model._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+         # prepare model inputs
+         model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+         # forward pass to get next token
+         outputs = model(
+             **model_inputs,
+             return_dict=True,
+             output_attentions=output_attentions,
+             output_hidden_states=True,
+         )
+
+         # .float() is needed to retain precision for later logits manipulations
+         final_layer_next_token_logits = outputs.logits[:, -1, :].detach().to(copy=True, dtype=torch.float32)
+         final_logits = outputs.logits[:, -1, :].float()
+         candidate_premature_logits = {}
+         for candidate_premature_layer in candidate_premature_layers:
+             candidate_premature_logits[candidate_premature_layer] = lm_head(
+                 outputs.hidden_states[candidate_premature_layer][:, -1, :]
+             ).to(final_logits.device)
+
+         # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+         model_kwargs = model._update_model_kwargs_for_generation(
+             outputs,
+             model_kwargs,
+             is_encoder_decoder=model.config.is_encoder_decoder,
+         )
+         if synced_gpus and this_peer_finished:
+             continue
+
+         next_token_logits = _dola_select_contrast(
+             candidate_premature_layers, candidate_premature_logits, final_logits
+         )
+         next_token_logits = next_token_logits.to(input_ids.device)
+         # pre-process distribution
+         next_token_scores = logits_processor(input_ids, next_token_logits)
+
+         # Store scores, attentions and hidden_states when required
+         if return_dict_in_generate:
+             if output_scores:
+                 scores += (next_token_scores,)
+             if output_logits:
+                 raw_logits += (final_layer_next_token_logits,)
+             if output_attentions:
+                 decoder_attentions += (
+                     (outputs.decoder_attentions,) if model.config.is_encoder_decoder else (outputs.attentions,)
+                 )
+                 if model.config.is_encoder_decoder:
+                     cross_attentions += (outputs.cross_attentions,)
+
+             if output_hidden_states:
+                 decoder_hidden_states += (
+                     (outputs.decoder_hidden_states,)
+                     if model.config.is_encoder_decoder
+                     else (outputs.hidden_states,)
+                 )
+
+         if do_sample:  # sample
+             probs = nn.functional.softmax(next_token_scores, dim=-1)
+             next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+         else:  # argmax
+             next_tokens = torch.argmax(next_token_scores, dim=-1)
+
+         # finished sentences should have their next token be a padding token
+         if has_eos_stopping_criteria:
+             next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+         # update generated ids, model inputs, and length for next step
+         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+         if streamer is not None:
+             streamer.put(next_tokens.cpu())
+
+         # stop when each sentence is finished
+         unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+         this_peer_finished = unfinished_sequences.max() == 0
+
+     if streamer is not None:
+         streamer.end()
+
+     if return_dict_in_generate:
+         return GenerateDecoderOnlyOutput(
+             sequences=input_ids,
+             scores=scores,
+             logits=raw_logits,
+             attentions=decoder_attentions,
+             hidden_states=decoder_hidden_states,
+             past_key_values=model_kwargs.get("past_key_values"),
+         )
+     else:
+         return input_ids
+
+ def generate(model, *args, **kwargs):
+     """Custom generate function for DoLa decoding.
+     Args:
+         model (`PreTrainedModel`):
+             The model to generate from.
+         dola_layers (`Union[str, list[int]]`): The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must
+             be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively.
+             "low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the
+             layers up to the last 20 layers.
+             If a list of integers, it must contain the indices of the layers to use for candidate premature layers in DoLa.
+             The 0-th layer is the word embedding layer of the model. Set to `'low'` to improve long-answer reasoning tasks,
+             `'high'` to improve short-answer tasks. Check the [documentation](https://huggingface.co/transformers-community/dola)
+             or [the paper](https://huggingface.co/papers/2309.03883) for more details.
+     """
+     generation_outputs = model.generate(*args, custom_generate=_dola_decoding, **kwargs)
+     return generation_outputs
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.6,
+   "top_k": 20,
+   "top_p": 0.95,
+   "transformers_version": "4.56.0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f47f71177f32bcd101b7573ec9171e6a57f4f4d31148d38e382306f42996874b
+ size 1503300328
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff