cgus commited on
Commit
6090014
·
verified ·
1 Parent(s): b2da0b7

Upload 8 files

Browse files
README.md ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - bigcode/the-stack
5
+ - bigcode/the-stack-v2
6
+ - bigcode/starcoderdata
7
+ - bigcode/commitpack
8
+ library_name: transformers
9
+ tags:
10
+ - code
11
+ model-index:
12
+ - name: Mellum-4b-base
13
+ results:
14
+ - task:
15
+ type: text-generation
16
+ dataset:
17
+ type: tianyang/repobench_python_v1.1
18
+ name: RepoBench 1.1 (Python)
19
+ metrics:
20
+ - name: EM
21
+ type: exact_match
22
+ value: 0.2591
23
+ verified: false
24
+ - name: EM ≤ 8k
25
+ type: exact_match
26
+ value: 0.2797
27
+ verified: false
28
+ - task:
29
+ type: text-generation
30
+ dataset:
31
+ type: tianyang/repobench_python_v1.1
32
+ name: RepoBench 1.1 (Python, 2k)
33
+ metrics:
34
+ - name: EM
35
+ type: exact_match
36
+ value: 0.2820
37
+ verified: false
38
+ - task:
39
+ type: text-generation
40
+ dataset:
41
+ type: tianyang/repobench_python_v1.1
42
+ name: RepoBench 1.1 (Python, 4k)
43
+ metrics:
44
+ - name: EM
45
+ type: exact_match
46
+ value: 0.2795
47
+ verified: false
48
+ - task:
49
+ type: text-generation
50
+ dataset:
51
+ type: tianyang/repobench_python_v1.1
52
+ name: RepoBench 1.1 (Python, 8k)
53
+ metrics:
54
+ - name: EM
55
+ type: exact_match
56
+ value: 0.2777
57
+ verified: false
58
+ - task:
59
+ type: text-generation
60
+ dataset:
61
+ type: tianyang/repobench_python_v1.1
62
+ name: RepoBench 1.1 (Python, 12k)
63
+ metrics:
64
+ - name: EM
65
+ type: exact_match
66
+ value: 0.2453
67
+ verified: false
68
+ - task:
69
+ type: text-generation
70
+ dataset:
71
+ type: tianyang/repobench_python_v1.1
72
+ name: RepoBench 1.1 (Python, 16k)
73
+ metrics:
74
+ - name: EM
75
+ type: exact_match
76
+ value: 0.2110
77
+ verified: false
78
+ - task:
79
+ type: text-generation
80
+ dataset:
81
+ type: tianyang/repobench_java_v1.1
82
+ name: RepoBench 1.1 (Java)
83
+ metrics:
84
+ - name: EM
85
+ type: exact_match
86
+ value: 0.2858
87
+ verified: false
88
+ - name: EM ≤ 8k
89
+ type: exact_match
90
+ value: 0.3108
91
+ verified: false
92
+ - task:
93
+ type: text-generation
94
+ dataset:
95
+ type: tianyang/repobench_java_v1.1
96
+ name: RepoBench 1.1 (Java, 2k)
97
+ metrics:
98
+ - name: EM
99
+ type: exact_match
100
+ value: 0.3202
101
+ verified: false
102
+ - task:
103
+ type: text-generation
104
+ dataset:
105
+ type: tianyang/repobench_java_v1.1
106
+ name: RepoBench 1.1 (Java, 4k)
107
+ metrics:
108
+ - name: EM
109
+ type: exact_match
110
+ value: 0.3212
111
+ verified: false
112
+ - task:
113
+ type: text-generation
114
+ dataset:
115
+ type: tianyang/repobench_java_v1.1
116
+ name: RepoBench 1.1 (Java, 8k)
117
+ metrics:
118
+ - name: EM
119
+ type: exact_match
120
+ value: 0.2910
121
+ verified: false
122
+ - task:
123
+ type: text-generation
124
+ dataset:
125
+ type: tianyang/repobench_java_v1.1
126
+ name: RepoBench 1.1 (Java, 12k)
127
+ metrics:
128
+ - name: EM
129
+ type: exact_match
130
+ value: 0.2492
131
+ verified: false
132
+ - task:
133
+ type: text-generation
134
+ dataset:
135
+ type: tianyang/repobench_java_v1.1
136
+ name: RepoBench 1.1 (Java, 16k)
137
+ metrics:
138
+ - name: EM
139
+ type: exact_match
140
+ value: 0.2474
141
+ verified: false
142
+ - task:
143
+ type: text-generation
144
+ dataset:
145
+ type: gonglinyuan/safim
146
+ name: SAFIM
147
+ metrics:
148
+ - name: pass@1
149
+ type: pass@1
150
+ value: 0.3811
151
+ verified: false
152
+ - task:
153
+ type: text-generation
154
+ dataset:
155
+ type: gonglinyuan/safim
156
+ name: SAFIM (Algorithmic)
157
+ metrics:
158
+ - name: pass@1
159
+ type: pass@1
160
+ value: 0.2530
161
+ verified: false
162
+ - task:
163
+ type: text-generation
164
+ dataset:
165
+ type: gonglinyuan/safim
166
+ name: SAFIM (Control)
167
+ metrics:
168
+ - name: pass@1
169
+ type: pass@1
170
+ value: 0.3839
171
+ verified: false
172
+ - task:
173
+ type: text-generation
174
+ dataset:
175
+ type: gonglinyuan/safim
176
+ name: SAFIM (API)
177
+ metrics:
178
+ - name: pass@1
179
+ type: pass@1
180
+ value: 0.5065
181
+ verified: false
182
+ - task:
183
+ type: text-generation
184
+ dataset:
185
+ type: loubnabnl/humaneval_infilling
186
+ name: HumanEval Infilling (Single-Line)
187
+ metrics:
188
+ - name: pass@1
189
+ type: pass@1
190
+ value: 0.6621
191
+ verified: false
192
+ - task:
193
+ type: text-generation
194
+ dataset:
195
+ type: loubnabnl/humaneval_infilling
196
+ name: HumanEval Infilling (Multi-Line)
197
+ metrics:
198
+ - name: pass@1
199
+ type: pass@1
200
+ value: 0.3852
201
+ verified: false
202
+ - task:
203
+ type: text-generation
204
+ dataset:
205
+ type: loubnabnl/humaneval_infilling
206
+ name: HumanEval Infilling (Random Span)
207
+ metrics:
208
+ - name: pass@1
209
+ type: pass@1
210
+ value: 0.2969
211
+ verified: false
212
+ ---
213
+
214
+ # Model Description
215
+ Mellum-4b-base is JetBrains' first open-source large language model (LLM) optimized for code-related tasks.
216
+
217
+ Trained on over 4 trillion tokens with a context window of 8192 tokens across multiple programming languages, Mellum-4b-base is tailored specifically for code completion.
218
+ The model follows a LLaMA-style architecture with 4 billion parameters, making it efficient for both cloud inference (e.g., via vLLM) and local deployment (e.g., using llama.cpp or Ollama).
219
+
220
+ Mellum was trained using Automatic Mixed Precision (AMP) with bf16 precision.
221
+ The uploaded version on Hugging Face retains the bf16 format for public use.
222
+
223
+ Designed for integration into professional developer tooling (e.g., intelligent code suggestions in IDEs), AI-powered coding assistants, and research on code understanding and generation, Mellum is also well-suited for educational applications and fine-tuning experiments.
224
+
225
+ This release includes the base model as well as Python SFT models.
226
+ Models for other languages will be released soon.
227
+ Keep in mind that the base model is not fine-tuned for downstream tasks out of the box; however, it fully supports supervised fine-tuning (SFT) and reinforcement learning (RL) for adaptation to specific applications.
228
+
229
+ # Training Data
230
+ - Total Training Tokens: ~4.2 trillion tokens
231
+ - Corpus: The Stack, StarCoder Training Dataset, The Stack v2, CommitPack, English Wikipedia
232
+
233
+ # Training Details
234
+ - Context Window: 8,192 tokens
235
+ - Optimization: Standard language modeling objective.
236
+ - Hardware: Cluster of 256 x H200 NVIDIA GPUs with Infiniband
237
+ - Training Duration: ~20 days
238
+
239
+ # Benchmarks
240
+ In addition to the base model scores, we also provide scores for a Mellum fine-tuned for Python, to give users an estimate of the model's potential capabilities.
241
+
242
+ ## RepoBench 1.1
243
+ - Type: single-line
244
+ - Languages: Python and Java
245
+ - Metric: Exact Match (EM), %
246
+
247
+ Since Mellum has a maximum context window of 8k, we report here both the average performance across all evaluated context lengths (2k, 4k, 8k, 12k, and 16k) and the average over context lengths within its supported range (≤ 8k).
248
+
249
+ ### Python Subset
250
+ | Model | 2k | 4k | 8k | 12k | 16k | Avg | Avg ≤ 8k |
251
+ |----------------------|--------|--------|--------|--------|--------|--------|----------|
252
+ | Mellum-4b-sft-python | 29.24% | 30.60% | 29.77% | 26.80% | 25.43% | 28.37% | 29.87% |
253
+ | Mellum-4b-base | 28.20% | 27.95% | 27.77% | 24.53% | 21.10% | 25.91% | 27.97% |
254
+
255
+ ### Java Subset
256
+ | Model | 2k | 4k | 8k | 12k | 16k | Avg | Avg ≤ 8k |
257
+ |----------------|--------|--------|--------|--------|--------|--------|----------|
258
+ | Mellum-4b-base | 32.02% | 32.12% | 29.10% | 24.92% | 24.74% | 28.58% | 31.08% |
259
+
260
+ ## Syntax-Aware Fill-in-the-Middle (SAFIM)
261
+ - Type: mix of multi-line and single-line
262
+ - Languages: multi-language
263
+ - Metric: pass@1, %
264
+
265
+ | Model | Algorithmic | Control | API | Average |
266
+ |----------------------|-------------|---------|--------|---------|
267
+ | Mellum-4b-sft-python | 33.16% | 36.11% | 57.10% | 42.12% |
268
+ | Mellum-4b-base | 25.30% | 38.39% | 50.65% | 38.11% |
269
+
270
+ ## HumanEval Infilling
271
+ - Type: single-line and multi-line
272
+ - Languages: Python
273
+ - Metric: pass@1, %
274
+
275
+ | Model | Single-Line | Multi-Line | Random Span |
276
+ |----------------------|-------------|------------|-------------|
277
+ | Mellum-4b-sft-python | 80.45% | 48.19% | 37.68% |
278
+ | Mellum-4b-base | 66.21% | 38.52% | 29.70% |
279
+
280
+ We continue to work on model improvements and will share the next iteration soon.
281
+
282
+ # Limitations
283
+ - Biases: May reflect biases present in public codebases. For example it will likely produce code which is similar in style to the open-source repositories.
284
+ - Security: Code suggestions should not be assumed to be secure or free of vulnerabilities.
285
+
286
+ # Sample Usage
287
+ Here are examples of how to run and sample from the model.
288
+
289
+ ## Generic generation
290
+ ```python
291
+ from transformers import AutoTokenizer, AutoModelForCausalLM
292
+
293
+ example = """
294
+ import sys
295
+ import os
296
+ import time
297
+
298
+ sys.path.append(os.getcwd())
299
+
300
+ from cluster.prepare_data import get_headers_pairs_list, write_dist_matrix
301
+ from cluster.token_edit_distance import get_distance_matrix
302
+
303
+ if len(sys.argv) < 3:
304
+ print(
305
+ "Too few arguments. You should provide: \n1. dataset_filename" +
306
+ "\n2. output_data_filename"
307
+ )
308
+ sys.exit()
309
+
310
+ start = time.perf_counter()
311
+ dataset_filename_ = sys.argv[1]
312
+ output_data_filename_ = sys.argv[2]
313
+
314
+ headers_pairs = get_headers_pairs_list(dataset_filename_, verbose=True)
315
+
316
+ dist_matrix, max_dist = get_distance_matrix(
317
+ list(map(lambda x: x[1], headers_pairs)),
318
+ verbose=True
319
+ )
320
+
321
+ write_dist_matrix(dist_matrix, max_dist, output_data_filename_, verbose=True)
322
+
323
+ end = time.perf_counter()
324
+ """
325
+
326
+ tokenizer = AutoTokenizer.from_pretrained('JetBrains/Mellum-4b-base')
327
+ model = AutoModelForCausalLM.from_pretrained('JetBrains/Mellum-4b-base')
328
+ encoded_input = tokenizer(example, return_tensors='pt', return_token_type_ids=False)
329
+ input_len = len(encoded_input["input_ids"][0])
330
+ out = model.generate(
331
+ **encoded_input,
332
+ max_new_tokens=100,
333
+ )
334
+ print("### Context")
335
+ print(tokenizer.decode(out[0][:input_len]))
336
+ print("### Prediction")
337
+ print(tokenizer.decode(out[0][input_len:]))
338
+ ```
339
+
340
+ ## Fill in the middle with additional files as context generation
341
+ ```python
342
+ example = """<filename>utils.py
343
+ def multiply(x, y):
344
+ return x * y
345
+ <filename>config.py
346
+ DEBUG = True
347
+ MAX_VALUE = 100
348
+ <filename>example.py
349
+ <fim_suffix>
350
+
351
+ # Test the function
352
+ result = calculate_sum(5, 10)
353
+ print(result)<fim_prefix>def calculate_sum(a, b):
354
+ <fim_middle>"""
355
+
356
+ encoded_input = tokenizer(example, return_tensors='pt', return_token_type_ids=False)
357
+ out = model.generate(
358
+ **encoded_input,
359
+ max_new_tokens=100,
360
+ )
361
+ ```
362
+
363
+ # Citation
364
+ If you use this model, please cite:
365
+
366
+ ```bibtex
367
+ @misc{Mellum-4b-base,
368
+ title = {Mellum-4b-base},
369
+ author = {Pavlichenko, Nikita and Nazarov, Iurii and Dolgov, Ivan and Garanina, Ekaterina and Lasocki, Karol and Reshetnikova, Julia and Boitsov, Sergei and Bondyrev, Ivan and Karaeva, Dariia and Sheptyakov, Maksim and Ustalov, Dmitry and Abramov, Nikita and Kolomyttseva, Olga and Lysaniuk, Kseniia and Zavidnyi, Ilia and Semenkin, Anton and Tankov, Vladislav and Sazanovich, Uladzislau},
370
+ year = {2025},
371
+ }
372
+ ```
373
+
374
+ # Contact
375
+ For questions, collaborations, and requests, reach out to us at [email protected]
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 0,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 3072,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8256,
14
+ "max_position_embeddings": 8192,
15
+ "max_sequence_length": 8192,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 24,
19
+ "num_hidden_layers": 30,
20
+ "num_key_value_heads": 24,
21
+ "pad_token_id": 0,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 500000.0,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.51.3",
29
+ "use_cache": true,
30
+ "vocab_size": 98304,
31
+ "quantization_config": {
32
+ "quant_method": "exl2",
33
+ "version": "0.2.9",
34
+ "bits": 4.0,
35
+ "head_bits": 6,
36
+ "calibration": {
37
+ "rows": 115,
38
+ "length": 2048,
39
+ "dataset": "(default)"
40
+ }
41
+ }
42
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.51.3"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<gh_stars>",
4
+ "</system>",
5
+ "<issue_start>",
6
+ "</think>",
7
+ "<commit_after>",
8
+ "<assistant>",
9
+ "<jupyter_text>",
10
+ "<fim_middle>",
11
+ "</assistant>",
12
+ "<jupyter_code>",
13
+ "<user>",
14
+ "<filename>",
15
+ "<think>",
16
+ "<fim_suffix>",
17
+ "<fim_prefix>",
18
+ "<commit_msg>",
19
+ "<fim_pad>",
20
+ "<system>",
21
+ "<issue_comment>",
22
+ "<reponame>",
23
+ "<jupyter_start>",
24
+ "<issue_closed>",
25
+ "<commit_before>",
26
+ "<empty_output>",
27
+ "<jupyter_output>",
28
+ "</user>"
29
+ ],
30
+ "bos_token": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "eos_token": {
38
+ "content": "<|endoftext|>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "pad_token": {
45
+ "content": "<|endoftext|>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ },
51
+ "unk_token": {
52
+ "content": "<|endoftext|>",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false
57
+ }
58
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<fim_prefix>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<fim_middle>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<fim_suffix>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<fim_pad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<filename>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<gh_stars>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<issue_start>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_comment>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_closed>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<jupyter_start>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_text>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_code>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_output>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<empty_output>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<commit_before>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<commit_msg>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<commit_after>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<reponame>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "19": {
158
+ "content": "<system>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "20": {
166
+ "content": "</system>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "21": {
174
+ "content": "<user>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "22": {
182
+ "content": "</user>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "23": {
190
+ "content": "<assistant>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "24": {
198
+ "content": "</assistant>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "25": {
206
+ "content": "<think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "26": {
214
+ "content": "</think>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ }
221
+ },
222
+ "additional_special_tokens": [
223
+ "<gh_stars>",
224
+ "</system>",
225
+ "<issue_start>",
226
+ "</think>",
227
+ "<commit_after>",
228
+ "<assistant>",
229
+ "<jupyter_text>",
230
+ "<fim_middle>",
231
+ "</assistant>",
232
+ "<jupyter_code>",
233
+ "<user>",
234
+ "<filename>",
235
+ "<think>",
236
+ "<fim_suffix>",
237
+ "<fim_prefix>",
238
+ "<commit_msg>",
239
+ "<fim_pad>",
240
+ "<system>",
241
+ "<issue_comment>",
242
+ "<reponame>",
243
+ "<jupyter_start>",
244
+ "<issue_closed>",
245
+ "<commit_before>",
246
+ "<empty_output>",
247
+ "<jupyter_output>",
248
+ "</user>"
249
+ ],
250
+ "bos_token": "<|endoftext|>",
251
+ "clean_up_tokenization_spaces": true,
252
+ "eos_token": "<|endoftext|>",
253
+ "errors": "replace",
254
+ "extra_special_tokens": {},
255
+ "model_max_length": 1000000000000000019884624838656,
256
+ "pad_token": "<|endoftext|>",
257
+ "tokenizer_class": "GPT2Tokenizer",
258
+ "unk_token": "<|endoftext|>"
259
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff