Fill-Mask
Transformers
Safetensors
Russian
English
modernbert
TatonkaHF committed on
Commit
1b3bd35
·
verified ·
1 Parent(s): 9c148bf

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +36 -55
  2. tokenizer_config.json +0 -48
tokenizer.json CHANGED
@@ -300,60 +300,6 @@
300
  "normalized": false,
301
  "special": true
302
  },
303
- {
304
- "id": 50285,
305
- "content": "[unused0]",
306
- "single_word": false,
307
- "lstrip": false,
308
- "rstrip": false,
309
- "normalized": true,
310
- "special": false
311
- },
312
- {
313
- "id": 50286,
314
- "content": "[unused1]",
315
- "single_word": false,
316
- "lstrip": false,
317
- "rstrip": false,
318
- "normalized": true,
319
- "special": false
320
- },
321
- {
322
- "id": 50287,
323
- "content": "[unused2]",
324
- "single_word": false,
325
- "lstrip": false,
326
- "rstrip": false,
327
- "normalized": true,
328
- "special": false
329
- },
330
- {
331
- "id": 50288,
332
- "content": "[unused3]",
333
- "single_word": false,
334
- "lstrip": false,
335
- "rstrip": false,
336
- "normalized": true,
337
- "special": false
338
- },
339
- {
340
- "id": 50289,
341
- "content": "[unused4]",
342
- "single_word": false,
343
- "lstrip": false,
344
- "rstrip": false,
345
- "normalized": true,
346
- "special": false
347
- },
348
- {
349
- "id": 50290,
350
- "content": "[unused5]",
351
- "single_word": false,
352
- "lstrip": false,
353
- "rstrip": false,
354
- "normalized": true,
355
- "special": false
356
- },
357
  {
358
  "id": 50291,
359
  "content": "[unused6]",
@@ -51454,7 +51400,18 @@
51454
  " ": 50276,
51455
  " ": 50277,
51456
  " ": 50278,
51457
- " ": 50279
 
 
 
 
 
 
 
 
 
 
 
51458
  },
51459
  "merges": [
51460
  [
@@ -251440,6 +251397,30 @@
251440
  [
251441
  "a",
251442
  "que"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251443
  ]
251444
  ]
251445
  }
 
300
  "normalized": false,
301
  "special": true
302
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  {
304
  "id": 50291,
305
  "content": "[unused6]",
 
51400
  " ": 50276,
51401
  " ": 50277,
51402
  " ": 50278,
51403
+ " ": 50279,
51404
+ "[UNK]": 50280,
51405
+ "[CLS]": 50281,
51406
+ "[SEP]": 50282,
51407
+ "[PAD]": 50283,
51408
+ "[MASK]": 50284,
51409
+ "а": 50285,
51410
+ "е": 50286,
51411
+ "и": 50287,
51412
+ "н": 50288,
51413
+ "о": 50289,
51414
+ "ÑĤ": 50290
51415
  },
51416
  "merges": [
51417
  [
 
251397
  [
251398
  "a",
251399
  "que"
251400
+ ],
251401
+ [
251402
+ "Ð",
251403
+ "°"
251404
+ ],
251405
+ [
251406
+ "Ð",
251407
+ "µ"
251408
+ ],
251409
+ [
251410
+ "Ð",
251411
+ "¸"
251412
+ ],
251413
+ [
251414
+ "Ð",
251415
+ "½"
251416
+ ],
251417
+ [
251418
+ "Ð",
251419
+ "¾"
251420
+ ],
251421
+ [
251422
+ "Ñ",
251423
+ "Ĥ"
251424
  ]
251425
  ]
251426
  }
tokenizer_config.json CHANGED
@@ -264,54 +264,6 @@
264
  "single_word": false,
265
  "special": true
266
  },
267
- "50285": {
268
- "content": "[unused0]",
269
- "lstrip": false,
270
- "normalized": true,
271
- "rstrip": false,
272
- "single_word": false,
273
- "special": false
274
- },
275
- "50286": {
276
- "content": "[unused1]",
277
- "lstrip": false,
278
- "normalized": true,
279
- "rstrip": false,
280
- "single_word": false,
281
- "special": false
282
- },
283
- "50287": {
284
- "content": "[unused2]",
285
- "lstrip": false,
286
- "normalized": true,
287
- "rstrip": false,
288
- "single_word": false,
289
- "special": false
290
- },
291
- "50288": {
292
- "content": "[unused3]",
293
- "lstrip": false,
294
- "normalized": true,
295
- "rstrip": false,
296
- "single_word": false,
297
- "special": false
298
- },
299
- "50289": {
300
- "content": "[unused4]",
301
- "lstrip": false,
302
- "normalized": true,
303
- "rstrip": false,
304
- "single_word": false,
305
- "special": false
306
- },
307
- "50290": {
308
- "content": "[unused5]",
309
- "lstrip": false,
310
- "normalized": true,
311
- "rstrip": false,
312
- "single_word": false,
313
- "special": false
314
- },
315
  "50291": {
316
  "content": "[unused6]",
317
  "lstrip": false,
 
264
  "single_word": false,
265
  "special": true
266
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  "50291": {
268
  "content": "[unused6]",
269
  "lstrip": false,