Upload tokenizer
Browse files- tokenizer.json +36 -55
- tokenizer_config.json +0 -48
tokenizer.json
CHANGED
@@ -300,60 +300,6 @@
|
|
300 |
"normalized": false,
|
301 |
"special": true
|
302 |
},
|
303 |
-
{
|
304 |
-
"id": 50285,
|
305 |
-
"content": "[unused0]",
|
306 |
-
"single_word": false,
|
307 |
-
"lstrip": false,
|
308 |
-
"rstrip": false,
|
309 |
-
"normalized": true,
|
310 |
-
"special": false
|
311 |
-
},
|
312 |
-
{
|
313 |
-
"id": 50286,
|
314 |
-
"content": "[unused1]",
|
315 |
-
"single_word": false,
|
316 |
-
"lstrip": false,
|
317 |
-
"rstrip": false,
|
318 |
-
"normalized": true,
|
319 |
-
"special": false
|
320 |
-
},
|
321 |
-
{
|
322 |
-
"id": 50287,
|
323 |
-
"content": "[unused2]",
|
324 |
-
"single_word": false,
|
325 |
-
"lstrip": false,
|
326 |
-
"rstrip": false,
|
327 |
-
"normalized": true,
|
328 |
-
"special": false
|
329 |
-
},
|
330 |
-
{
|
331 |
-
"id": 50288,
|
332 |
-
"content": "[unused3]",
|
333 |
-
"single_word": false,
|
334 |
-
"lstrip": false,
|
335 |
-
"rstrip": false,
|
336 |
-
"normalized": true,
|
337 |
-
"special": false
|
338 |
-
},
|
339 |
-
{
|
340 |
-
"id": 50289,
|
341 |
-
"content": "[unused4]",
|
342 |
-
"single_word": false,
|
343 |
-
"lstrip": false,
|
344 |
-
"rstrip": false,
|
345 |
-
"normalized": true,
|
346 |
-
"special": false
|
347 |
-
},
|
348 |
-
{
|
349 |
-
"id": 50290,
|
350 |
-
"content": "[unused5]",
|
351 |
-
"single_word": false,
|
352 |
-
"lstrip": false,
|
353 |
-
"rstrip": false,
|
354 |
-
"normalized": true,
|
355 |
-
"special": false
|
356 |
-
},
|
357 |
{
|
358 |
"id": 50291,
|
359 |
"content": "[unused6]",
|
@@ -51454,7 +51400,18 @@
|
|
51454 |
" ": 50276,
|
51455 |
" ": 50277,
|
51456 |
" ": 50278,
|
51457 |
-
" ": 50279
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51458 |
},
|
51459 |
"merges": [
|
51460 |
[
|
@@ -251440,6 +251397,30 @@
|
|
251440 |
[
|
251441 |
"a",
|
251442 |
"que"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251443 |
]
|
251444 |
]
|
251445 |
}
|
|
|
300 |
"normalized": false,
|
301 |
"special": true
|
302 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
{
|
304 |
"id": 50291,
|
305 |
"content": "[unused6]",
|
|
|
51400 |
" ": 50276,
|
51401 |
" ": 50277,
|
51402 |
" ": 50278,
|
51403 |
+
" ": 50279,
|
51404 |
+
"[UNK]": 50280,
|
51405 |
+
"[CLS]": 50281,
|
51406 |
+
"[SEP]": 50282,
|
51407 |
+
"[PAD]": 50283,
|
51408 |
+
"[MASK]": 50284,
|
51409 |
+
"а": 50285,
|
51410 |
+
"е": 50286,
|
51411 |
+
"и": 50287,
|
51412 |
+
"н": 50288,
|
51413 |
+
"о": 50289,
|
51414 |
+
"ÑĤ": 50290
|
51415 |
},
|
51416 |
"merges": [
|
51417 |
[
|
|
|
251397 |
[
|
251398 |
"a",
|
251399 |
"que"
|
251400 |
+
],
|
251401 |
+
[
|
251402 |
+
"Ð",
|
251403 |
+
"°"
|
251404 |
+
],
|
251405 |
+
[
|
251406 |
+
"Ð",
|
251407 |
+
"µ"
|
251408 |
+
],
|
251409 |
+
[
|
251410 |
+
"Ð",
|
251411 |
+
"¸"
|
251412 |
+
],
|
251413 |
+
[
|
251414 |
+
"Ð",
|
251415 |
+
"½"
|
251416 |
+
],
|
251417 |
+
[
|
251418 |
+
"Ð",
|
251419 |
+
"¾"
|
251420 |
+
],
|
251421 |
+
[
|
251422 |
+
"Ñ",
|
251423 |
+
"Ĥ"
|
251424 |
]
|
251425 |
]
|
251426 |
}
|
tokenizer_config.json
CHANGED
@@ -264,54 +264,6 @@
|
|
264 |
"single_word": false,
|
265 |
"special": true
|
266 |
},
|
267 |
-
"50285": {
|
268 |
-
"content": "[unused0]",
|
269 |
-
"lstrip": false,
|
270 |
-
"normalized": true,
|
271 |
-
"rstrip": false,
|
272 |
-
"single_word": false,
|
273 |
-
"special": false
|
274 |
-
},
|
275 |
-
"50286": {
|
276 |
-
"content": "[unused1]",
|
277 |
-
"lstrip": false,
|
278 |
-
"normalized": true,
|
279 |
-
"rstrip": false,
|
280 |
-
"single_word": false,
|
281 |
-
"special": false
|
282 |
-
},
|
283 |
-
"50287": {
|
284 |
-
"content": "[unused2]",
|
285 |
-
"lstrip": false,
|
286 |
-
"normalized": true,
|
287 |
-
"rstrip": false,
|
288 |
-
"single_word": false,
|
289 |
-
"special": false
|
290 |
-
},
|
291 |
-
"50288": {
|
292 |
-
"content": "[unused3]",
|
293 |
-
"lstrip": false,
|
294 |
-
"normalized": true,
|
295 |
-
"rstrip": false,
|
296 |
-
"single_word": false,
|
297 |
-
"special": false
|
298 |
-
},
|
299 |
-
"50289": {
|
300 |
-
"content": "[unused4]",
|
301 |
-
"lstrip": false,
|
302 |
-
"normalized": true,
|
303 |
-
"rstrip": false,
|
304 |
-
"single_word": false,
|
305 |
-
"special": false
|
306 |
-
},
|
307 |
-
"50290": {
|
308 |
-
"content": "[unused5]",
|
309 |
-
"lstrip": false,
|
310 |
-
"normalized": true,
|
311 |
-
"rstrip": false,
|
312 |
-
"single_word": false,
|
313 |
-
"special": false
|
314 |
-
},
|
315 |
"50291": {
|
316 |
"content": "[unused6]",
|
317 |
"lstrip": false,
|
|
|
264 |
"single_word": false,
|
265 |
"special": true
|
266 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
"50291": {
|
268 |
"content": "[unused6]",
|
269 |
"lstrip": false,
|