my-tokenizer / my-tokenizer.json
Max1798's picture
Upload folder using huggingface_hub
3aa470a verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[UNK]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[PAD]": 3,
"[MASK]": 4,
"!": 5,
"'": 6,
",": 7,
"-": 8,
".": 9,
":": 10,
"A": 11,
"B": 12,
"C": 13,
"E": 14,
"H": 15,
"I": 16,
"L": 17,
"N": 18,
"P": 19,
"S": 20,
"T": 21,
"a": 22,
"b": 23,
"c": 24,
"d": 25,
"e": 26,
"f": 27,
"g": 28,
"h": 29,
"i": 30,
"j": 31,
"k": 32,
"l": 33,
"m": 34,
"n": 35,
"o": 36,
"p": 37,
"q": 38,
"r": 39,
"s": 40,
"t": 41,
"u": 42,
"v": 43,
"w": 44,
"x": 45,
"y": 46,
"z": 47,
"or": 48,
"th": 49,
"an": 50,
"es": 51,
"ar": 52,
"er": 53,
"is": 54,
"wor": 55,
"in": 56,
"ou": 57,
"at": 58,
"on": 59,
"word": 60,
"re": 61,
"to": 62,
"words": 63,
"en": 64,
"le": 65,
"ing": 66,
"el": 67,
"for": 68,
"ion": 69,
"the": 70,
"al": 71,
"as": 72,
"can": 73,
"ed": 74,
"he": 75,
"im": 76,
"it": 77,
"iz": 78,
"ly": 79,
"of": 80,
"ve": 81,
"you": 82,
"and": 83,
"ers": 84,
"ave": 85,
"do": 86,
"ear": 87,
"fu": 88,
"have": 89,
"ken": 90,
"li": 91,
"no": 92,
"ol": 93,
"qu": 94,
"su": 95,
"tr": 96,
"tion": 97,
"that": 98,
"ise": 99,
"keniz": 100,
"not": 101,
"BP": 102,
"LP": 103,
"NLP": 104,
"To": 105,
"The": 106,
"ab": 107,
"ac": 108,
"ak": 109,
"am": 110,
"be": 111,
"bwords": 112,
"cou": 113,
"ep": 114,
"fear": 115,
"hat": 116,
"ke": 117,
"ll": 118,
"ntr": 119,
"oc": 120,
"pr": 121,
"ple": 122,
"rise": 123,
"st": 124,
"tim": 125,
"up": 126,
"us": 127,
"ver": 128,
"we": 129,
"was": 130,
"what": 131,
"orp": 132,
"thy": 133,
"thing": 134,
"ess": 135,
"est": 136,
"are": 137,
"arn": 138,
"out": 139,
"ation": 140,
"tokeniz": 141,
"ent": 142,
"learn": 143,
"althy": 144,
"your": 145,
"like": 146,
"subwords": 147,
"BPE": 148,
"countr": 149,
"times": 150,
"tokenizers": 151,
"country": 152,
"Ac": 153,
"As": 154,
"All": 155,
"Col": 156,
"Ear": 157,
"Hel": 158,
"In": 159,
"It": 160,
"Lar": 161,
"Nat": 162,
"Sm": 163,
"Su": 164,
"Th": 165,
"Tr": 166,
"ag": 167,
"ai": 168,
"ay": 169,
"az": 170,
"ain": 171,
"br": 172,
"by": 173,
"bword": 174,
"bed": 175,
"best": 176,
"ch": 177,
"ck": 178,
"co": 179,
"cre": 180,
"corp": 181,
"de": 182,
"der": 183,
"dre": 184,
"del": 185,
"day": 186,
"ex": 187,
"efu": 188,
"eak": 189,
"ever": 190,
"fe": 191,
"fi": 192,
"fo": 193,
"fre": 194,
"gl": 195,
"go": 196,
"gu": 197,
"ger": 198,
"gre": 199,
"ging": 200,
"gol": 201,
"hin": 202,
"hand": 203,
"ip": 204,
"iou": 205,
"ill": 206,
"ick": 207,
"ide": 208,
"ju": 209,
"ks": 210,
"ld": 211,
"lo": 212,
"lan": 213,
"lar": 214,
"ler": 215,
"lou": 216,
"less": 217,
"laz": 218,
"mm": 219,
"mo": 220,
"mp": 221,
"man": 222,
"mes": 223,
"mer": 224,
"mers": 225,
"mac": 226,
"mak": 227,
"morp": 228,
"nation": 229,
"ow": 230,
"over": 231,
"por": 232,
"pai": 233,
"peak": 234,
"pip": 235,
"rs": 236,
"ral": 237,
"rare": 238,
"riou": 239,
"sle": 240,
"sel": 241,
"sfor": 242,
"sim": 243,
"sly": 244,
"speak": 245,
"tan": 246,
"ter": 247,
"ters": 248,
"test": 249,
"ution": 250,
"ular": 251,
"ural": 252,
"vol": 253,
"voc": 254,
"wise": 255,
"will": 256,
"ything": 257,
"orless": 258,
"than": 259,
"this": 260,
"thre": 261,
"ansfor": 262,
"estion": 263,
"esent": 264,
"arac": 265,
"worst": 266,
"works": 267,
"world": 268,
"ines": 269,
"ated": 270,
"one": 271,
"only": 272,
"repr": 273,
"revol": 274,
"eld": 275,
"elines": 276,
"aller": 277,
"ask": 278,
"healthy": 279,
"hemes": 280,
"impor": 281,
"itsel": 282,
"itters": 283,
"ized": 284,
"dog": 285,
"early": 286,
"fun": 287,
"furiou": 288,
"life": 289,
"quent": 290,
"quick": 291,
"question": 292,
"sum": 293,
"train": 294,
"tions": 295,
"kenization": 296,
"Tokenization": 297,
"about": 298,
"abular": 299,
"ample": 300,
"ocess": 301,
"process": 302,
"step": 303,
"usefu": 304,
"wealthy": 305,
"learning": 306,
"learned": 307,
"Actions": 308,
"Ask": 309,
"Colorless": 310,
"Early": 311,
"Hello": 312,
"Larger": 313,
"Natural": 314,
"Smaller": 315,
"Subword": 316,
"This": 317,
"Transfor": 318,
"age": 319,
"brow": 320,
"charac": 321,
"comm": 322,
"created": 323,
"corpus": 324,
"dream": 325,
"dels": 326,
"example": 327,
"everything": 328,
"field": 329,
"fox": 330,
"frequent": 331,
"glitters": 332,
"goes": 333,
"guage": 334,
"green": 335,
"gold": 336,
"hine": 337,
"handle": 338,
"ideas": 339,
"jump": 340,
"language": 341,
"louder": 342,
"lazy": 343,
"models": 344,
"merging": 345,
"machine": 346,
"makes": 347,
"morphemes": 348,
"pairs": 349,
"pipelines": 350,
"sleep": 351,
"simple": 352,
"tant": 353,
"testing": 354,
"utionized": 355,
"vocabular": 356,
"three": 357,
"represent": 358,
"revolutionized": 359,
"important": 360,
"itself": 361,
"furiously": 362,
"processing": 363,
"useful": 364,
"Transformers": 365,
"brown": 366,
"character": 367,
"common": 368,
"jumps": 369,
"vocabulary": 370
},
"merges": [
[
"o",
"r"
],
[
"t",
"h"
],
[
"a",
"n"
],
[
"e",
"s"
],
[
"a",
"r"
],
[
"e",
"r"
],
[
"i",
"s"
],
[
"w",
"or"
],
[
"i",
"n"
],
[
"o",
"u"
],
[
"a",
"t"
],
[
"o",
"n"
],
[
"wor",
"d"
],
[
"r",
"e"
],
[
"t",
"o"
],
[
"word",
"s"
],
[
"e",
"n"
],
[
"l",
"e"
],
[
"in",
"g"
],
[
"e",
"l"
],
[
"f",
"or"
],
[
"i",
"on"
],
[
"th",
"e"
],
[
"a",
"l"
],
[
"a",
"s"
],
[
"c",
"an"
],
[
"e",
"d"
],
[
"h",
"e"
],
[
"i",
"m"
],
[
"i",
"t"
],
[
"i",
"z"
],
[
"l",
"y"
],
[
"o",
"f"
],
[
"v",
"e"
],
[
"y",
"ou"
],
[
"an",
"d"
],
[
"er",
"s"
],
[
"a",
"ve"
],
[
"d",
"o"
],
[
"e",
"ar"
],
[
"f",
"u"
],
[
"h",
"ave"
],
[
"k",
"en"
],
[
"l",
"i"
],
[
"n",
"o"
],
[
"o",
"l"
],
[
"q",
"u"
],
[
"s",
"u"
],
[
"t",
"r"
],
[
"t",
"ion"
],
[
"th",
"at"
],
[
"is",
"e"
],
[
"ken",
"iz"
],
[
"no",
"t"
],
[
"B",
"P"
],
[
"L",
"P"
],
[
"N",
"LP"
],
[
"T",
"o"
],
[
"T",
"he"
],
[
"a",
"b"
],
[
"a",
"c"
],
[
"a",
"k"
],
[
"a",
"m"
],
[
"b",
"e"
],
[
"b",
"words"
],
[
"c",
"ou"
],
[
"e",
"p"
],
[
"f",
"ear"
],
[
"h",
"at"
],
[
"k",
"e"
],
[
"l",
"l"
],
[
"n",
"tr"
],
[
"o",
"c"
],
[
"p",
"r"
],
[
"p",
"le"
],
[
"r",
"ise"
],
[
"s",
"t"
],
[
"t",
"im"
],
[
"u",
"p"
],
[
"u",
"s"
],
[
"v",
"er"
],
[
"w",
"e"
],
[
"w",
"as"
],
[
"w",
"hat"
],
[
"or",
"p"
],
[
"th",
"y"
],
[
"th",
"ing"
],
[
"es",
"s"
],
[
"es",
"t"
],
[
"ar",
"e"
],
[
"ar",
"n"
],
[
"ou",
"t"
],
[
"at",
"ion"
],
[
"to",
"keniz"
],
[
"en",
"t"
],
[
"le",
"arn"
],
[
"al",
"thy"
],
[
"you",
"r"
],
[
"li",
"ke"
],
[
"su",
"bwords"
],
[
"BP",
"E"
],
[
"cou",
"ntr"
],
[
"tim",
"es"
],
[
"tokeniz",
"ers"
],
[
"countr",
"y"
],
[
"A",
"c"
],
[
"A",
"s"
],
[
"A",
"ll"
],
[
"C",
"ol"
],
[
"E",
"ar"
],
[
"H",
"el"
],
[
"I",
"n"
],
[
"I",
"t"
],
[
"L",
"ar"
],
[
"N",
"at"
],
[
"S",
"m"
],
[
"S",
"u"
],
[
"T",
"h"
],
[
"T",
"r"
],
[
"a",
"g"
],
[
"a",
"i"
],
[
"a",
"y"
],
[
"a",
"z"
],
[
"a",
"in"
],
[
"b",
"r"
],
[
"b",
"y"
],
[
"b",
"word"
],
[
"b",
"ed"
],
[
"b",
"est"
],
[
"c",
"h"
],
[
"c",
"k"
],
[
"c",
"o"
],
[
"c",
"re"
],
[
"c",
"orp"
],
[
"d",
"e"
],
[
"d",
"er"
],
[
"d",
"re"
],
[
"d",
"el"
],
[
"d",
"ay"
],
[
"e",
"x"
],
[
"e",
"fu"
],
[
"e",
"ak"
],
[
"e",
"ver"
],
[
"f",
"e"
],
[
"f",
"i"
],
[
"f",
"o"
],
[
"f",
"re"
],
[
"g",
"l"
],
[
"g",
"o"
],
[
"g",
"u"
],
[
"g",
"er"
],
[
"g",
"re"
],
[
"g",
"ing"
],
[
"g",
"ol"
],
[
"h",
"in"
],
[
"h",
"and"
],
[
"i",
"p"
],
[
"i",
"ou"
],
[
"i",
"ll"
],
[
"i",
"ck"
],
[
"i",
"de"
],
[
"j",
"u"
],
[
"k",
"s"
],
[
"l",
"d"
],
[
"l",
"o"
],
[
"l",
"an"
],
[
"l",
"ar"
],
[
"l",
"er"
],
[
"l",
"ou"
],
[
"l",
"ess"
],
[
"l",
"az"
],
[
"m",
"m"
],
[
"m",
"o"
],
[
"m",
"p"
],
[
"m",
"an"
],
[
"m",
"es"
],
[
"m",
"er"
],
[
"m",
"ers"
],
[
"m",
"ac"
],
[
"m",
"ak"
],
[
"m",
"orp"
],
[
"n",
"ation"
],
[
"o",
"w"
],
[
"o",
"ver"
],
[
"p",
"or"
],
[
"p",
"ai"
],
[
"p",
"eak"
],
[
"p",
"ip"
],
[
"r",
"s"
],
[
"r",
"al"
],
[
"r",
"are"
],
[
"r",
"iou"
],
[
"s",
"le"
],
[
"s",
"el"
],
[
"s",
"for"
],
[
"s",
"im"
],
[
"s",
"ly"
],
[
"s",
"peak"
],
[
"t",
"an"
],
[
"t",
"er"
],
[
"t",
"ers"
],
[
"t",
"est"
],
[
"u",
"tion"
],
[
"u",
"lar"
],
[
"u",
"ral"
],
[
"v",
"ol"
],
[
"v",
"oc"
],
[
"w",
"ise"
],
[
"w",
"ill"
],
[
"y",
"thing"
],
[
"or",
"less"
],
[
"th",
"an"
],
[
"th",
"is"
],
[
"th",
"re"
],
[
"an",
"sfor"
],
[
"es",
"tion"
],
[
"es",
"ent"
],
[
"ar",
"ac"
],
[
"wor",
"st"
],
[
"wor",
"ks"
],
[
"wor",
"ld"
],
[
"in",
"es"
],
[
"at",
"ed"
],
[
"on",
"e"
],
[
"on",
"ly"
],
[
"re",
"pr"
],
[
"re",
"vol"
],
[
"el",
"d"
],
[
"el",
"ines"
],
[
"al",
"ler"
],
[
"as",
"k"
],
[
"he",
"althy"
],
[
"he",
"mes"
],
[
"im",
"por"
],
[
"it",
"sel"
],
[
"it",
"ters"
],
[
"iz",
"ed"
],
[
"do",
"g"
],
[
"ear",
"ly"
],
[
"fu",
"n"
],
[
"fu",
"riou"
],
[
"li",
"fe"
],
[
"qu",
"ent"
],
[
"qu",
"ick"
],
[
"qu",
"estion"
],
[
"su",
"m"
],
[
"tr",
"ain"
],
[
"tion",
"s"
],
[
"keniz",
"ation"
],
[
"To",
"kenization"
],
[
"ab",
"out"
],
[
"ab",
"ular"
],
[
"am",
"ple"
],
[
"oc",
"ess"
],
[
"pr",
"ocess"
],
[
"st",
"ep"
],
[
"us",
"efu"
],
[
"we",
"althy"
],
[
"learn",
"ing"
],
[
"learn",
"ed"
],
[
"Ac",
"tions"
],
[
"As",
"k"
],
[
"Col",
"orless"
],
[
"Ear",
"ly"
],
[
"Hel",
"lo"
],
[
"Lar",
"ger"
],
[
"Nat",
"ural"
],
[
"Sm",
"aller"
],
[
"Su",
"bword"
],
[
"Th",
"is"
],
[
"Tr",
"ansfor"
],
[
"ag",
"e"
],
[
"br",
"ow"
],
[
"ch",
"arac"
],
[
"co",
"mm"
],
[
"cre",
"ated"
],
[
"corp",
"us"
],
[
"dre",
"am"
],
[
"del",
"s"
],
[
"ex",
"ample"
],
[
"ever",
"ything"
],
[
"fi",
"eld"
],
[
"fo",
"x"
],
[
"fre",
"quent"
],
[
"gl",
"itters"
],
[
"go",
"es"
],
[
"gu",
"age"
],
[
"gre",
"en"
],
[
"gol",
"d"
],
[
"hin",
"e"
],
[
"hand",
"le"
],
[
"ide",
"as"
],
[
"ju",
"mp"
],
[
"lan",
"guage"
],
[
"lou",
"der"
],
[
"laz",
"y"
],
[
"mo",
"dels"
],
[
"mer",
"ging"
],
[
"mac",
"hine"
],
[
"mak",
"es"
],
[
"morp",
"hemes"
],
[
"pai",
"rs"
],
[
"pip",
"elines"
],
[
"sle",
"ep"
],
[
"sim",
"ple"
],
[
"tan",
"t"
],
[
"test",
"ing"
],
[
"ution",
"ized"
],
[
"voc",
"abular"
],
[
"thre",
"e"
],
[
"repr",
"esent"
],
[
"revol",
"utionized"
],
[
"impor",
"tant"
],
[
"itsel",
"f"
],
[
"furiou",
"sly"
],
[
"process",
"ing"
],
[
"usefu",
"l"
],
[
"Transfor",
"mers"
],
[
"brow",
"n"
],
[
"charac",
"ter"
],
[
"comm",
"on"
],
[
"jump",
"s"
],
[
"vocabular",
"y"
]
]
}
}