grover_hg19_tokenizer / vocab.json
zrthxn's picture
Format
a93e858
{
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"A": 5,
"C": 6,
"G": 7,
"T": 8,
"TT": 9,
"AA": 10,
"TG": 11,
"AG": 12,
"CC": 13,
"TC": 14,
"AC": 15,
"GG": 16,
"ATT": 17,
"AT": 18,
"ATG": 19,
"GC": 20,
"TAA": 21,
"TCC": 22,
"ACC": 23,
"AAAA": 24,
"AGG": 25,
"ATC": 26,
"AGC": 27,
"TTC": 28,
"AAG": 29,
"TTTT": 30,
"TGC": 31,
"TGG": 32,
"AAC": 33,
"TTG": 34,
"TAG": 35,
"TAC": 36,
"CCC": 37,
"TATT": 38,
"TGGG": 39,
"TAT": 40,
"AGAA": 41,
"AGGG": 42,
"TTTC": 43,
"AGGC": 44,
"AGCC": 45,
"ATAA": 46,
"TGTG": 47,
"TTGG": 48,
"ATTC": 49,
"AAGG": 50,
"ACAC": 51,
"TCCC": 52,
"TCTC": 53,
"TATG": 54,
"TTTG": 55,
"TTCC": 56,
"AGTG": 57,
"ATGG": 58,
"AGAC": 59,
"AAAC": 60,
"ACCC": 61,
"TGCC": 62,
"ATTG": 63,
"ATCC": 64,
"AGAG": 65,
"ATGC": 66,
"ATAC": 67,
"TCTG": 68,
"TTAA": 69,
"TCAC": 70,
"TGAA": 71,
"TGGC": 72,
"TTGC": 73,
"TAAG": 74,
"TATC": 75,
"TAAC": 76,
"AAAG": 77,
"TTAC": 78,
"AAGC": 79,
"GGG": 80,
"TAGC": 81,
"GGC": 82,
"ATAT": 83,
"TACC": 84,
"AACC": 85,
"AATG": 86,
"TAGG": 87,
"GCC": 88,
"ATATT": 89,
"AGTC": 90,
"TTTTC": 91,
"AAAAC": 92,
"TGAC": 93,
"TTTAA": 94,
"AAAAG": 95,
"AATC": 96,
"TGTC": 97,
"TTATT": 98,
"ATAG": 99,
"TGAG": 100,
"TTTTG": 101,
"AAATT": 102,
"AATT": 103,
"AATAA": 104,
"TTTCC": 105,
"ACAG": 106,
"TCAG": 107,
"AAATG": 108,
"TGGGC": 109,
"ACTC": 110,
"AGGCC": 111,
"TTAG": 112,
"ACTG": 113,
"ACG": 114,
"ATATG": 115,
"TGGCC": 116,
"ATTTC": 117,
"ACAA": 118,
"ATCTC": 119,
"TATTC": 120,
"TGTAA": 121,
"ACTT": 122,
"ATGCC": 123,
"TAAAA": 124,
"AAAAAAAA": 125,
"ATTCC": 126,
"TTTAG": 127,
"TCCCC": 128,
"TTTGC": 129,
"TTCCC": 130,
"TGGGG": 131,
"TTCTC": 132,
"ATAAAA": 133,
"AGAAG": 134,
"TTTTTTTT": 135,
"ACCCC": 136,
"AGGGC": 137,
"ACCTC": 138,
"AGATG": 139,
"ATTAC": 140,
"AAGCC": 141,
"GGCC": 142,
"AGGAG": 143,
"TCAA": 144,
"ATTGC": 145,
"TATTG": 146,
"ATAAC": 147,
"ATATC": 148,
"TTTAC": 149,
"ATGGC": 150,
"AAGGC": 151,
"ACCAC": 152,
"GTG": 153,
"ATCCC": 154,
"AGAAC": 155,
"ATTTT": 156,
"TTGCC": 157,
"AAATC": 158,
"ATAAG": 159,
"TTGGC": 160,
"TGGAG": 161,
"ATGGG": 162,
"AAAGC": 163,
"AGGGG": 164,
"ATCAC": 165,
"ATTTG": 166,
"AATTC": 167,
"TGCAC": 168,
"TTTGG": 169,
"TCG": 170,
"AGAGC": 171,
"AAAGG": 172,
"GGGC": 173,
"TTGGG": 174,
"AGAAAA": 175,
"TATCC": 176,
"TCTCC": 177,
"ATAGC": 178,
"TGAGG": 179,
"TTTATT": 180,
"AGTAA": 181,
"AGAGG": 182,
"TCTTC": 183,
"ACATT": 184,
"TCCTG": 185,
"AGCCC": 186,
"TATGC": 187,
"TTAAAA": 188,
"AGATT": 189,
"TTAAC": 190,
"GGGG": 191,
"AAGAC": 192,
"TCATT": 193,
"TTCTG": 194,
"AGACC": 195,
"AAGGG": 196,
"ATACC": 197,
"TTTAT": 198,
"AAGTG": 199,
"TTATG": 200,
"AAGAA": 201,
"TAGCC": 202,
"TTCAC": 203,
"AGGTG": 204,
"TTGAA": 205,
"ATCTG": 206,
"AGCAC": 207,
"TGCTG": 208,
"AAACC": 209,
"ATGTG": 210,
"TTTTCC": 211,
"AGTTC": 212,
"TCCTC": 213,
"TATGG": 214,
"AATAC": 215,
"AGTGG": 216,
"TAGGC": 217,
"AGCTC": 218,
"ATAGG": 219,
"TTATC": 220,
"TTAAG": 221,
"TACCC": 222,
"TTTTTG": 223,
"AACAC": 224,
"TGCTC": 225,
"AGATC": 226,
"TCCCAGC": 227,
"AGCTG": 228,
"AATAG": 229,
"TCTTG": 230,
"AGTGGC": 231,
"ATTGG": 232,
"TACTC": 233,
"TAAAC": 234,
"AATGG": 235,
"AGGTC": 236,
"AGGAC": 237,
"TTGTG": 238,
"TATAC": 239,
"ATTTTC": 240,
"ATATAA": 241,
"AGGCTG": 242,
"ATTTAA": 243,
"AGTT": 244,
"AGTAG": 245,
"ATGAC": 246,
"AATGC": 247,
"TCCAC": 248,
"CCCC": 249,
"ATGTC": 250,
"AACTC": 251,
"TTTTTC": 252,
"TAAGC": 253,
"AAGTC": 254,
"TGGTG": 255,
"TATAA": 256,
"AGTGC": 257,
"TAAGG": 258,
"ACCTG": 259,
"TTAGC": 260,
"AAATAA": 261,
"TGCCTC": 262,
"AATCC": 263,
"TTGGCC": 264,
"TAGGG": 265,
"TGGAC": 266,
"TTGTC": 267,
"AACCC": 268,
"TTACC": 269,
"TAACC": 270,
"AATTTT": 271,
"AAAGAA": 272,
"ATTATT": 273,
"AGCG": 274,
"AAAAAC": 275,
"TAATG": 276,
"TTGAC": 277,
"AGTCC": 278,
"AACTG": 279,
"AGTTG": 280,
"AATTG": 281,
"TCTGC": 282,
"TTAGG": 283,
"TACAC": 284,
"AGAAGG": 285,
"ATATTC": 286,
"AAAACC": 287,
"AAAAGC": 288,
"TGCCC": 289,
"ACTGC": 290,
"AGAAGC": 291,
"TAATAA": 292,
"AATATT": 293,
"ACCATG": 294,
"TGGTC": 295,
"TTTTGC": 296,
"AACG": 297,
"TACTG": 298,
"ACACACAC": 299,
"ATTTTG": 300,
"TCCG": 301,
"TGCG": 302,
"AAAATG": 303,
"ACATG": 304,
"TCAGC": 305,
"ATCG": 306,
"AGTAC": 307,
"TTTTGG": 308,
"AATAT": 309,
"AGAGAA": 310,
"TTCG": 311,
"TCCAGCC": 312,
"ATATAC": 313,
"TCACC": 314,
"AAAAGG": 315,
"TGTGTGTG": 316,
"TCATC": 317,
"TGCTGGG": 318,
"TGAAG": 319,
"TGTAG": 320,
"TGTGG": 321,
"AAAAATT": 322,
"ACTTC": 323,
"TTCCCC": 324,
"ATAGAA": 325,
"TTGCCC": 326,
"AGGAGG": 327,
"TTTCCC": 328,
"TATATT": 329,
"ACCG": 330,
"ACTAC": 331,
"TCACTGC": 332,
"GCG": 333,
"TTTGTG": 334,
"ACAGC": 335,
"TCATG": 336,
"AGTTTT": 337,
"AGGAA": 338,
"TTTATG": 339,
"ATATTG": 340,
"TGATG": 341,
"TCTAA": 342,
"TGTGC": 343,
"AGGAAG": 344,
"TTTGGG": 345,
"TGTTC": 346,
"AGCCCC": 347,
"AGTTTC": 348,
"AGGCTGG": 349,
"TTTGCC": 350,
"ATTTCC": 351,
"ATACAC": 352,
"AAAATAA": 353,
"TAGAC": 354,
"AGGAGAA": 355,
"TGAGC": 356,
"TGGAA": 357,
"TTTTTAA": 358,
"AGCCTCCC": 359,
"ATGAA": 360,
"TTTAAG": 361,
"TCTGG": 362,
"TTTATC": 363,
"TTATAA": 364,
"TGATT": 365,
"AACAA": 366,
"TAGCTGGG": 367,
"TCAAG": 368,
"AAAAAA": 369,
"ACTTTGGG": 370,
"TATTCC": 371,
"TCAGG": 372,
"AACAG": 373,
"TTCTTC": 374,
"TGTGGC": 375,
"ATATGC": 376,
"ATTACAGGC": 377,
"AGGGGC": 378,
"AGGGCC": 379,
"TTATTC": 380,
"ATATCC": 381,
"TGTAATCCCAGC": 382,
"TACG": 383,
"AGAAAC": 384,
"TGTCC": 385,
"AGATGG": 386,
"TGTGCC": 387,
"TTTCTC": 388,
"TGAAC": 389,
"AGTCTC": 390,
"TGTTG": 391,
"ATTTTTT": 392,
"AAGAAG": 393,
"TGGGGC": 394,
"AGCAGC": 395,
"GCCC": 396,
"TTTGGC": 397,
"AGGCTGAGGC": 398,
"TGGGCC": 399,
"TTCTCC": 400,
"TAGAA": 401,
"TGGAGTGC": 402,
"ATTAA": 403,
"AGTGCC": 404,
"TGTCTC": 405,
"ATATGG": 406,
"ACATC": 407,
"TGGGGG": 408,
"TGACC": 409,
"ACTCC": 410,
"TAAAAC": 411,
"AGATAA": 412,
"TAATTTT": 413,
"TCAAC": 414,
"TCTAC": 415,
"TCTAG": 416,
"GAG": 417,
"TAAATG": 418,
"AGCAA": 419,
"TATATG": 420,
"ATATATAT": 421,
"ATTTGC": 422,
"TCCTCC": 423,
"CCCAC": 424,
"ATTTATT": 425,
"TCTGCC": 426,
"ATGGCC": 427,
"TCGC": 428,
"AGTATT": 429,
"AGAACC": 430,
"TTAAAC": 431,
"AAATTC": 432,
"AGAGAC": 433,
"ATTTAC": 434,
"ATTGCC": 435,
"AACAAC": 436,
"TTTAAC": 437,
"ACGG": 438,
"AAGAAAA": 439,
"TCTGGC": 440,
"ATTCTCC": 441,
"AGGTGG": 442,
"TGCTGC": 443,
"TTCAAG": 444,
"AGAGGG": 445,
"ACACC": 446,
"TCTTTT": 447,
"AGAGGC": 448,
"ATCACC": 449,
"TAAATT": 450,
"AAGGCC": 451,
"TTGCAGTG": 452,
"TGTAC": 453,
"AATTTC": 454,
"ATCCCC": 455,
"ACAAG": 456,
"ACAGG": 457,
"ACAAC": 458,
"TGCCCC": 459,
"AGATTC": 460,
"TTAGAA": 461,
"TTGGGG": 462,
"AGACAC": 463,
"TGGAAG": 464,
"ACCTCC": 465,
"ATGGGG": 466,
"AGCCTCC": 467,
"TTATTG": 468,
"TAAAAG": 469,
"ATCTTC": 470,
"ATCTCC": 471,
"TGAAGC": 472,
"TAATC": 473,
"AAATGC": 474,
"TTGTTG": 475,
"ATTCCC": 476,
"TACTAAAA": 477,
"ATAGTG": 478,
"AAATAC": 479,
"TTGGGC": 480,
"TAGAGAC": 481,
"TGTTTT": 482,
"TTCTGC": 483,
"TGGCCC": 484,
"TCTGTC": 485,
"AGCTCC": 486,
"AACTCC": 487,
"TTAGCC": 488,
"AAAGTGCTGGG": 489,
"ATAGAC": 490,
"TATTTTTAG": 491,
"ACTTG": 492,
"ACCACC": 493,
"AAACAC": 494,
"GTGG": 495,
"ATTTAG": 496,
"AGGAGC": 497,
"AGGCTGGAGTGC": 498,
"ATACCC": 499,
"ATGTAA": 500,
"ACGC": 501,
"AGTAT": 502,
"TTTACC": 503,
"ACTAA": 504,
"AGGCCC": 505,
"AAGGGG": 506,
"TCTCG": 507,
"ATGAAG": 508,
"AAAGAC": 509,
"TGAAAA": 510,
"AAGGGC": 511,
"ATAGGC": 512,
"AGAGTG": 513,
"AGCTGC": 514,
"ATGTTC": 515,
"TATTTC": 516,
"TGATC": 517,
"AGTTTG": 518,
"AGCTAA": 519,
"AGAGCC": 520,
"TGCTTC": 521,
"ATCATC": 522,
"AACATGG": 523,
"AGCTTC": 524,
"AAGAAC": 525,
"TTTTTTG": 526,
"AGGGGG": 527,
"ATAAGC": 528,
"TAAGCC": 529,
"ACTGG": 530,
"ACAAAA": 531,
"ATCATT": 532,
"TCTTTC": 533,
"ATGATG": 534,
"TGCAA": 535,
"AGGTTC": 536,
"AACATT": 537,
"ATGGGC": 538,
"ATAGAG": 539,
"AAATGG": 540,
"AGTTCC": 541,
"TTTAGC": 542,
"AACTTC": 543,
"AGCAAG": 544,
"ATAAAAC": 545,
"AAAATC": 546,
"AGCCAC": 547,
"AGGAAC": 548,
"TTAACC": 549,
"TATTTATT": 550,
"TTTCTG": 551,
"ATAAGG": 552,
"AGCCACC": 553,
"AGATGC": 554,
"TTAAGC": 555,
"TTGTAA": 556,
"AGTGTG": 557,
"AACCCC": 558,
"TTCATT": 559,
"ATCATG": 560,
"AATGAA": 561,
"AGGTGC": 562,
"AAAAAAAAAAAAAAAA": 563,
"AGGATG": 564,
"AGCCG": 565,
"TGGTGG": 566,
"AGTGGG": 567,
"TGCACTCCAGCC": 568,
"TATTGC": 569,
"TAGTC": 570,
"CCCG": 571,
"AAGTAA": 572,
"TAGTG": 573,
"TTTTTTTTTTTTTTTT": 574,
"AGCATT": 575,
"ATCTGC": 576,
"TCTCAC": 577,
"AAATTG": 578,
"TTTAGG": 579,
"AGACCC": 580,
"GGGCC": 581,
"TCCTTC": 582,
"ATAGGG": 583,
"AATATG": 584,
"TTATAC": 585,
"TAGAAG": 586,
"AAAGTG": 587,
"AAATCC": 588,
"TTCCTC": 589,
"TTTCAC": 590,
"AGTATG": 591,
"TACTAAAAATAC": 592,
"ATGTGC": 593,
"AGGAGGC": 594,
"TATATC": 595,
"TTCTAA": 596,
"TGAGGC": 597,
"ACACAC": 598,
"TCCCCC": 599,
"AACATC": 600,
"AAGCG": 601,
"AATGGC": 602,
"ACCCCC": 603,
"AGATAC": 604,
"ATAAAAG": 605,
"ATGATT": 606,
"TGGAGG": 607,
"AGTTAA": 608,
"": 609
}