zrthxn commited on
Commit
303ece6
·
0 Parent(s):

Create tokenizer

Browse files
BertTokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
BertTokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": true,
6
+ "mask_token": "[MASK]",
7
+ "max_len": 512,
8
+ "model_max_length": 512,
9
+ "never_split": null,
10
+ "pad_token": "[PAD]",
11
+ "sep_token": "[SEP]",
12
+ "strip_accents": null,
13
+ "tokenize_chinese_chars": true,
14
+ "tokenizer_class": "BertTokenizer",
15
+ "unk_token": "[UNK]"
16
+ }
BertTokenizer/vocab.txt ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ A
7
+ C
8
+ G
9
+ T
10
+ TT
11
+ AA
12
+ TG
13
+ AG
14
+ CC
15
+ TC
16
+ AC
17
+ GG
18
+ ATT
19
+ AT
20
+ ATG
21
+ GC
22
+ TAA
23
+ TCC
24
+ ACC
25
+ AAAA
26
+ AGG
27
+ ATC
28
+ AGC
29
+ TTC
30
+ AAG
31
+ TTTT
32
+ TGC
33
+ TGG
34
+ AAC
35
+ TTG
36
+ TAG
37
+ TAC
38
+ CCC
39
+ TATT
40
+ TGGG
41
+ TAT
42
+ AGAA
43
+ AGGG
44
+ TTTC
45
+ AGGC
46
+ AGCC
47
+ ATAA
48
+ TGTG
49
+ TTGG
50
+ ATTC
51
+ AAGG
52
+ ACAC
53
+ TCCC
54
+ TCTC
55
+ TATG
56
+ TTTG
57
+ TTCC
58
+ AGTG
59
+ ATGG
60
+ AGAC
61
+ AAAC
62
+ ACCC
63
+ TGCC
64
+ ATTG
65
+ ATCC
66
+ AGAG
67
+ ATGC
68
+ ATAC
69
+ TCTG
70
+ TTAA
71
+ TCAC
72
+ TGAA
73
+ TGGC
74
+ TTGC
75
+ TAAG
76
+ TATC
77
+ TAAC
78
+ AAAG
79
+ TTAC
80
+ AAGC
81
+ GGG
82
+ TAGC
83
+ GGC
84
+ ATAT
85
+ TACC
86
+ AACC
87
+ AATG
88
+ TAGG
89
+ GCC
90
+ ATATT
91
+ AGTC
92
+ TTTTC
93
+ AAAAC
94
+ TGAC
95
+ TTTAA
96
+ AAAAG
97
+ AATC
98
+ TGTC
99
+ TTATT
100
+ ATAG
101
+ TGAG
102
+ TTTTG
103
+ AAATT
104
+ AATT
105
+ AATAA
106
+ TTTCC
107
+ ACAG
108
+ TCAG
109
+ AAATG
110
+ TGGGC
111
+ ACTC
112
+ AGGCC
113
+ TTAG
114
+ ACTG
115
+ ACG
116
+ ATATG
117
+ TGGCC
118
+ ATTTC
119
+ ACAA
120
+ ATCTC
121
+ TATTC
122
+ TGTAA
123
+ ACTT
124
+ ATGCC
125
+ TAAAA
126
+ AAAAAAAA
127
+ ATTCC
128
+ TTTAG
129
+ TCCCC
130
+ TTTGC
131
+ TTCCC
132
+ TGGGG
133
+ TTCTC
134
+ ATAAAA
135
+ AGAAG
136
+ TTTTTTTT
137
+ ACCCC
138
+ AGGGC
139
+ ACCTC
140
+ AGATG
141
+ ATTAC
142
+ AAGCC
143
+ GGCC
144
+ AGGAG
145
+ TCAA
146
+ ATTGC
147
+ TATTG
148
+ ATAAC
149
+ ATATC
150
+ TTTAC
151
+ ATGGC
152
+ AAGGC
153
+ ACCAC
154
+ GTG
155
+ ATCCC
156
+ AGAAC
157
+ ATTTT
158
+ TTGCC
159
+ AAATC
160
+ ATAAG
161
+ TTGGC
162
+ TGGAG
163
+ ATGGG
164
+ AAAGC
165
+ AGGGG
166
+ ATCAC
167
+ ATTTG
168
+ AATTC
169
+ TGCAC
170
+ TTTGG
171
+ TCG
172
+ AGAGC
173
+ AAAGG
174
+ GGGC
175
+ TTGGG
176
+ AGAAAA
177
+ TATCC
178
+ TCTCC
179
+ ATAGC
180
+ TGAGG
181
+ TTTATT
182
+ AGTAA
183
+ AGAGG
184
+ TCTTC
185
+ ACATT
186
+ TCCTG
187
+ AGCCC
188
+ TATGC
189
+ TTAAAA
190
+ AGATT
191
+ TTAAC
192
+ GGGG
193
+ AAGAC
194
+ TCATT
195
+ TTCTG
196
+ AGACC
197
+ AAGGG
198
+ ATACC
199
+ TTTAT
200
+ AAGTG
201
+ TTATG
202
+ AAGAA
203
+ TAGCC
204
+ TTCAC
205
+ AGGTG
206
+ TTGAA
207
+ ATCTG
208
+ AGCAC
209
+ TGCTG
210
+ AAACC
211
+ ATGTG
212
+ TTTTCC
213
+ AGTTC
214
+ TCCTC
215
+ TATGG
216
+ AATAC
217
+ AGTGG
218
+ TAGGC
219
+ AGCTC
220
+ ATAGG
221
+ TTATC
222
+ TTAAG
223
+ TACCC
224
+ TTTTTG
225
+ AACAC
226
+ TGCTC
227
+ AGATC
228
+ TCCCAGC
229
+ AGCTG
230
+ AATAG
231
+ TCTTG
232
+ AGTGGC
233
+ ATTGG
234
+ TACTC
235
+ TAAAC
236
+ AATGG
237
+ AGGTC
238
+ AGGAC
239
+ TTGTG
240
+ TATAC
241
+ ATTTTC
242
+ ATATAA
243
+ AGGCTG
244
+ ATTTAA
245
+ AGTT
246
+ AGTAG
247
+ ATGAC
248
+ AATGC
249
+ TCCAC
250
+ CCCC
251
+ ATGTC
252
+ AACTC
253
+ TTTTTC
254
+ TAAGC
255
+ AAGTC
256
+ TGGTG
257
+ TATAA
258
+ AGTGC
259
+ TAAGG
260
+ ACCTG
261
+ TTAGC
262
+ AAATAA
263
+ TGCCTC
264
+ AATCC
265
+ TTGGCC
266
+ TAGGG
267
+ TGGAC
268
+ TTGTC
269
+ AACCC
270
+ TTACC
271
+ TAACC
272
+ AATTTT
273
+ AAAGAA
274
+ ATTATT
275
+ AGCG
276
+ AAAAAC
277
+ TAATG
278
+ TTGAC
279
+ AGTCC
280
+ AACTG
281
+ AGTTG
282
+ AATTG
283
+ TCTGC
284
+ TTAGG
285
+ TACAC
286
+ AGAAGG
287
+ ATATTC
288
+ AAAACC
289
+ AAAAGC
290
+ TGCCC
291
+ ACTGC
292
+ AGAAGC
293
+ TAATAA
294
+ AATATT
295
+ ACCATG
296
+ TGGTC
297
+ TTTTGC
298
+ AACG
299
+ TACTG
300
+ ACACACAC
301
+ ATTTTG
302
+ TCCG
303
+ TGCG
304
+ AAAATG
305
+ ACATG
306
+ TCAGC
307
+ ATCG
308
+ AGTAC
309
+ TTTTGG
310
+ AATAT
311
+ AGAGAA
312
+ TTCG
313
+ TCCAGCC
314
+ ATATAC
315
+ TCACC
316
+ AAAAGG
317
+ TGTGTGTG
318
+ TCATC
319
+ TGCTGGG
320
+ TGAAG
321
+ TGTAG
322
+ TGTGG
323
+ AAAAATT
324
+ ACTTC
325
+ TTCCCC
326
+ ATAGAA
327
+ TTGCCC
328
+ AGGAGG
329
+ TTTCCC
330
+ TATATT
331
+ ACCG
332
+ ACTAC
333
+ TCACTGC
334
+ GCG
335
+ TTTGTG
336
+ ACAGC
337
+ TCATG
338
+ AGTTTT
339
+ AGGAA
340
+ TTTATG
341
+ ATATTG
342
+ TGATG
343
+ TCTAA
344
+ TGTGC
345
+ AGGAAG
346
+ TTTGGG
347
+ TGTTC
348
+ AGCCCC
349
+ AGTTTC
350
+ AGGCTGG
351
+ TTTGCC
352
+ ATTTCC
353
+ ATACAC
354
+ AAAATAA
355
+ TAGAC
356
+ AGGAGAA
357
+ TGAGC
358
+ TGGAA
359
+ TTTTTAA
360
+ AGCCTCCC
361
+ ATGAA
362
+ TTTAAG
363
+ TCTGG
364
+ TTTATC
365
+ TTATAA
366
+ TGATT
367
+ AACAA
368
+ TAGCTGGG
369
+ TCAAG
370
+ AAAAAA
371
+ ACTTTGGG
372
+ TATTCC
373
+ TCAGG
374
+ AACAG
375
+ TTCTTC
376
+ TGTGGC
377
+ ATATGC
378
+ ATTACAGGC
379
+ AGGGGC
380
+ AGGGCC
381
+ TTATTC
382
+ ATATCC
383
+ TGTAATCCCAGC
384
+ TACG
385
+ AGAAAC
386
+ TGTCC
387
+ AGATGG
388
+ TGTGCC
389
+ TTTCTC
390
+ TGAAC
391
+ AGTCTC
392
+ TGTTG
393
+ ATTTTTT
394
+ AAGAAG
395
+ TGGGGC
396
+ AGCAGC
397
+ GCCC
398
+ TTTGGC
399
+ AGGCTGAGGC
400
+ TGGGCC
401
+ TTCTCC
402
+ TAGAA
403
+ TGGAGTGC
404
+ ATTAA
405
+ AGTGCC
406
+ TGTCTC
407
+ ATATGG
408
+ ACATC
409
+ TGGGGG
410
+ TGACC
411
+ ACTCC
412
+ TAAAAC
413
+ AGATAA
414
+ TAATTTT
415
+ TCAAC
416
+ TCTAC
417
+ TCTAG
418
+ GAG
419
+ TAAATG
420
+ AGCAA
421
+ TATATG
422
+ ATATATAT
423
+ ATTTGC
424
+ TCCTCC
425
+ CCCAC
426
+ ATTTATT
427
+ TCTGCC
428
+ ATGGCC
429
+ TCGC
430
+ AGTATT
431
+ AGAACC
432
+ TTAAAC
433
+ AAATTC
434
+ AGAGAC
435
+ ATTTAC
436
+ ATTGCC
437
+ AACAAC
438
+ TTTAAC
439
+ ACGG
440
+ AAGAAAA
441
+ TCTGGC
442
+ ATTCTCC
443
+ AGGTGG
444
+ TGCTGC
445
+ TTCAAG
446
+ AGAGGG
447
+ ACACC
448
+ TCTTTT
449
+ AGAGGC
450
+ ATCACC
451
+ TAAATT
452
+ AAGGCC
453
+ TTGCAGTG
454
+ TGTAC
455
+ AATTTC
456
+ ATCCCC
457
+ ACAAG
458
+ ACAGG
459
+ ACAAC
460
+ TGCCCC
461
+ AGATTC
462
+ TTAGAA
463
+ TTGGGG
464
+ AGACAC
465
+ TGGAAG
466
+ ACCTCC
467
+ ATGGGG
468
+ AGCCTCC
469
+ TTATTG
470
+ TAAAAG
471
+ ATCTTC
472
+ ATCTCC
473
+ TGAAGC
474
+ TAATC
475
+ AAATGC
476
+ TTGTTG
477
+ ATTCCC
478
+ TACTAAAA
479
+ ATAGTG
480
+ AAATAC
481
+ TTGGGC
482
+ TAGAGAC
483
+ TGTTTT
484
+ TTCTGC
485
+ TGGCCC
486
+ TCTGTC
487
+ AGCTCC
488
+ AACTCC
489
+ TTAGCC
490
+ AAAGTGCTGGG
491
+ ATAGAC
492
+ TATTTTTAG
493
+ ACTTG
494
+ ACCACC
495
+ AAACAC
496
+ GTGG
497
+ ATTTAG
498
+ AGGAGC
499
+ AGGCTGGAGTGC
500
+ ATACCC
501
+ ATGTAA
502
+ ACGC
503
+ AGTAT
504
+ TTTACC
505
+ ACTAA
506
+ AGGCCC
507
+ AAGGGG
508
+ TCTCG
509
+ ATGAAG
510
+ AAAGAC
511
+ TGAAAA
512
+ AAGGGC
513
+ ATAGGC
514
+ AGAGTG
515
+ AGCTGC
516
+ ATGTTC
517
+ TATTTC
518
+ TGATC
519
+ AGTTTG
520
+ AGCTAA
521
+ AGAGCC
522
+ TGCTTC
523
+ ATCATC
524
+ AACATGG
525
+ AGCTTC
526
+ AAGAAC
527
+ TTTTTTG
528
+ AGGGGG
529
+ ATAAGC
530
+ TAAGCC
531
+ ACTGG
532
+ ACAAAA
533
+ ATCATT
534
+ TCTTTC
535
+ ATGATG
536
+ TGCAA
537
+ AGGTTC
538
+ AACATT
539
+ ATGGGC
540
+ ATAGAG
541
+ AAATGG
542
+ AGTTCC
543
+ TTTAGC
544
+ AACTTC
545
+ AGCAAG
546
+ ATAAAAC
547
+ AAAATC
548
+ AGCCAC
549
+ AGGAAC
550
+ TTAACC
551
+ TATTTATT
552
+ TTTCTG
553
+ ATAAGG
554
+ AGCCACC
555
+ AGATGC
556
+ TTAAGC
557
+ TTGTAA
558
+ AGTGTG
559
+ AACCCC
560
+ TTCATT
561
+ ATCATG
562
+ AATGAA
563
+ AGGTGC
564
+ AAAAAAAAAAAAAAAA
565
+ AGGATG
566
+ AGCCG
567
+ TGGTGG
568
+ AGTGGG
569
+ TGCACTCCAGCC
570
+ TATTGC
571
+ TAGTC
572
+ CCCG
573
+ AAGTAA
574
+ TAGTG
575
+ TTTTTTTTTTTTTTTT
576
+ AGCATT
577
+ ATCTGC
578
+ TCTCAC
579
+ AAATTG
580
+ TTTAGG
581
+ AGACCC
582
+ GGGCC
583
+ TCCTTC
584
+ ATAGGG
585
+ AATATG
586
+ TTATAC
587
+ TAGAAG
588
+ AAAGTG
589
+ AAATCC
590
+ TTCCTC
591
+ TTTCAC
592
+ AGTATG
593
+ TACTAAAAATAC
594
+ ATGTGC
595
+ AGGAGGC
596
+ TATATC
597
+ TTCTAA
598
+ TGAGGC
599
+ ACACAC
600
+ TCCCCC
601
+ AACATC
602
+ AAGCG
603
+ AATGGC
604
+ ACCCCC
605
+ AGATAC
606
+ ATAAAAG
607
+ ATGATT
608
+ TGGAGG
609
+ AGTTAA
charBPEtokenizer.json ADDED
@@ -0,0 +1,1257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 1,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ }
15
+ ],
16
+ "normalizer": {
17
+ "type": "BertNormalizer",
18
+ "clean_text": true,
19
+ "handle_chinese_chars": true,
20
+ "strip_accents": null,
21
+ "lowercase": false
22
+ },
23
+ "pre_tokenizer": {
24
+ "type": "BertPreTokenizer"
25
+ },
26
+ "post_processor": null,
27
+ "decoder": {
28
+ "type": "BPEDecoder",
29
+ "suffix": ""
30
+ },
31
+ "model": {
32
+ "type": "BPE",
33
+ "dropout": null,
34
+ "unk_token": "[UNK]",
35
+ "continuing_subword_prefix": null,
36
+ "end_of_word_suffix": "",
37
+ "fuse_unk": false,
38
+ "vocab": {
39
+ "[PAD]": 0,
40
+ "[UNK]": 1,
41
+ "[CLS]": 2,
42
+ "[SEP]": 3,
43
+ "[MASK]": 4,
44
+ "A": 5,
45
+ "C": 6,
46
+ "G": 7,
47
+ "T": 8,
48
+ "TT": 9,
49
+ "AA": 10,
50
+ "TG": 11,
51
+ "AG": 12,
52
+ "CC": 13,
53
+ "TC": 14,
54
+ "AC": 15,
55
+ "GG": 16,
56
+ "ATT": 17,
57
+ "AT": 18,
58
+ "ATG": 19,
59
+ "GC": 20,
60
+ "TAA": 21,
61
+ "TCC": 22,
62
+ "ACC": 23,
63
+ "AAAA": 24,
64
+ "AGG": 25,
65
+ "ATC": 26,
66
+ "AGC": 27,
67
+ "TTC": 28,
68
+ "AAG": 29,
69
+ "TTTT": 30,
70
+ "TGC": 31,
71
+ "TGG": 32,
72
+ "AAC": 33,
73
+ "TTG": 34,
74
+ "TAG": 35,
75
+ "TAC": 36,
76
+ "CCC": 37,
77
+ "TATT": 38,
78
+ "TGGG": 39,
79
+ "TAT": 40,
80
+ "AGAA": 41,
81
+ "AGGG": 42,
82
+ "TTTC": 43,
83
+ "AGGC": 44,
84
+ "AGCC": 45,
85
+ "ATAA": 46,
86
+ "TGTG": 47,
87
+ "TTGG": 48,
88
+ "ATTC": 49,
89
+ "AAGG": 50,
90
+ "ACAC": 51,
91
+ "TCCC": 52,
92
+ "TCTC": 53,
93
+ "TATG": 54,
94
+ "TTTG": 55,
95
+ "TTCC": 56,
96
+ "AGTG": 57,
97
+ "ATGG": 58,
98
+ "AGAC": 59,
99
+ "AAAC": 60,
100
+ "ACCC": 61,
101
+ "TGCC": 62,
102
+ "ATTG": 63,
103
+ "ATCC": 64,
104
+ "AGAG": 65,
105
+ "ATGC": 66,
106
+ "ATAC": 67,
107
+ "TCTG": 68,
108
+ "TTAA": 69,
109
+ "TCAC": 70,
110
+ "TGAA": 71,
111
+ "TGGC": 72,
112
+ "TTGC": 73,
113
+ "TAAG": 74,
114
+ "TATC": 75,
115
+ "TAAC": 76,
116
+ "AAAG": 77,
117
+ "TTAC": 78,
118
+ "AAGC": 79,
119
+ "GGG": 80,
120
+ "TAGC": 81,
121
+ "GGC": 82,
122
+ "ATAT": 83,
123
+ "TACC": 84,
124
+ "AACC": 85,
125
+ "AATG": 86,
126
+ "TAGG": 87,
127
+ "GCC": 88,
128
+ "ATATT": 89,
129
+ "AGTC": 90,
130
+ "TTTTC": 91,
131
+ "AAAAC": 92,
132
+ "TGAC": 93,
133
+ "TTTAA": 94,
134
+ "AAAAG": 95,
135
+ "AATC": 96,
136
+ "TGTC": 97,
137
+ "TTATT": 98,
138
+ "ATAG": 99,
139
+ "TGAG": 100,
140
+ "TTTTG": 101,
141
+ "AAATT": 102,
142
+ "AATT": 103,
143
+ "AATAA": 104,
144
+ "TTTCC": 105,
145
+ "ACAG": 106,
146
+ "TCAG": 107,
147
+ "AAATG": 108,
148
+ "TGGGC": 109,
149
+ "ACTC": 110,
150
+ "AGGCC": 111,
151
+ "TTAG": 112,
152
+ "ACTG": 113,
153
+ "ACG": 114,
154
+ "ATATG": 115,
155
+ "TGGCC": 116,
156
+ "ATTTC": 117,
157
+ "ACAA": 118,
158
+ "ATCTC": 119,
159
+ "TATTC": 120,
160
+ "TGTAA": 121,
161
+ "ACTT": 122,
162
+ "ATGCC": 123,
163
+ "TAAAA": 124,
164
+ "AAAAAAAA": 125,
165
+ "ATTCC": 126,
166
+ "TTTAG": 127,
167
+ "TCCCC": 128,
168
+ "TTTGC": 129,
169
+ "TTCCC": 130,
170
+ "TGGGG": 131,
171
+ "TTCTC": 132,
172
+ "ATAAAA": 133,
173
+ "AGAAG": 134,
174
+ "TTTTTTTT": 135,
175
+ "ACCCC": 136,
176
+ "AGGGC": 137,
177
+ "ACCTC": 138,
178
+ "AGATG": 139,
179
+ "ATTAC": 140,
180
+ "AAGCC": 141,
181
+ "GGCC": 142,
182
+ "AGGAG": 143,
183
+ "TCAA": 144,
184
+ "ATTGC": 145,
185
+ "TATTG": 146,
186
+ "ATAAC": 147,
187
+ "ATATC": 148,
188
+ "TTTAC": 149,
189
+ "ATGGC": 150,
190
+ "AAGGC": 151,
191
+ "ACCAC": 152,
192
+ "GTG": 153,
193
+ "ATCCC": 154,
194
+ "AGAAC": 155,
195
+ "ATTTT": 156,
196
+ "TTGCC": 157,
197
+ "AAATC": 158,
198
+ "ATAAG": 159,
199
+ "TTGGC": 160,
200
+ "TGGAG": 161,
201
+ "ATGGG": 162,
202
+ "AAAGC": 163,
203
+ "AGGGG": 164,
204
+ "ATCAC": 165,
205
+ "ATTTG": 166,
206
+ "AATTC": 167,
207
+ "TGCAC": 168,
208
+ "TTTGG": 169,
209
+ "TCG": 170,
210
+ "AGAGC": 171,
211
+ "AAAGG": 172,
212
+ "GGGC": 173,
213
+ "TTGGG": 174,
214
+ "AGAAAA": 175,
215
+ "TATCC": 176,
216
+ "TCTCC": 177,
217
+ "ATAGC": 178,
218
+ "TGAGG": 179,
219
+ "TTTATT": 180,
220
+ "AGTAA": 181,
221
+ "AGAGG": 182,
222
+ "TCTTC": 183,
223
+ "ACATT": 184,
224
+ "TCCTG": 185,
225
+ "AGCCC": 186,
226
+ "TATGC": 187,
227
+ "TTAAAA": 188,
228
+ "AGATT": 189,
229
+ "TTAAC": 190,
230
+ "GGGG": 191,
231
+ "AAGAC": 192,
232
+ "TCATT": 193,
233
+ "TTCTG": 194,
234
+ "AGACC": 195,
235
+ "AAGGG": 196,
236
+ "ATACC": 197,
237
+ "TTTAT": 198,
238
+ "AAGTG": 199,
239
+ "TTATG": 200,
240
+ "AAGAA": 201,
241
+ "TAGCC": 202,
242
+ "TTCAC": 203,
243
+ "AGGTG": 204,
244
+ "TTGAA": 205,
245
+ "ATCTG": 206,
246
+ "AGCAC": 207,
247
+ "TGCTG": 208,
248
+ "AAACC": 209,
249
+ "ATGTG": 210,
250
+ "TTTTCC": 211,
251
+ "AGTTC": 212,
252
+ "TCCTC": 213,
253
+ "TATGG": 214,
254
+ "AATAC": 215,
255
+ "AGTGG": 216,
256
+ "TAGGC": 217,
257
+ "AGCTC": 218,
258
+ "ATAGG": 219,
259
+ "TTATC": 220,
260
+ "TTAAG": 221,
261
+ "TACCC": 222,
262
+ "TTTTTG": 223,
263
+ "AACAC": 224,
264
+ "TGCTC": 225,
265
+ "AGATC": 226,
266
+ "TCCCAGC": 227,
267
+ "AGCTG": 228,
268
+ "AATAG": 229,
269
+ "TCTTG": 230,
270
+ "AGTGGC": 231,
271
+ "ATTGG": 232,
272
+ "TACTC": 233,
273
+ "TAAAC": 234,
274
+ "AATGG": 235,
275
+ "AGGTC": 236,
276
+ "AGGAC": 237,
277
+ "TTGTG": 238,
278
+ "TATAC": 239,
279
+ "ATTTTC": 240,
280
+ "ATATAA": 241,
281
+ "AGGCTG": 242,
282
+ "ATTTAA": 243,
283
+ "AGTT": 244,
284
+ "AGTAG": 245,
285
+ "ATGAC": 246,
286
+ "AATGC": 247,
287
+ "TCCAC": 248,
288
+ "CCCC": 249,
289
+ "ATGTC": 250,
290
+ "AACTC": 251,
291
+ "TTTTTC": 252,
292
+ "TAAGC": 253,
293
+ "AAGTC": 254,
294
+ "TGGTG": 255,
295
+ "TATAA": 256,
296
+ "AGTGC": 257,
297
+ "TAAGG": 258,
298
+ "ACCTG": 259,
299
+ "TTAGC": 260,
300
+ "AAATAA": 261,
301
+ "TGCCTC": 262,
302
+ "AATCC": 263,
303
+ "TTGGCC": 264,
304
+ "TAGGG": 265,
305
+ "TGGAC": 266,
306
+ "TTGTC": 267,
307
+ "AACCC": 268,
308
+ "TTACC": 269,
309
+ "TAACC": 270,
310
+ "AATTTT": 271,
311
+ "AAAGAA": 272,
312
+ "ATTATT": 273,
313
+ "AGCG": 274,
314
+ "AAAAAC": 275,
315
+ "TAATG": 276,
316
+ "TTGAC": 277,
317
+ "AGTCC": 278,
318
+ "AACTG": 279,
319
+ "AGTTG": 280,
320
+ "AATTG": 281,
321
+ "TCTGC": 282,
322
+ "TTAGG": 283,
323
+ "TACAC": 284,
324
+ "AGAAGG": 285,
325
+ "ATATTC": 286,
326
+ "AAAACC": 287,
327
+ "AAAAGC": 288,
328
+ "TGCCC": 289,
329
+ "ACTGC": 290,
330
+ "AGAAGC": 291,
331
+ "TAATAA": 292,
332
+ "AATATT": 293,
333
+ "ACCATG": 294,
334
+ "TGGTC": 295,
335
+ "TTTTGC": 296,
336
+ "AACG": 297,
337
+ "TACTG": 298,
338
+ "ACACACAC": 299,
339
+ "ATTTTG": 300,
340
+ "TCCG": 301,
341
+ "TGCG": 302,
342
+ "AAAATG": 303,
343
+ "ACATG": 304,
344
+ "TCAGC": 305,
345
+ "ATCG": 306,
346
+ "AGTAC": 307,
347
+ "TTTTGG": 308,
348
+ "AATAT": 309,
349
+ "AGAGAA": 310,
350
+ "TTCG": 311,
351
+ "TCCAGCC": 312,
352
+ "ATATAC": 313,
353
+ "TCACC": 314,
354
+ "AAAAGG": 315,
355
+ "TGTGTGTG": 316,
356
+ "TCATC": 317,
357
+ "TGCTGGG": 318,
358
+ "TGAAG": 319,
359
+ "TGTAG": 320,
360
+ "TGTGG": 321,
361
+ "AAAAATT": 322,
362
+ "ACTTC": 323,
363
+ "TTCCCC": 324,
364
+ "ATAGAA": 325,
365
+ "TTGCCC": 326,
366
+ "AGGAGG": 327,
367
+ "TTTCCC": 328,
368
+ "TATATT": 329,
369
+ "ACCG": 330,
370
+ "ACTAC": 331,
371
+ "TCACTGC": 332,
372
+ "GCG": 333,
373
+ "TTTGTG": 334,
374
+ "ACAGC": 335,
375
+ "TCATG": 336,
376
+ "AGTTTT": 337,
377
+ "AGGAA": 338,
378
+ "TTTATG": 339,
379
+ "ATATTG": 340,
380
+ "TGATG": 341,
381
+ "TCTAA": 342,
382
+ "TGTGC": 343,
383
+ "AGGAAG": 344,
384
+ "TTTGGG": 345,
385
+ "TGTTC": 346,
386
+ "AGCCCC": 347,
387
+ "AGTTTC": 348,
388
+ "AGGCTGG": 349,
389
+ "TTTGCC": 350,
390
+ "ATTTCC": 351,
391
+ "ATACAC": 352,
392
+ "AAAATAA": 353,
393
+ "TAGAC": 354,
394
+ "AGGAGAA": 355,
395
+ "TGAGC": 356,
396
+ "TGGAA": 357,
397
+ "TTTTTAA": 358,
398
+ "AGCCTCCC": 359,
399
+ "ATGAA": 360,
400
+ "TTTAAG": 361,
401
+ "TCTGG": 362,
402
+ "TTTATC": 363,
403
+ "TTATAA": 364,
404
+ "TGATT": 365,
405
+ "AACAA": 366,
406
+ "TAGCTGGG": 367,
407
+ "TCAAG": 368,
408
+ "AAAAAA": 369,
409
+ "ACTTTGGG": 370,
410
+ "TATTCC": 371,
411
+ "TCAGG": 372,
412
+ "AACAG": 373,
413
+ "TTCTTC": 374,
414
+ "TGTGGC": 375,
415
+ "ATATGC": 376,
416
+ "ATTACAGGC": 377,
417
+ "AGGGGC": 378,
418
+ "AGGGCC": 379,
419
+ "TTATTC": 380,
420
+ "ATATCC": 381,
421
+ "TGTAATCCCAGC": 382,
422
+ "TACG": 383,
423
+ "AGAAAC": 384,
424
+ "TGTCC": 385,
425
+ "AGATGG": 386,
426
+ "TGTGCC": 387,
427
+ "TTTCTC": 388,
428
+ "TGAAC": 389,
429
+ "AGTCTC": 390,
430
+ "TGTTG": 391,
431
+ "ATTTTTT": 392,
432
+ "AAGAAG": 393,
433
+ "TGGGGC": 394,
434
+ "AGCAGC": 395,
435
+ "GCCC": 396,
436
+ "TTTGGC": 397,
437
+ "AGGCTGAGGC": 398,
438
+ "TGGGCC": 399,
439
+ "TTCTCC": 400,
440
+ "TAGAA": 401,
441
+ "TGGAGTGC": 402,
442
+ "ATTAA": 403,
443
+ "AGTGCC": 404,
444
+ "TGTCTC": 405,
445
+ "ATATGG": 406,
446
+ "ACATC": 407,
447
+ "TGGGGG": 408,
448
+ "TGACC": 409,
449
+ "ACTCC": 410,
450
+ "TAAAAC": 411,
451
+ "AGATAA": 412,
452
+ "TAATTTT": 413,
453
+ "TCAAC": 414,
454
+ "TCTAC": 415,
455
+ "TCTAG": 416,
456
+ "GAG": 417,
457
+ "TAAATG": 418,
458
+ "AGCAA": 419,
459
+ "TATATG": 420,
460
+ "ATATATAT": 421,
461
+ "ATTTGC": 422,
462
+ "TCCTCC": 423,
463
+ "CCCAC": 424,
464
+ "ATTTATT": 425,
465
+ "TCTGCC": 426,
466
+ "ATGGCC": 427,
467
+ "TCGC": 428,
468
+ "AGTATT": 429,
469
+ "AGAACC": 430,
470
+ "TTAAAC": 431,
471
+ "AAATTC": 432,
472
+ "AGAGAC": 433,
473
+ "ATTTAC": 434,
474
+ "ATTGCC": 435,
475
+ "AACAAC": 436,
476
+ "TTTAAC": 437,
477
+ "ACGG": 438,
478
+ "AAGAAAA": 439,
479
+ "TCTGGC": 440,
480
+ "ATTCTCC": 441,
481
+ "AGGTGG": 442,
482
+ "TGCTGC": 443,
483
+ "TTCAAG": 444,
484
+ "AGAGGG": 445,
485
+ "ACACC": 446,
486
+ "TCTTTT": 447,
487
+ "AGAGGC": 448,
488
+ "ATCACC": 449,
489
+ "TAAATT": 450,
490
+ "AAGGCC": 451,
491
+ "TTGCAGTG": 452,
492
+ "TGTAC": 453,
493
+ "AATTTC": 454,
494
+ "ATCCCC": 455,
495
+ "ACAAG": 456,
496
+ "ACAGG": 457,
497
+ "ACAAC": 458,
498
+ "TGCCCC": 459,
499
+ "AGATTC": 460,
500
+ "TTAGAA": 461,
501
+ "TTGGGG": 462,
502
+ "AGACAC": 463,
503
+ "TGGAAG": 464,
504
+ "ACCTCC": 465,
505
+ "ATGGGG": 466,
506
+ "AGCCTCC": 467,
507
+ "TTATTG": 468,
508
+ "TAAAAG": 469,
509
+ "ATCTTC": 470,
510
+ "ATCTCC": 471,
511
+ "TGAAGC": 472,
512
+ "TAATC": 473,
513
+ "AAATGC": 474,
514
+ "TTGTTG": 475,
515
+ "ATTCCC": 476,
516
+ "TACTAAAA": 477,
517
+ "ATAGTG": 478,
518
+ "AAATAC": 479,
519
+ "TTGGGC": 480,
520
+ "TAGAGAC": 481,
521
+ "TGTTTT": 482,
522
+ "TTCTGC": 483,
523
+ "TGGCCC": 484,
524
+ "TCTGTC": 485,
525
+ "AGCTCC": 486,
526
+ "AACTCC": 487,
527
+ "TTAGCC": 488,
528
+ "AAAGTGCTGGG": 489,
529
+ "ATAGAC": 490,
530
+ "TATTTTTAG": 491,
531
+ "ACTTG": 492,
532
+ "ACCACC": 493,
533
+ "AAACAC": 494,
534
+ "GTGG": 495,
535
+ "ATTTAG": 496,
536
+ "AGGAGC": 497,
537
+ "AGGCTGGAGTGC": 498,
538
+ "ATACCC": 499,
539
+ "ATGTAA": 500,
540
+ "ACGC": 501,
541
+ "AGTAT": 502,
542
+ "TTTACC": 503,
543
+ "ACTAA": 504,
544
+ "AGGCCC": 505,
545
+ "AAGGGG": 506,
546
+ "TCTCG": 507,
547
+ "ATGAAG": 508,
548
+ "AAAGAC": 509,
549
+ "TGAAAA": 510,
550
+ "AAGGGC": 511,
551
+ "ATAGGC": 512,
552
+ "AGAGTG": 513,
553
+ "AGCTGC": 514,
554
+ "ATGTTC": 515,
555
+ "TATTTC": 516,
556
+ "TGATC": 517,
557
+ "AGTTTG": 518,
558
+ "AGCTAA": 519,
559
+ "AGAGCC": 520,
560
+ "TGCTTC": 521,
561
+ "ATCATC": 522,
562
+ "AACATGG": 523,
563
+ "AGCTTC": 524,
564
+ "AAGAAC": 525,
565
+ "TTTTTTG": 526,
566
+ "AGGGGG": 527,
567
+ "ATAAGC": 528,
568
+ "TAAGCC": 529,
569
+ "ACTGG": 530,
570
+ "ACAAAA": 531,
571
+ "ATCATT": 532,
572
+ "TCTTTC": 533,
573
+ "ATGATG": 534,
574
+ "TGCAA": 535,
575
+ "AGGTTC": 536,
576
+ "AACATT": 537,
577
+ "ATGGGC": 538,
578
+ "ATAGAG": 539,
579
+ "AAATGG": 540,
580
+ "AGTTCC": 541,
581
+ "TTTAGC": 542,
582
+ "AACTTC": 543,
583
+ "AGCAAG": 544,
584
+ "ATAAAAC": 545,
585
+ "AAAATC": 546,
586
+ "AGCCAC": 547,
587
+ "AGGAAC": 548,
588
+ "TTAACC": 549,
589
+ "TATTTATT": 550,
590
+ "TTTCTG": 551,
591
+ "ATAAGG": 552,
592
+ "AGCCACC": 553,
593
+ "AGATGC": 554,
594
+ "TTAAGC": 555,
595
+ "TTGTAA": 556,
596
+ "AGTGTG": 557,
597
+ "AACCCC": 558,
598
+ "TTCATT": 559,
599
+ "ATCATG": 560,
600
+ "AATGAA": 561,
601
+ "AGGTGC": 562,
602
+ "AAAAAAAAAAAAAAAA": 563,
603
+ "AGGATG": 564,
604
+ "AGCCG": 565,
605
+ "TGGTGG": 566,
606
+ "AGTGGG": 567,
607
+ "TGCACTCCAGCC": 568,
608
+ "TATTGC": 569,
609
+ "TAGTC": 570,
610
+ "CCCG": 571,
611
+ "AAGTAA": 572,
612
+ "TAGTG": 573,
613
+ "TTTTTTTTTTTTTTTT": 574,
614
+ "AGCATT": 575,
615
+ "ATCTGC": 576,
616
+ "TCTCAC": 577,
617
+ "AAATTG": 578,
618
+ "TTTAGG": 579,
619
+ "AGACCC": 580,
620
+ "GGGCC": 581,
621
+ "TCCTTC": 582,
622
+ "ATAGGG": 583,
623
+ "AATATG": 584,
624
+ "TTATAC": 585,
625
+ "TAGAAG": 586,
626
+ "AAAGTG": 587,
627
+ "AAATCC": 588,
628
+ "TTCCTC": 589,
629
+ "TTTCAC": 590,
630
+ "AGTATG": 591,
631
+ "TACTAAAAATAC": 592,
632
+ "ATGTGC": 593,
633
+ "AGGAGGC": 594,
634
+ "TATATC": 595,
635
+ "TTCTAA": 596,
636
+ "TGAGGC": 597,
637
+ "ACACAC": 598,
638
+ "TCCCCC": 599,
639
+ "AACATC": 600,
640
+ "AAGCG": 601,
641
+ "AATGGC": 602,
642
+ "ACCCCC": 603,
643
+ "AGATAC": 604,
644
+ "ATAAAAG": 605,
645
+ "ATGATT": 606,
646
+ "TGGAGG": 607,
647
+ "AGTTAA": 608,
648
+ "": 609
649
+ },
650
+ "merges": [
651
+ "A ",
652
+ "C ",
653
+ "G ",
654
+ "T ",
655
+ "T T",
656
+ "A A",
657
+ "T G",
658
+ "A G",
659
+ "C C",
660
+ "T C",
661
+ "A C",
662
+ "G G",
663
+ "A TT",
664
+ "A T",
665
+ "A TG",
666
+ "G C",
667
+ "T AA",
668
+ "T CC",
669
+ "A CC",
670
+ "AA AA",
671
+ "AG G",
672
+ "A TC",
673
+ "AG C",
674
+ "TT C",
675
+ "AA G",
676
+ "TT TT",
677
+ "TG C",
678
+ "TG G",
679
+ "AA C",
680
+ "TT G",
681
+ "T AG",
682
+ "T AC",
683
+ "CC C",
684
+ "T ATT",
685
+ "TG GG",
686
+ "T AT",
687
+ "AG AA",
688
+ "AG GG",
689
+ "TT TC",
690
+ "AG GC",
691
+ "AG CC",
692
+ "AT AA",
693
+ "TG TG",
694
+ "TT GG",
695
+ "ATT C",
696
+ "AA GG",
697
+ "AC AC",
698
+ "TCC C",
699
+ "TC TC",
700
+ "T ATG",
701
+ "TT TG",
702
+ "TT CC",
703
+ "AG TG",
704
+ "ATG G",
705
+ "AG AC",
706
+ "AA AC",
707
+ "ACC C",
708
+ "TG CC",
709
+ "ATT G",
710
+ "AT CC",
711
+ "AG AG",
712
+ "ATG C",
713
+ "AT AC",
714
+ "TC TG",
715
+ "TT AA",
716
+ "TC AC",
717
+ "TG AA",
718
+ "TG GC",
719
+ "TT GC",
720
+ "TAA G",
721
+ "T ATC",
722
+ "TAA C",
723
+ "AA AG",
724
+ "TT AC",
725
+ "AA GC",
726
+ "GG G",
727
+ "T AGC",
728
+ "GG C",
729
+ "AT AT",
730
+ "T ACC",
731
+ "AA CC",
732
+ "AA TG",
733
+ "T AGG",
734
+ "G CC",
735
+ "AT ATT",
736
+ "AG TC",
737
+ "TT TTC",
738
+ "AAAA C",
739
+ "TG AC",
740
+ "TT TAA",
741
+ "AAAA G",
742
+ "AA TC",
743
+ "TG TC",
744
+ "TT ATT",
745
+ "AT AG",
746
+ "TG AG",
747
+ "TTTT G",
748
+ "AA ATT",
749
+ "AA TT",
750
+ "AA TAA",
751
+ "TT TCC",
752
+ "AC AG",
753
+ "TC AG",
754
+ "AA ATG",
755
+ "TGGG C",
756
+ "AC TC",
757
+ "AGG CC",
758
+ "TT AG",
759
+ "AC TG",
760
+ "AC G",
761
+ "AT ATG",
762
+ "TGG CC",
763
+ "ATT TC",
764
+ "AC AA",
765
+ "ATC TC",
766
+ "TATT C",
767
+ "TG TAA",
768
+ "AC TT",
769
+ "ATG CC",
770
+ "TAA AA",
771
+ "AAAA AAAA",
772
+ "ATT CC",
773
+ "TT TAG",
774
+ "TCC CC",
775
+ "TT TGC",
776
+ "TT CCC",
777
+ "TGGG G",
778
+ "TTC TC",
779
+ "AT AAAA",
780
+ "AG AAG",
781
+ "TTTT TTTT",
782
+ "ACC CC",
783
+ "AGGG C",
784
+ "ACC TC",
785
+ "AG ATG",
786
+ "ATT AC",
787
+ "AAG CC",
788
+ "GG CC",
789
+ "AGG AG",
790
+ "TC AA",
791
+ "ATT GC",
792
+ "TATT G",
793
+ "AT AAC",
794
+ "AT ATC",
795
+ "TT TAC",
796
+ "ATG GC",
797
+ "AAGG C",
798
+ "ACC AC",
799
+ "G TG",
800
+ "AT CCC",
801
+ "AG AAC",
802
+ "ATT TT",
803
+ "TTG CC",
804
+ "AA ATC",
805
+ "AT AAG",
806
+ "TTGG C",
807
+ "TGG AG",
808
+ "ATG GG",
809
+ "AA AGC",
810
+ "AGGG G",
811
+ "ATC AC",
812
+ "ATT TG",
813
+ "AA TTC",
814
+ "TGC AC",
815
+ "TT TGG",
816
+ "TC G",
817
+ "AG AGC",
818
+ "AA AGG",
819
+ "GG GC",
820
+ "TTGG G",
821
+ "AG AAAA",
822
+ "TAT CC",
823
+ "TC TCC",
824
+ "AT AGC",
825
+ "TG AGG",
826
+ "TT TATT",
827
+ "AG TAA",
828
+ "AG AGG",
829
+ "TC TTC",
830
+ "AC ATT",
831
+ "TCC TG",
832
+ "AG CCC",
833
+ "TATG C",
834
+ "TT AAAA",
835
+ "AG ATT",
836
+ "TT AAC",
837
+ "GG GG",
838
+ "AAG AC",
839
+ "TC ATT",
840
+ "TTC TG",
841
+ "AG ACC",
842
+ "AAGG G",
843
+ "AT ACC",
844
+ "TT TAT",
845
+ "AAG TG",
846
+ "TT ATG",
847
+ "AAG AA",
848
+ "TAG CC",
849
+ "TTC AC",
850
+ "AGG TG",
851
+ "TTG AA",
852
+ "ATC TG",
853
+ "AGC AC",
854
+ "TGC TG",
855
+ "AA ACC",
856
+ "ATG TG",
857
+ "TTTT CC",
858
+ "AG TTC",
859
+ "TCC TC",
860
+ "TATG G",
861
+ "AA TAC",
862
+ "AG TGG",
863
+ "TAG GC",
864
+ "AGC TC",
865
+ "AT AGG",
866
+ "TT ATC",
867
+ "TT AAG",
868
+ "T ACCC",
869
+ "TTTT TG",
870
+ "AAC AC",
871
+ "TGC TC",
872
+ "AG ATC",
873
+ "TCCC AGC",
874
+ "AGC TG",
875
+ "AA TAG",
876
+ "TC TTG",
877
+ "AGTG GC",
878
+ "ATT GG",
879
+ "TAC TC",
880
+ "TAA AC",
881
+ "AA TGG",
882
+ "AGG TC",
883
+ "AGG AC",
884
+ "TTG TG",
885
+ "TAT AC",
886
+ "ATT TTC",
887
+ "AT ATAA",
888
+ "AGGC TG",
889
+ "ATT TAA",
890
+ "AG TT",
891
+ "AG TAG",
892
+ "ATG AC",
893
+ "AA TGC",
894
+ "TCC AC",
895
+ "CC CC",
896
+ "ATG TC",
897
+ "AAC TC",
898
+ "TTTT TC",
899
+ "TAA GC",
900
+ "AAG TC",
901
+ "TGG TG",
902
+ "TAT AA",
903
+ "AG TGC",
904
+ "TAA GG",
905
+ "ACC TG",
906
+ "TT AGC",
907
+ "AA ATAA",
908
+ "TGCC TC",
909
+ "AA TCC",
910
+ "TTGG CC",
911
+ "TAG GG",
912
+ "TGG AC",
913
+ "TTG TC",
914
+ "AA CCC",
915
+ "TT ACC",
916
+ "TAA CC",
917
+ "AA TTTT",
918
+ "AA AGAA",
919
+ "ATT ATT",
920
+ "AGC G",
921
+ "AAAA AC",
922
+ "TAA TG",
923
+ "TTG AC",
924
+ "AG TCC",
925
+ "AAC TG",
926
+ "AG TTG",
927
+ "AA TTG",
928
+ "TC TGC",
929
+ "TT AGG",
930
+ "TAC AC",
931
+ "AGAA GG",
932
+ "AT ATTC",
933
+ "AAAA CC",
934
+ "AAAA GC",
935
+ "TG CCC",
936
+ "AC TGC",
937
+ "AGAA GC",
938
+ "TAA TAA",
939
+ "AA TATT",
940
+ "ACC ATG",
941
+ "TGG TC",
942
+ "TTTT GC",
943
+ "AAC G",
944
+ "TAC TG",
945
+ "ACAC ACAC",
946
+ "ATT TTG",
947
+ "TCC G",
948
+ "TGC G",
949
+ "AAAA TG",
950
+ "AC ATG",
951
+ "TC AGC",
952
+ "ATC G",
953
+ "AG TAC",
954
+ "TTTT GG",
955
+ "AA TAT",
956
+ "AG AGAA",
957
+ "TTC G",
958
+ "TCC AGCC",
959
+ "AT ATAC",
960
+ "TC ACC",
961
+ "AAAA GG",
962
+ "TGTG TGTG",
963
+ "TC ATC",
964
+ "TGC TGGG",
965
+ "TG AAG",
966
+ "TG TAG",
967
+ "TG TGG",
968
+ "AAAA ATT",
969
+ "AC TTC",
970
+ "TTCC CC",
971
+ "AT AGAA",
972
+ "TTG CCC",
973
+ "AGG AGG",
974
+ "TT TCCC",
975
+ "TAT ATT",
976
+ "ACC G",
977
+ "AC TAC",
978
+ "TCAC TGC",
979
+ "GC G",
980
+ "TT TGTG",
981
+ "AC AGC",
982
+ "TC ATG",
983
+ "AG TTTT",
984
+ "AGG AA",
985
+ "TT TATG",
986
+ "AT ATTG",
987
+ "TG ATG",
988
+ "TC TAA",
989
+ "TG TGC",
990
+ "AGG AAG",
991
+ "TT TGGG",
992
+ "TG TTC",
993
+ "AGCC CC",
994
+ "AG TTTC",
995
+ "AGGC TGG",
996
+ "TTTG CC",
997
+ "ATT TCC",
998
+ "AT ACAC",
999
+ "AAAA TAA",
1000
+ "TAG AC",
1001
+ "AGG AGAA",
1002
+ "TG AGC",
1003
+ "TGG AA",
1004
+ "TTTT TAA",
1005
+ "AGCC TCCC",
1006
+ "ATG AA",
1007
+ "TT TAAG",
1008
+ "TC TGG",
1009
+ "TT TATC",
1010
+ "TT ATAA",
1011
+ "TG ATT",
1012
+ "AAC AA",
1013
+ "TAGC TGGG",
1014
+ "TC AAG",
1015
+ "AAAA AA",
1016
+ "ACTT TGGG",
1017
+ "TATT CC",
1018
+ "TC AGG",
1019
+ "AAC AG",
1020
+ "TTC TTC",
1021
+ "TGTG GC",
1022
+ "AT ATGC",
1023
+ "ATTAC AGGC",
1024
+ "AGGG GC",
1025
+ "AGGG CC",
1026
+ "TT ATTC",
1027
+ "AT ATCC",
1028
+ "TGTAA TCCCAGC",
1029
+ "TAC G",
1030
+ "AGAA AC",
1031
+ "TG TCC",
1032
+ "AG ATGG",
1033
+ "TGTG CC",
1034
+ "TTTC TC",
1035
+ "TG AAC",
1036
+ "AG TCTC",
1037
+ "TG TTG",
1038
+ "ATT TTTT",
1039
+ "AAG AAG",
1040
+ "TGGG GC",
1041
+ "AGC AGC",
1042
+ "G CCC",
1043
+ "TTTG GC",
1044
+ "AGGCTG AGGC",
1045
+ "TGGG CC",
1046
+ "TTC TCC",
1047
+ "TAG AA",
1048
+ "TGGAG TGC",
1049
+ "ATT AA",
1050
+ "AGTG CC",
1051
+ "TG TCTC",
1052
+ "AT ATGG",
1053
+ "AC ATC",
1054
+ "TGGG GG",
1055
+ "TG ACC",
1056
+ "AC TCC",
1057
+ "TAA AAC",
1058
+ "AG ATAA",
1059
+ "TAA TTTT",
1060
+ "TC AAC",
1061
+ "TC TAC",
1062
+ "TC TAG",
1063
+ "G AG",
1064
+ "TAA ATG",
1065
+ "AGC AA",
1066
+ "TAT ATG",
1067
+ "ATAT ATAT",
1068
+ "ATT TGC",
1069
+ "TCC TCC",
1070
+ "CCC AC",
1071
+ "ATT TATT",
1072
+ "TC TGCC",
1073
+ "ATGG CC",
1074
+ "TC GC",
1075
+ "AG TATT",
1076
+ "AGAA CC",
1077
+ "TT AAAC",
1078
+ "AA ATTC",
1079
+ "AG AGAC",
1080
+ "ATT TAC",
1081
+ "ATTG CC",
1082
+ "AAC AAC",
1083
+ "TT TAAC",
1084
+ "AC GG",
1085
+ "AAG AAAA",
1086
+ "TCTG GC",
1087
+ "ATTC TCC",
1088
+ "AGG TGG",
1089
+ "TGC TGC",
1090
+ "TTC AAG",
1091
+ "AG AGGG",
1092
+ "AC ACC",
1093
+ "TC TTTT",
1094
+ "AG AGGC",
1095
+ "ATC ACC",
1096
+ "TAA ATT",
1097
+ "AAGG CC",
1098
+ "TTGC AGTG",
1099
+ "TG TAC",
1100
+ "AA TTTC",
1101
+ "ATCC CC",
1102
+ "AC AAG",
1103
+ "AC AGG",
1104
+ "AC AAC",
1105
+ "TGCC CC",
1106
+ "AG ATTC",
1107
+ "TT AGAA",
1108
+ "TTGG GG",
1109
+ "AG ACAC",
1110
+ "TGG AAG",
1111
+ "ACC TCC",
1112
+ "ATG GGG",
1113
+ "AGCC TCC",
1114
+ "TT ATTG",
1115
+ "TAA AAG",
1116
+ "ATC TTC",
1117
+ "ATC TCC",
1118
+ "TGAA GC",
1119
+ "TAA TC",
1120
+ "AA ATGC",
1121
+ "TTG TTG",
1122
+ "ATT CCC",
1123
+ "TAC TAAAA",
1124
+ "AT AGTG",
1125
+ "AA ATAC",
1126
+ "TTGG GC",
1127
+ "TAG AGAC",
1128
+ "TG TTTT",
1129
+ "TTC TGC",
1130
+ "TGG CCC",
1131
+ "TCTG TC",
1132
+ "AGC TCC",
1133
+ "AAC TCC",
1134
+ "TT AGCC",
1135
+ "AAAG TGCTGGG",
1136
+ "AT AGAC",
1137
+ "TATT TTTAG",
1138
+ "AC TTG",
1139
+ "ACC ACC",
1140
+ "AA ACAC",
1141
+ "G TGG",
1142
+ "ATT TAG",
1143
+ "AGG AGC",
1144
+ "AGGC TGGAGTGC",
1145
+ "AT ACCC",
1146
+ "ATG TAA",
1147
+ "AC GC",
1148
+ "AG TAT",
1149
+ "TT TACC",
1150
+ "AC TAA",
1151
+ "AGG CCC",
1152
+ "AAGG GG",
1153
+ "TCTC G",
1154
+ "ATG AAG",
1155
+ "AA AGAC",
1156
+ "TG AAAA",
1157
+ "AAGG GC",
1158
+ "AT AGGC",
1159
+ "AG AGTG",
1160
+ "AGC TGC",
1161
+ "ATG TTC",
1162
+ "TATT TC",
1163
+ "TG ATC",
1164
+ "AG TTTG",
1165
+ "AGC TAA",
1166
+ "AG AGCC",
1167
+ "TGC TTC",
1168
+ "ATC ATC",
1169
+ "AAC ATGG",
1170
+ "AGC TTC",
1171
+ "AAG AAC",
1172
+ "TTTT TTG",
1173
+ "AGGG GG",
1174
+ "ATAA GC",
1175
+ "TAAG CC",
1176
+ "AC TGG",
1177
+ "AC AAAA",
1178
+ "ATC ATT",
1179
+ "TC TTTC",
1180
+ "ATG ATG",
1181
+ "TGC AA",
1182
+ "AGG TTC",
1183
+ "AAC ATT",
1184
+ "ATG GGC",
1185
+ "AT AGAG",
1186
+ "AA ATGG",
1187
+ "AG TTCC",
1188
+ "TT TAGC",
1189
+ "AAC TTC",
1190
+ "AGC AAG",
1191
+ "AT AAAAC",
1192
+ "AAAA TC",
1193
+ "AGCC AC",
1194
+ "AGG AAC",
1195
+ "TTAA CC",
1196
+ "TATT TATT",
1197
+ "TTTC TG",
1198
+ "ATAA GG",
1199
+ "AGCC ACC",
1200
+ "AG ATGC",
1201
+ "TTAA GC",
1202
+ "TTG TAA",
1203
+ "AG TGTG",
1204
+ "AACC CC",
1205
+ "TTC ATT",
1206
+ "ATC ATG",
1207
+ "AA TGAA",
1208
+ "AGG TGC",
1209
+ "AAAAAAAA AAAAAAAA",
1210
+ "AGG ATG",
1211
+ "AGCC G",
1212
+ "TGG TGG",
1213
+ "AG TGGG",
1214
+ "TGCAC TCCAGCC",
1215
+ "TATT GC",
1216
+ "TAG TC",
1217
+ "CCC G",
1218
+ "AAG TAA",
1219
+ "TAG TG",
1220
+ "TTTTTTTT TTTTTTTT",
1221
+ "AGC ATT",
1222
+ "ATC TGC",
1223
+ "TCTC AC",
1224
+ "AA ATTG",
1225
+ "TT TAGG",
1226
+ "AG ACCC",
1227
+ "GGG CC",
1228
+ "TCC TTC",
1229
+ "AT AGGG",
1230
+ "AA TATG",
1231
+ "TT ATAC",
1232
+ "TAG AAG",
1233
+ "AA AGTG",
1234
+ "AA ATCC",
1235
+ "TTCC TC",
1236
+ "TTTC AC",
1237
+ "AG TATG",
1238
+ "TACTAAAA ATAC",
1239
+ "ATG TGC",
1240
+ "AGG AGGC",
1241
+ "TAT ATC",
1242
+ "TTC TAA",
1243
+ "TG AGGC",
1244
+ "ACAC AC",
1245
+ "TCC CCC",
1246
+ "AAC ATC",
1247
+ "AAGC G",
1248
+ "AA TGGC",
1249
+ "ACC CCC",
1250
+ "AG ATAC",
1251
+ "AT AAAAG",
1252
+ "ATG ATT",
1253
+ "TGG AGG",
1254
+ "AG TTAA"
1255
+ ]
1256
+ }
1257
+ }
merges.txt ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ A
2
+ C
3
+ G
4
+ T
5
+ T T
6
+ A A
7
+ T G
8
+ A G
9
+ C C
10
+ T C
11
+ A C
12
+ G G
13
+ A TT
14
+ A T
15
+ A TG
16
+ G C
17
+ T AA
18
+ T CC
19
+ A CC
20
+ AA AA
21
+ AG G
22
+ A TC
23
+ AG C
24
+ TT C
25
+ AA G
26
+ TT TT
27
+ TG C
28
+ TG G
29
+ AA C
30
+ TT G
31
+ T AG
32
+ T AC
33
+ CC C
34
+ T ATT
35
+ TG GG
36
+ T AT
37
+ AG AA
38
+ AG GG
39
+ TT TC
40
+ AG GC
41
+ AG CC
42
+ AT AA
43
+ TG TG
44
+ TT GG
45
+ ATT C
46
+ AA GG
47
+ AC AC
48
+ TCC C
49
+ TC TC
50
+ T ATG
51
+ TT TG
52
+ TT CC
53
+ AG TG
54
+ ATG G
55
+ AG AC
56
+ AA AC
57
+ ACC C
58
+ TG CC
59
+ ATT G
60
+ AT CC
61
+ AG AG
62
+ ATG C
63
+ AT AC
64
+ TC TG
65
+ TT AA
66
+ TC AC
67
+ TG AA
68
+ TG GC
69
+ TT GC
70
+ TAA G
71
+ T ATC
72
+ TAA C
73
+ AA AG
74
+ TT AC
75
+ AA GC
76
+ GG G
77
+ T AGC
78
+ GG C
79
+ AT AT
80
+ T ACC
81
+ AA CC
82
+ AA TG
83
+ T AGG
84
+ G CC
85
+ AT ATT
86
+ AG TC
87
+ TT TTC
88
+ AAAA C
89
+ TG AC
90
+ TT TAA
91
+ AAAA G
92
+ AA TC
93
+ TG TC
94
+ TT ATT
95
+ AT AG
96
+ TG AG
97
+ TTTT G
98
+ AA ATT
99
+ AA TT
100
+ AA TAA
101
+ TT TCC
102
+ AC AG
103
+ TC AG
104
+ AA ATG
105
+ TGGG C
106
+ AC TC
107
+ AGG CC
108
+ TT AG
109
+ AC TG
110
+ AC G
111
+ AT ATG
112
+ TGG CC
113
+ ATT TC
114
+ AC AA
115
+ ATC TC
116
+ TATT C
117
+ TG TAA
118
+ AC TT
119
+ ATG CC
120
+ TAA AA
121
+ AAAA AAAA
122
+ ATT CC
123
+ TT TAG
124
+ TCC CC
125
+ TT TGC
126
+ TT CCC
127
+ TGGG G
128
+ TTC TC
129
+ AT AAAA
130
+ AG AAG
131
+ TTTT TTTT
132
+ ACC CC
133
+ AGGG C
134
+ ACC TC
135
+ AG ATG
136
+ ATT AC
137
+ AAG CC
138
+ GG CC
139
+ AGG AG
140
+ TC AA
141
+ ATT GC
142
+ TATT G
143
+ AT AAC
144
+ AT ATC
145
+ TT TAC
146
+ ATG GC
147
+ AAGG C
148
+ ACC AC
149
+ G TG
150
+ AT CCC
151
+ AG AAC
152
+ ATT TT
153
+ TTG CC
154
+ AA ATC
155
+ AT AAG
156
+ TTGG C
157
+ TGG AG
158
+ ATG GG
159
+ AA AGC
160
+ AGGG G
161
+ ATC AC
162
+ ATT TG
163
+ AA TTC
164
+ TGC AC
165
+ TT TGG
166
+ TC G
167
+ AG AGC
168
+ AA AGG
169
+ GG GC
170
+ TTGG G
171
+ AG AAAA
172
+ TAT CC
173
+ TC TCC
174
+ AT AGC
175
+ TG AGG
176
+ TT TATT
177
+ AG TAA
178
+ AG AGG
179
+ TC TTC
180
+ AC ATT
181
+ TCC TG
182
+ AG CCC
183
+ TATG C
184
+ TT AAAA
185
+ AG ATT
186
+ TT AAC
187
+ GG GG
188
+ AAG AC
189
+ TC ATT
190
+ TTC TG
191
+ AG ACC
192
+ AAGG G
193
+ AT ACC
194
+ TT TAT
195
+ AAG TG
196
+ TT ATG
197
+ AAG AA
198
+ TAG CC
199
+ TTC AC
200
+ AGG TG
201
+ TTG AA
202
+ ATC TG
203
+ AGC AC
204
+ TGC TG
205
+ AA ACC
206
+ ATG TG
207
+ TTTT CC
208
+ AG TTC
209
+ TCC TC
210
+ TATG G
211
+ AA TAC
212
+ AG TGG
213
+ TAG GC
214
+ AGC TC
215
+ AT AGG
216
+ TT ATC
217
+ TT AAG
218
+ T ACCC
219
+ TTTT TG
220
+ AAC AC
221
+ TGC TC
222
+ AG ATC
223
+ TCCC AGC
224
+ AGC TG
225
+ AA TAG
226
+ TC TTG
227
+ AGTG GC
228
+ ATT GG
229
+ TAC TC
230
+ TAA AC
231
+ AA TGG
232
+ AGG TC
233
+ AGG AC
234
+ TTG TG
235
+ TAT AC
236
+ ATT TTC
237
+ AT ATAA
238
+ AGGC TG
239
+ ATT TAA
240
+ AG TT
241
+ AG TAG
242
+ ATG AC
243
+ AA TGC
244
+ TCC AC
245
+ CC CC
246
+ ATG TC
247
+ AAC TC
248
+ TTTT TC
249
+ TAA GC
250
+ AAG TC
251
+ TGG TG
252
+ TAT AA
253
+ AG TGC
254
+ TAA GG
255
+ ACC TG
256
+ TT AGC
257
+ AA ATAA
258
+ TGCC TC
259
+ AA TCC
260
+ TTGG CC
261
+ TAG GG
262
+ TGG AC
263
+ TTG TC
264
+ AA CCC
265
+ TT ACC
266
+ TAA CC
267
+ AA TTTT
268
+ AA AGAA
269
+ ATT ATT
270
+ AGC G
271
+ AAAA AC
272
+ TAA TG
273
+ TTG AC
274
+ AG TCC
275
+ AAC TG
276
+ AG TTG
277
+ AA TTG
278
+ TC TGC
279
+ TT AGG
280
+ TAC AC
281
+ AGAA GG
282
+ AT ATTC
283
+ AAAA CC
284
+ AAAA GC
285
+ TG CCC
286
+ AC TGC
287
+ AGAA GC
288
+ TAA TAA
289
+ AA TATT
290
+ ACC ATG
291
+ TGG TC
292
+ TTTT GC
293
+ AAC G
294
+ TAC TG
295
+ ACAC ACAC
296
+ ATT TTG
297
+ TCC G
298
+ TGC G
299
+ AAAA TG
300
+ AC ATG
301
+ TC AGC
302
+ ATC G
303
+ AG TAC
304
+ TTTT GG
305
+ AA TAT
306
+ AG AGAA
307
+ TTC G
308
+ TCC AGCC
309
+ AT ATAC
310
+ TC ACC
311
+ AAAA GG
312
+ TGTG TGTG
313
+ TC ATC
314
+ TGC TGGG
315
+ TG AAG
316
+ TG TAG
317
+ TG TGG
318
+ AAAA ATT
319
+ AC TTC
320
+ TTCC CC
321
+ AT AGAA
322
+ TTG CCC
323
+ AGG AGG
324
+ TT TCCC
325
+ TAT ATT
326
+ ACC G
327
+ AC TAC
328
+ TCAC TGC
329
+ GC G
330
+ TT TGTG
331
+ AC AGC
332
+ TC ATG
333
+ AG TTTT
334
+ AGG AA
335
+ TT TATG
336
+ AT ATTG
337
+ TG ATG
338
+ TC TAA
339
+ TG TGC
340
+ AGG AAG
341
+ TT TGGG
342
+ TG TTC
343
+ AGCC CC
344
+ AG TTTC
345
+ AGGC TGG
346
+ TTTG CC
347
+ ATT TCC
348
+ AT ACAC
349
+ AAAA TAA
350
+ TAG AC
351
+ AGG AGAA
352
+ TG AGC
353
+ TGG AA
354
+ TTTT TAA
355
+ AGCC TCCC
356
+ ATG AA
357
+ TT TAAG
358
+ TC TGG
359
+ TT TATC
360
+ TT ATAA
361
+ TG ATT
362
+ AAC AA
363
+ TAGC TGGG
364
+ TC AAG
365
+ AAAA AA
366
+ ACTT TGGG
367
+ TATT CC
368
+ TC AGG
369
+ AAC AG
370
+ TTC TTC
371
+ TGTG GC
372
+ AT ATGC
373
+ ATTAC AGGC
374
+ AGGG GC
375
+ AGGG CC
376
+ TT ATTC
377
+ AT ATCC
378
+ TGTAA TCCCAGC
379
+ TAC G
380
+ AGAA AC
381
+ TG TCC
382
+ AG ATGG
383
+ TGTG CC
384
+ TTTC TC
385
+ TG AAC
386
+ AG TCTC
387
+ TG TTG
388
+ ATT TTTT
389
+ AAG AAG
390
+ TGGG GC
391
+ AGC AGC
392
+ G CCC
393
+ TTTG GC
394
+ AGGCTG AGGC
395
+ TGGG CC
396
+ TTC TCC
397
+ TAG AA
398
+ TGGAG TGC
399
+ ATT AA
400
+ AGTG CC
401
+ TG TCTC
402
+ AT ATGG
403
+ AC ATC
404
+ TGGG GG
405
+ TG ACC
406
+ AC TCC
407
+ TAA AAC
408
+ AG ATAA
409
+ TAA TTTT
410
+ TC AAC
411
+ TC TAC
412
+ TC TAG
413
+ G AG
414
+ TAA ATG
415
+ AGC AA
416
+ TAT ATG
417
+ ATAT ATAT
418
+ ATT TGC
419
+ TCC TCC
420
+ CCC AC
421
+ ATT TATT
422
+ TC TGCC
423
+ ATGG CC
424
+ TC GC
425
+ AG TATT
426
+ AGAA CC
427
+ TT AAAC
428
+ AA ATTC
429
+ AG AGAC
430
+ ATT TAC
431
+ ATTG CC
432
+ AAC AAC
433
+ TT TAAC
434
+ AC GG
435
+ AAG AAAA
436
+ TCTG GC
437
+ ATTC TCC
438
+ AGG TGG
439
+ TGC TGC
440
+ TTC AAG
441
+ AG AGGG
442
+ AC ACC
443
+ TC TTTT
444
+ AG AGGC
445
+ ATC ACC
446
+ TAA ATT
447
+ AAGG CC
448
+ TTGC AGTG
449
+ TG TAC
450
+ AA TTTC
451
+ ATCC CC
452
+ AC AAG
453
+ AC AGG
454
+ AC AAC
455
+ TGCC CC
456
+ AG ATTC
457
+ TT AGAA
458
+ TTGG GG
459
+ AG ACAC
460
+ TGG AAG
461
+ ACC TCC
462
+ ATG GGG
463
+ AGCC TCC
464
+ TT ATTG
465
+ TAA AAG
466
+ ATC TTC
467
+ ATC TCC
468
+ TGAA GC
469
+ TAA TC
470
+ AA ATGC
471
+ TTG TTG
472
+ ATT CCC
473
+ TAC TAAAA
474
+ AT AGTG
475
+ AA ATAC
476
+ TTGG GC
477
+ TAG AGAC
478
+ TG TTTT
479
+ TTC TGC
480
+ TGG CCC
481
+ TCTG TC
482
+ AGC TCC
483
+ AAC TCC
484
+ TT AGCC
485
+ AAAG TGCTGGG
486
+ AT AGAC
487
+ TATT TTTAG
488
+ AC TTG
489
+ ACC ACC
490
+ AA ACAC
491
+ G TGG
492
+ ATT TAG
493
+ AGG AGC
494
+ AGGC TGGAGTGC
495
+ AT ACCC
496
+ ATG TAA
497
+ AC GC
498
+ AG TAT
499
+ TT TACC
500
+ AC TAA
501
+ AGG CCC
502
+ AAGG GG
503
+ TCTC G
504
+ ATG AAG
505
+ AA AGAC
506
+ TG AAAA
507
+ AAGG GC
508
+ AT AGGC
509
+ AG AGTG
510
+ AGC TGC
511
+ ATG TTC
512
+ TATT TC
513
+ TG ATC
514
+ AG TTTG
515
+ AGC TAA
516
+ AG AGCC
517
+ TGC TTC
518
+ ATC ATC
519
+ AAC ATGG
520
+ AGC TTC
521
+ AAG AAC
522
+ TTTT TTG
523
+ AGGG GG
524
+ ATAA GC
525
+ TAAG CC
526
+ AC TGG
527
+ AC AAAA
528
+ ATC ATT
529
+ TC TTTC
530
+ ATG ATG
531
+ TGC AA
532
+ AGG TTC
533
+ AAC ATT
534
+ ATG GGC
535
+ AT AGAG
536
+ AA ATGG
537
+ AG TTCC
538
+ TT TAGC
539
+ AAC TTC
540
+ AGC AAG
541
+ AT AAAAC
542
+ AAAA TC
543
+ AGCC AC
544
+ AGG AAC
545
+ TTAA CC
546
+ TATT TATT
547
+ TTTC TG
548
+ ATAA GG
549
+ AGCC ACC
550
+ AG ATGC
551
+ TTAA GC
552
+ TTG TAA
553
+ AG TGTG
554
+ AACC CC
555
+ TTC ATT
556
+ ATC ATG
557
+ AA TGAA
558
+ AGG TGC
559
+ AAAAAAAA AAAAAAAA
560
+ AGG ATG
561
+ AGCC G
562
+ TGG TGG
563
+ AG TGGG
564
+ TGCAC TCCAGCC
565
+ TATT GC
566
+ TAG TC
567
+ CCC G
568
+ AAG TAA
569
+ TAG TG
570
+ TTTTTTTT TTTTTTTT
571
+ AGC ATT
572
+ ATC TGC
573
+ TCTC AC
574
+ AA ATTG
575
+ TT TAGG
576
+ AG ACCC
577
+ GGG CC
578
+ TCC TTC
579
+ AT AGGG
580
+ AA TATG
581
+ TT ATAC
582
+ TAG AAG
583
+ AA AGTG
584
+ AA ATCC
585
+ TTCC TC
586
+ TTTC AC
587
+ AG TATG
588
+ TACTAAAA ATAC
589
+ ATG TGC
590
+ AGG AGGC
591
+ TAT ATC
592
+ TTC TAA
593
+ TG AGGC
594
+ ACAC AC
595
+ TCC CCC
596
+ AAC ATC
597
+ AAGC G
598
+ AA TGGC
599
+ ACC CCC
600
+ AG ATAC
601
+ AT AAAAG
602
+ ATG ATT
603
+ TGG AGG
604
+ AG TTAA
tokenizer.json ADDED
@@ -0,0 +1,1257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 1,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ }
15
+ ],
16
+ "normalizer": {
17
+ "type": "BertNormalizer",
18
+ "clean_text": true,
19
+ "handle_chinese_chars": true,
20
+ "strip_accents": null,
21
+ "lowercase": false
22
+ },
23
+ "pre_tokenizer": {
24
+ "type": "BertPreTokenizer"
25
+ },
26
+ "post_processor": null,
27
+ "decoder": {
28
+ "type": "BPEDecoder",
29
+ "suffix": ""
30
+ },
31
+ "model": {
32
+ "type": "BPE",
33
+ "dropout": null,
34
+ "unk_token": "[UNK]",
35
+ "continuing_subword_prefix": null,
36
+ "end_of_word_suffix": "",
37
+ "fuse_unk": false,
38
+ "vocab": {
39
+ "[PAD]": 0,
40
+ "[UNK]": 1,
41
+ "[CLS]": 2,
42
+ "[SEP]": 3,
43
+ "[MASK]": 4,
44
+ "A": 5,
45
+ "C": 6,
46
+ "G": 7,
47
+ "T": 8,
48
+ "TT": 9,
49
+ "AA": 10,
50
+ "TG": 11,
51
+ "AG": 12,
52
+ "CC": 13,
53
+ "TC": 14,
54
+ "AC": 15,
55
+ "GG": 16,
56
+ "ATT": 17,
57
+ "AT": 18,
58
+ "ATG": 19,
59
+ "GC": 20,
60
+ "TAA": 21,
61
+ "TCC": 22,
62
+ "ACC": 23,
63
+ "AAAA": 24,
64
+ "AGG": 25,
65
+ "ATC": 26,
66
+ "AGC": 27,
67
+ "TTC": 28,
68
+ "AAG": 29,
69
+ "TTTT": 30,
70
+ "TGC": 31,
71
+ "TGG": 32,
72
+ "AAC": 33,
73
+ "TTG": 34,
74
+ "TAG": 35,
75
+ "TAC": 36,
76
+ "CCC": 37,
77
+ "TATT": 38,
78
+ "TGGG": 39,
79
+ "TAT": 40,
80
+ "AGAA": 41,
81
+ "AGGG": 42,
82
+ "TTTC": 43,
83
+ "AGGC": 44,
84
+ "AGCC": 45,
85
+ "ATAA": 46,
86
+ "TGTG": 47,
87
+ "TTGG": 48,
88
+ "ATTC": 49,
89
+ "AAGG": 50,
90
+ "ACAC": 51,
91
+ "TCCC": 52,
92
+ "TCTC": 53,
93
+ "TATG": 54,
94
+ "TTTG": 55,
95
+ "TTCC": 56,
96
+ "AGTG": 57,
97
+ "ATGG": 58,
98
+ "AGAC": 59,
99
+ "AAAC": 60,
100
+ "ACCC": 61,
101
+ "TGCC": 62,
102
+ "ATTG": 63,
103
+ "ATCC": 64,
104
+ "AGAG": 65,
105
+ "ATGC": 66,
106
+ "ATAC": 67,
107
+ "TCTG": 68,
108
+ "TTAA": 69,
109
+ "TCAC": 70,
110
+ "TGAA": 71,
111
+ "TGGC": 72,
112
+ "TTGC": 73,
113
+ "TAAG": 74,
114
+ "TATC": 75,
115
+ "TAAC": 76,
116
+ "AAAG": 77,
117
+ "TTAC": 78,
118
+ "AAGC": 79,
119
+ "GGG": 80,
120
+ "TAGC": 81,
121
+ "GGC": 82,
122
+ "ATAT": 83,
123
+ "TACC": 84,
124
+ "AACC": 85,
125
+ "AATG": 86,
126
+ "TAGG": 87,
127
+ "GCC": 88,
128
+ "ATATT": 89,
129
+ "AGTC": 90,
130
+ "TTTTC": 91,
131
+ "AAAAC": 92,
132
+ "TGAC": 93,
133
+ "TTTAA": 94,
134
+ "AAAAG": 95,
135
+ "AATC": 96,
136
+ "TGTC": 97,
137
+ "TTATT": 98,
138
+ "ATAG": 99,
139
+ "TGAG": 100,
140
+ "TTTTG": 101,
141
+ "AAATT": 102,
142
+ "AATT": 103,
143
+ "AATAA": 104,
144
+ "TTTCC": 105,
145
+ "ACAG": 106,
146
+ "TCAG": 107,
147
+ "AAATG": 108,
148
+ "TGGGC": 109,
149
+ "ACTC": 110,
150
+ "AGGCC": 111,
151
+ "TTAG": 112,
152
+ "ACTG": 113,
153
+ "ACG": 114,
154
+ "ATATG": 115,
155
+ "TGGCC": 116,
156
+ "ATTTC": 117,
157
+ "ACAA": 118,
158
+ "ATCTC": 119,
159
+ "TATTC": 120,
160
+ "TGTAA": 121,
161
+ "ACTT": 122,
162
+ "ATGCC": 123,
163
+ "TAAAA": 124,
164
+ "AAAAAAAA": 125,
165
+ "ATTCC": 126,
166
+ "TTTAG": 127,
167
+ "TCCCC": 128,
168
+ "TTTGC": 129,
169
+ "TTCCC": 130,
170
+ "TGGGG": 131,
171
+ "TTCTC": 132,
172
+ "ATAAAA": 133,
173
+ "AGAAG": 134,
174
+ "TTTTTTTT": 135,
175
+ "ACCCC": 136,
176
+ "AGGGC": 137,
177
+ "ACCTC": 138,
178
+ "AGATG": 139,
179
+ "ATTAC": 140,
180
+ "AAGCC": 141,
181
+ "GGCC": 142,
182
+ "AGGAG": 143,
183
+ "TCAA": 144,
184
+ "ATTGC": 145,
185
+ "TATTG": 146,
186
+ "ATAAC": 147,
187
+ "ATATC": 148,
188
+ "TTTAC": 149,
189
+ "ATGGC": 150,
190
+ "AAGGC": 151,
191
+ "ACCAC": 152,
192
+ "GTG": 153,
193
+ "ATCCC": 154,
194
+ "AGAAC": 155,
195
+ "ATTTT": 156,
196
+ "TTGCC": 157,
197
+ "AAATC": 158,
198
+ "ATAAG": 159,
199
+ "TTGGC": 160,
200
+ "TGGAG": 161,
201
+ "ATGGG": 162,
202
+ "AAAGC": 163,
203
+ "AGGGG": 164,
204
+ "ATCAC": 165,
205
+ "ATTTG": 166,
206
+ "AATTC": 167,
207
+ "TGCAC": 168,
208
+ "TTTGG": 169,
209
+ "TCG": 170,
210
+ "AGAGC": 171,
211
+ "AAAGG": 172,
212
+ "GGGC": 173,
213
+ "TTGGG": 174,
214
+ "AGAAAA": 175,
215
+ "TATCC": 176,
216
+ "TCTCC": 177,
217
+ "ATAGC": 178,
218
+ "TGAGG": 179,
219
+ "TTTATT": 180,
220
+ "AGTAA": 181,
221
+ "AGAGG": 182,
222
+ "TCTTC": 183,
223
+ "ACATT": 184,
224
+ "TCCTG": 185,
225
+ "AGCCC": 186,
226
+ "TATGC": 187,
227
+ "TTAAAA": 188,
228
+ "AGATT": 189,
229
+ "TTAAC": 190,
230
+ "GGGG": 191,
231
+ "AAGAC": 192,
232
+ "TCATT": 193,
233
+ "TTCTG": 194,
234
+ "AGACC": 195,
235
+ "AAGGG": 196,
236
+ "ATACC": 197,
237
+ "TTTAT": 198,
238
+ "AAGTG": 199,
239
+ "TTATG": 200,
240
+ "AAGAA": 201,
241
+ "TAGCC": 202,
242
+ "TTCAC": 203,
243
+ "AGGTG": 204,
244
+ "TTGAA": 205,
245
+ "ATCTG": 206,
246
+ "AGCAC": 207,
247
+ "TGCTG": 208,
248
+ "AAACC": 209,
249
+ "ATGTG": 210,
250
+ "TTTTCC": 211,
251
+ "AGTTC": 212,
252
+ "TCCTC": 213,
253
+ "TATGG": 214,
254
+ "AATAC": 215,
255
+ "AGTGG": 216,
256
+ "TAGGC": 217,
257
+ "AGCTC": 218,
258
+ "ATAGG": 219,
259
+ "TTATC": 220,
260
+ "TTAAG": 221,
261
+ "TACCC": 222,
262
+ "TTTTTG": 223,
263
+ "AACAC": 224,
264
+ "TGCTC": 225,
265
+ "AGATC": 226,
266
+ "TCCCAGC": 227,
267
+ "AGCTG": 228,
268
+ "AATAG": 229,
269
+ "TCTTG": 230,
270
+ "AGTGGC": 231,
271
+ "ATTGG": 232,
272
+ "TACTC": 233,
273
+ "TAAAC": 234,
274
+ "AATGG": 235,
275
+ "AGGTC": 236,
276
+ "AGGAC": 237,
277
+ "TTGTG": 238,
278
+ "TATAC": 239,
279
+ "ATTTTC": 240,
280
+ "ATATAA": 241,
281
+ "AGGCTG": 242,
282
+ "ATTTAA": 243,
283
+ "AGTT": 244,
284
+ "AGTAG": 245,
285
+ "ATGAC": 246,
286
+ "AATGC": 247,
287
+ "TCCAC": 248,
288
+ "CCCC": 249,
289
+ "ATGTC": 250,
290
+ "AACTC": 251,
291
+ "TTTTTC": 252,
292
+ "TAAGC": 253,
293
+ "AAGTC": 254,
294
+ "TGGTG": 255,
295
+ "TATAA": 256,
296
+ "AGTGC": 257,
297
+ "TAAGG": 258,
298
+ "ACCTG": 259,
299
+ "TTAGC": 260,
300
+ "AAATAA": 261,
301
+ "TGCCTC": 262,
302
+ "AATCC": 263,
303
+ "TTGGCC": 264,
304
+ "TAGGG": 265,
305
+ "TGGAC": 266,
306
+ "TTGTC": 267,
307
+ "AACCC": 268,
308
+ "TTACC": 269,
309
+ "TAACC": 270,
310
+ "AATTTT": 271,
311
+ "AAAGAA": 272,
312
+ "ATTATT": 273,
313
+ "AGCG": 274,
314
+ "AAAAAC": 275,
315
+ "TAATG": 276,
316
+ "TTGAC": 277,
317
+ "AGTCC": 278,
318
+ "AACTG": 279,
319
+ "AGTTG": 280,
320
+ "AATTG": 281,
321
+ "TCTGC": 282,
322
+ "TTAGG": 283,
323
+ "TACAC": 284,
324
+ "AGAAGG": 285,
325
+ "ATATTC": 286,
326
+ "AAAACC": 287,
327
+ "AAAAGC": 288,
328
+ "TGCCC": 289,
329
+ "ACTGC": 290,
330
+ "AGAAGC": 291,
331
+ "TAATAA": 292,
332
+ "AATATT": 293,
333
+ "ACCATG": 294,
334
+ "TGGTC": 295,
335
+ "TTTTGC": 296,
336
+ "AACG": 297,
337
+ "TACTG": 298,
338
+ "ACACACAC": 299,
339
+ "ATTTTG": 300,
340
+ "TCCG": 301,
341
+ "TGCG": 302,
342
+ "AAAATG": 303,
343
+ "ACATG": 304,
344
+ "TCAGC": 305,
345
+ "ATCG": 306,
346
+ "AGTAC": 307,
347
+ "TTTTGG": 308,
348
+ "AATAT": 309,
349
+ "AGAGAA": 310,
350
+ "TTCG": 311,
351
+ "TCCAGCC": 312,
352
+ "ATATAC": 313,
353
+ "TCACC": 314,
354
+ "AAAAGG": 315,
355
+ "TGTGTGTG": 316,
356
+ "TCATC": 317,
357
+ "TGCTGGG": 318,
358
+ "TGAAG": 319,
359
+ "TGTAG": 320,
360
+ "TGTGG": 321,
361
+ "AAAAATT": 322,
362
+ "ACTTC": 323,
363
+ "TTCCCC": 324,
364
+ "ATAGAA": 325,
365
+ "TTGCCC": 326,
366
+ "AGGAGG": 327,
367
+ "TTTCCC": 328,
368
+ "TATATT": 329,
369
+ "ACCG": 330,
370
+ "ACTAC": 331,
371
+ "TCACTGC": 332,
372
+ "GCG": 333,
373
+ "TTTGTG": 334,
374
+ "ACAGC": 335,
375
+ "TCATG": 336,
376
+ "AGTTTT": 337,
377
+ "AGGAA": 338,
378
+ "TTTATG": 339,
379
+ "ATATTG": 340,
380
+ "TGATG": 341,
381
+ "TCTAA": 342,
382
+ "TGTGC": 343,
383
+ "AGGAAG": 344,
384
+ "TTTGGG": 345,
385
+ "TGTTC": 346,
386
+ "AGCCCC": 347,
387
+ "AGTTTC": 348,
388
+ "AGGCTGG": 349,
389
+ "TTTGCC": 350,
390
+ "ATTTCC": 351,
391
+ "ATACAC": 352,
392
+ "AAAATAA": 353,
393
+ "TAGAC": 354,
394
+ "AGGAGAA": 355,
395
+ "TGAGC": 356,
396
+ "TGGAA": 357,
397
+ "TTTTTAA": 358,
398
+ "AGCCTCCC": 359,
399
+ "ATGAA": 360,
400
+ "TTTAAG": 361,
401
+ "TCTGG": 362,
402
+ "TTTATC": 363,
403
+ "TTATAA": 364,
404
+ "TGATT": 365,
405
+ "AACAA": 366,
406
+ "TAGCTGGG": 367,
407
+ "TCAAG": 368,
408
+ "AAAAAA": 369,
409
+ "ACTTTGGG": 370,
410
+ "TATTCC": 371,
411
+ "TCAGG": 372,
412
+ "AACAG": 373,
413
+ "TTCTTC": 374,
414
+ "TGTGGC": 375,
415
+ "ATATGC": 376,
416
+ "ATTACAGGC": 377,
417
+ "AGGGGC": 378,
418
+ "AGGGCC": 379,
419
+ "TTATTC": 380,
420
+ "ATATCC": 381,
421
+ "TGTAATCCCAGC": 382,
422
+ "TACG": 383,
423
+ "AGAAAC": 384,
424
+ "TGTCC": 385,
425
+ "AGATGG": 386,
426
+ "TGTGCC": 387,
427
+ "TTTCTC": 388,
428
+ "TGAAC": 389,
429
+ "AGTCTC": 390,
430
+ "TGTTG": 391,
431
+ "ATTTTTT": 392,
432
+ "AAGAAG": 393,
433
+ "TGGGGC": 394,
434
+ "AGCAGC": 395,
435
+ "GCCC": 396,
436
+ "TTTGGC": 397,
437
+ "AGGCTGAGGC": 398,
438
+ "TGGGCC": 399,
439
+ "TTCTCC": 400,
440
+ "TAGAA": 401,
441
+ "TGGAGTGC": 402,
442
+ "ATTAA": 403,
443
+ "AGTGCC": 404,
444
+ "TGTCTC": 405,
445
+ "ATATGG": 406,
446
+ "ACATC": 407,
447
+ "TGGGGG": 408,
448
+ "TGACC": 409,
449
+ "ACTCC": 410,
450
+ "TAAAAC": 411,
451
+ "AGATAA": 412,
452
+ "TAATTTT": 413,
453
+ "TCAAC": 414,
454
+ "TCTAC": 415,
455
+ "TCTAG": 416,
456
+ "GAG": 417,
457
+ "TAAATG": 418,
458
+ "AGCAA": 419,
459
+ "TATATG": 420,
460
+ "ATATATAT": 421,
461
+ "ATTTGC": 422,
462
+ "TCCTCC": 423,
463
+ "CCCAC": 424,
464
+ "ATTTATT": 425,
465
+ "TCTGCC": 426,
466
+ "ATGGCC": 427,
467
+ "TCGC": 428,
468
+ "AGTATT": 429,
469
+ "AGAACC": 430,
470
+ "TTAAAC": 431,
471
+ "AAATTC": 432,
472
+ "AGAGAC": 433,
473
+ "ATTTAC": 434,
474
+ "ATTGCC": 435,
475
+ "AACAAC": 436,
476
+ "TTTAAC": 437,
477
+ "ACGG": 438,
478
+ "AAGAAAA": 439,
479
+ "TCTGGC": 440,
480
+ "ATTCTCC": 441,
481
+ "AGGTGG": 442,
482
+ "TGCTGC": 443,
483
+ "TTCAAG": 444,
484
+ "AGAGGG": 445,
485
+ "ACACC": 446,
486
+ "TCTTTT": 447,
487
+ "AGAGGC": 448,
488
+ "ATCACC": 449,
489
+ "TAAATT": 450,
490
+ "AAGGCC": 451,
491
+ "TTGCAGTG": 452,
492
+ "TGTAC": 453,
493
+ "AATTTC": 454,
494
+ "ATCCCC": 455,
495
+ "ACAAG": 456,
496
+ "ACAGG": 457,
497
+ "ACAAC": 458,
498
+ "TGCCCC": 459,
499
+ "AGATTC": 460,
500
+ "TTAGAA": 461,
501
+ "TTGGGG": 462,
502
+ "AGACAC": 463,
503
+ "TGGAAG": 464,
504
+ "ACCTCC": 465,
505
+ "ATGGGG": 466,
506
+ "AGCCTCC": 467,
507
+ "TTATTG": 468,
508
+ "TAAAAG": 469,
509
+ "ATCTTC": 470,
510
+ "ATCTCC": 471,
511
+ "TGAAGC": 472,
512
+ "TAATC": 473,
513
+ "AAATGC": 474,
514
+ "TTGTTG": 475,
515
+ "ATTCCC": 476,
516
+ "TACTAAAA": 477,
517
+ "ATAGTG": 478,
518
+ "AAATAC": 479,
519
+ "TTGGGC": 480,
520
+ "TAGAGAC": 481,
521
+ "TGTTTT": 482,
522
+ "TTCTGC": 483,
523
+ "TGGCCC": 484,
524
+ "TCTGTC": 485,
525
+ "AGCTCC": 486,
526
+ "AACTCC": 487,
527
+ "TTAGCC": 488,
528
+ "AAAGTGCTGGG": 489,
529
+ "ATAGAC": 490,
530
+ "TATTTTTAG": 491,
531
+ "ACTTG": 492,
532
+ "ACCACC": 493,
533
+ "AAACAC": 494,
534
+ "GTGG": 495,
535
+ "ATTTAG": 496,
536
+ "AGGAGC": 497,
537
+ "AGGCTGGAGTGC": 498,
538
+ "ATACCC": 499,
539
+ "ATGTAA": 500,
540
+ "ACGC": 501,
541
+ "AGTAT": 502,
542
+ "TTTACC": 503,
543
+ "ACTAA": 504,
544
+ "AGGCCC": 505,
545
+ "AAGGGG": 506,
546
+ "TCTCG": 507,
547
+ "ATGAAG": 508,
548
+ "AAAGAC": 509,
549
+ "TGAAAA": 510,
550
+ "AAGGGC": 511,
551
+ "ATAGGC": 512,
552
+ "AGAGTG": 513,
553
+ "AGCTGC": 514,
554
+ "ATGTTC": 515,
555
+ "TATTTC": 516,
556
+ "TGATC": 517,
557
+ "AGTTTG": 518,
558
+ "AGCTAA": 519,
559
+ "AGAGCC": 520,
560
+ "TGCTTC": 521,
561
+ "ATCATC": 522,
562
+ "AACATGG": 523,
563
+ "AGCTTC": 524,
564
+ "AAGAAC": 525,
565
+ "TTTTTTG": 526,
566
+ "AGGGGG": 527,
567
+ "ATAAGC": 528,
568
+ "TAAGCC": 529,
569
+ "ACTGG": 530,
570
+ "ACAAAA": 531,
571
+ "ATCATT": 532,
572
+ "TCTTTC": 533,
573
+ "ATGATG": 534,
574
+ "TGCAA": 535,
575
+ "AGGTTC": 536,
576
+ "AACATT": 537,
577
+ "ATGGGC": 538,
578
+ "ATAGAG": 539,
579
+ "AAATGG": 540,
580
+ "AGTTCC": 541,
581
+ "TTTAGC": 542,
582
+ "AACTTC": 543,
583
+ "AGCAAG": 544,
584
+ "ATAAAAC": 545,
585
+ "AAAATC": 546,
586
+ "AGCCAC": 547,
587
+ "AGGAAC": 548,
588
+ "TTAACC": 549,
589
+ "TATTTATT": 550,
590
+ "TTTCTG": 551,
591
+ "ATAAGG": 552,
592
+ "AGCCACC": 553,
593
+ "AGATGC": 554,
594
+ "TTAAGC": 555,
595
+ "TTGTAA": 556,
596
+ "AGTGTG": 557,
597
+ "AACCCC": 558,
598
+ "TTCATT": 559,
599
+ "ATCATG": 560,
600
+ "AATGAA": 561,
601
+ "AGGTGC": 562,
602
+ "AAAAAAAAAAAAAAAA": 563,
603
+ "AGGATG": 564,
604
+ "AGCCG": 565,
605
+ "TGGTGG": 566,
606
+ "AGTGGG": 567,
607
+ "TGCACTCCAGCC": 568,
608
+ "TATTGC": 569,
609
+ "TAGTC": 570,
610
+ "CCCG": 571,
611
+ "AAGTAA": 572,
612
+ "TAGTG": 573,
613
+ "TTTTTTTTTTTTTTTT": 574,
614
+ "AGCATT": 575,
615
+ "ATCTGC": 576,
616
+ "TCTCAC": 577,
617
+ "AAATTG": 578,
618
+ "TTTAGG": 579,
619
+ "AGACCC": 580,
620
+ "GGGCC": 581,
621
+ "TCCTTC": 582,
622
+ "ATAGGG": 583,
623
+ "AATATG": 584,
624
+ "TTATAC": 585,
625
+ "TAGAAG": 586,
626
+ "AAAGTG": 587,
627
+ "AAATCC": 588,
628
+ "TTCCTC": 589,
629
+ "TTTCAC": 590,
630
+ "AGTATG": 591,
631
+ "TACTAAAAATAC": 592,
632
+ "ATGTGC": 593,
633
+ "AGGAGGC": 594,
634
+ "TATATC": 595,
635
+ "TTCTAA": 596,
636
+ "TGAGGC": 597,
637
+ "ACACAC": 598,
638
+ "TCCCCC": 599,
639
+ "AACATC": 600,
640
+ "AAGCG": 601,
641
+ "AATGGC": 602,
642
+ "ACCCCC": 603,
643
+ "AGATAC": 604,
644
+ "ATAAAAG": 605,
645
+ "ATGATT": 606,
646
+ "TGGAGG": 607,
647
+ "AGTTAA": 608,
648
+ "": 609
649
+ },
650
+ "merges": [
651
+ "A ",
652
+ "C ",
653
+ "G ",
654
+ "T ",
655
+ "T T",
656
+ "A A",
657
+ "T G",
658
+ "A G",
659
+ "C C",
660
+ "T C",
661
+ "A C",
662
+ "G G",
663
+ "A TT",
664
+ "A T",
665
+ "A TG",
666
+ "G C",
667
+ "T AA",
668
+ "T CC",
669
+ "A CC",
670
+ "AA AA",
671
+ "AG G",
672
+ "A TC",
673
+ "AG C",
674
+ "TT C",
675
+ "AA G",
676
+ "TT TT",
677
+ "TG C",
678
+ "TG G",
679
+ "AA C",
680
+ "TT G",
681
+ "T AG",
682
+ "T AC",
683
+ "CC C",
684
+ "T ATT",
685
+ "TG GG",
686
+ "T AT",
687
+ "AG AA",
688
+ "AG GG",
689
+ "TT TC",
690
+ "AG GC",
691
+ "AG CC",
692
+ "AT AA",
693
+ "TG TG",
694
+ "TT GG",
695
+ "ATT C",
696
+ "AA GG",
697
+ "AC AC",
698
+ "TCC C",
699
+ "TC TC",
700
+ "T ATG",
701
+ "TT TG",
702
+ "TT CC",
703
+ "AG TG",
704
+ "ATG G",
705
+ "AG AC",
706
+ "AA AC",
707
+ "ACC C",
708
+ "TG CC",
709
+ "ATT G",
710
+ "AT CC",
711
+ "AG AG",
712
+ "ATG C",
713
+ "AT AC",
714
+ "TC TG",
715
+ "TT AA",
716
+ "TC AC",
717
+ "TG AA",
718
+ "TG GC",
719
+ "TT GC",
720
+ "TAA G",
721
+ "T ATC",
722
+ "TAA C",
723
+ "AA AG",
724
+ "TT AC",
725
+ "AA GC",
726
+ "GG G",
727
+ "T AGC",
728
+ "GG C",
729
+ "AT AT",
730
+ "T ACC",
731
+ "AA CC",
732
+ "AA TG",
733
+ "T AGG",
734
+ "G CC",
735
+ "AT ATT",
736
+ "AG TC",
737
+ "TT TTC",
738
+ "AAAA C",
739
+ "TG AC",
740
+ "TT TAA",
741
+ "AAAA G",
742
+ "AA TC",
743
+ "TG TC",
744
+ "TT ATT",
745
+ "AT AG",
746
+ "TG AG",
747
+ "TTTT G",
748
+ "AA ATT",
749
+ "AA TT",
750
+ "AA TAA",
751
+ "TT TCC",
752
+ "AC AG",
753
+ "TC AG",
754
+ "AA ATG",
755
+ "TGGG C",
756
+ "AC TC",
757
+ "AGG CC",
758
+ "TT AG",
759
+ "AC TG",
760
+ "AC G",
761
+ "AT ATG",
762
+ "TGG CC",
763
+ "ATT TC",
764
+ "AC AA",
765
+ "ATC TC",
766
+ "TATT C",
767
+ "TG TAA",
768
+ "AC TT",
769
+ "ATG CC",
770
+ "TAA AA",
771
+ "AAAA AAAA",
772
+ "ATT CC",
773
+ "TT TAG",
774
+ "TCC CC",
775
+ "TT TGC",
776
+ "TT CCC",
777
+ "TGGG G",
778
+ "TTC TC",
779
+ "AT AAAA",
780
+ "AG AAG",
781
+ "TTTT TTTT",
782
+ "ACC CC",
783
+ "AGGG C",
784
+ "ACC TC",
785
+ "AG ATG",
786
+ "ATT AC",
787
+ "AAG CC",
788
+ "GG CC",
789
+ "AGG AG",
790
+ "TC AA",
791
+ "ATT GC",
792
+ "TATT G",
793
+ "AT AAC",
794
+ "AT ATC",
795
+ "TT TAC",
796
+ "ATG GC",
797
+ "AAGG C",
798
+ "ACC AC",
799
+ "G TG",
800
+ "AT CCC",
801
+ "AG AAC",
802
+ "ATT TT",
803
+ "TTG CC",
804
+ "AA ATC",
805
+ "AT AAG",
806
+ "TTGG C",
807
+ "TGG AG",
808
+ "ATG GG",
809
+ "AA AGC",
810
+ "AGGG G",
811
+ "ATC AC",
812
+ "ATT TG",
813
+ "AA TTC",
814
+ "TGC AC",
815
+ "TT TGG",
816
+ "TC G",
817
+ "AG AGC",
818
+ "AA AGG",
819
+ "GG GC",
820
+ "TTGG G",
821
+ "AG AAAA",
822
+ "TAT CC",
823
+ "TC TCC",
824
+ "AT AGC",
825
+ "TG AGG",
826
+ "TT TATT",
827
+ "AG TAA",
828
+ "AG AGG",
829
+ "TC TTC",
830
+ "AC ATT",
831
+ "TCC TG",
832
+ "AG CCC",
833
+ "TATG C",
834
+ "TT AAAA",
835
+ "AG ATT",
836
+ "TT AAC",
837
+ "GG GG",
838
+ "AAG AC",
839
+ "TC ATT",
840
+ "TTC TG",
841
+ "AG ACC",
842
+ "AAGG G",
843
+ "AT ACC",
844
+ "TT TAT",
845
+ "AAG TG",
846
+ "TT ATG",
847
+ "AAG AA",
848
+ "TAG CC",
849
+ "TTC AC",
850
+ "AGG TG",
851
+ "TTG AA",
852
+ "ATC TG",
853
+ "AGC AC",
854
+ "TGC TG",
855
+ "AA ACC",
856
+ "ATG TG",
857
+ "TTTT CC",
858
+ "AG TTC",
859
+ "TCC TC",
860
+ "TATG G",
861
+ "AA TAC",
862
+ "AG TGG",
863
+ "TAG GC",
864
+ "AGC TC",
865
+ "AT AGG",
866
+ "TT ATC",
867
+ "TT AAG",
868
+ "T ACCC",
869
+ "TTTT TG",
870
+ "AAC AC",
871
+ "TGC TC",
872
+ "AG ATC",
873
+ "TCCC AGC",
874
+ "AGC TG",
875
+ "AA TAG",
876
+ "TC TTG",
877
+ "AGTG GC",
878
+ "ATT GG",
879
+ "TAC TC",
880
+ "TAA AC",
881
+ "AA TGG",
882
+ "AGG TC",
883
+ "AGG AC",
884
+ "TTG TG",
885
+ "TAT AC",
886
+ "ATT TTC",
887
+ "AT ATAA",
888
+ "AGGC TG",
889
+ "ATT TAA",
890
+ "AG TT",
891
+ "AG TAG",
892
+ "ATG AC",
893
+ "AA TGC",
894
+ "TCC AC",
895
+ "CC CC",
896
+ "ATG TC",
897
+ "AAC TC",
898
+ "TTTT TC",
899
+ "TAA GC",
900
+ "AAG TC",
901
+ "TGG TG",
902
+ "TAT AA",
903
+ "AG TGC",
904
+ "TAA GG",
905
+ "ACC TG",
906
+ "TT AGC",
907
+ "AA ATAA",
908
+ "TGCC TC",
909
+ "AA TCC",
910
+ "TTGG CC",
911
+ "TAG GG",
912
+ "TGG AC",
913
+ "TTG TC",
914
+ "AA CCC",
915
+ "TT ACC",
916
+ "TAA CC",
917
+ "AA TTTT",
918
+ "AA AGAA",
919
+ "ATT ATT",
920
+ "AGC G",
921
+ "AAAA AC",
922
+ "TAA TG",
923
+ "TTG AC",
924
+ "AG TCC",
925
+ "AAC TG",
926
+ "AG TTG",
927
+ "AA TTG",
928
+ "TC TGC",
929
+ "TT AGG",
930
+ "TAC AC",
931
+ "AGAA GG",
932
+ "AT ATTC",
933
+ "AAAA CC",
934
+ "AAAA GC",
935
+ "TG CCC",
936
+ "AC TGC",
937
+ "AGAA GC",
938
+ "TAA TAA",
939
+ "AA TATT",
940
+ "ACC ATG",
941
+ "TGG TC",
942
+ "TTTT GC",
943
+ "AAC G",
944
+ "TAC TG",
945
+ "ACAC ACAC",
946
+ "ATT TTG",
947
+ "TCC G",
948
+ "TGC G",
949
+ "AAAA TG",
950
+ "AC ATG",
951
+ "TC AGC",
952
+ "ATC G",
953
+ "AG TAC",
954
+ "TTTT GG",
955
+ "AA TAT",
956
+ "AG AGAA",
957
+ "TTC G",
958
+ "TCC AGCC",
959
+ "AT ATAC",
960
+ "TC ACC",
961
+ "AAAA GG",
962
+ "TGTG TGTG",
963
+ "TC ATC",
964
+ "TGC TGGG",
965
+ "TG AAG",
966
+ "TG TAG",
967
+ "TG TGG",
968
+ "AAAA ATT",
969
+ "AC TTC",
970
+ "TTCC CC",
971
+ "AT AGAA",
972
+ "TTG CCC",
973
+ "AGG AGG",
974
+ "TT TCCC",
975
+ "TAT ATT",
976
+ "ACC G",
977
+ "AC TAC",
978
+ "TCAC TGC",
979
+ "GC G",
980
+ "TT TGTG",
981
+ "AC AGC",
982
+ "TC ATG",
983
+ "AG TTTT",
984
+ "AGG AA",
985
+ "TT TATG",
986
+ "AT ATTG",
987
+ "TG ATG",
988
+ "TC TAA",
989
+ "TG TGC",
990
+ "AGG AAG",
991
+ "TT TGGG",
992
+ "TG TTC",
993
+ "AGCC CC",
994
+ "AG TTTC",
995
+ "AGGC TGG",
996
+ "TTTG CC",
997
+ "ATT TCC",
998
+ "AT ACAC",
999
+ "AAAA TAA",
1000
+ "TAG AC",
1001
+ "AGG AGAA",
1002
+ "TG AGC",
1003
+ "TGG AA",
1004
+ "TTTT TAA",
1005
+ "AGCC TCCC",
1006
+ "ATG AA",
1007
+ "TT TAAG",
1008
+ "TC TGG",
1009
+ "TT TATC",
1010
+ "TT ATAA",
1011
+ "TG ATT",
1012
+ "AAC AA",
1013
+ "TAGC TGGG",
1014
+ "TC AAG",
1015
+ "AAAA AA",
1016
+ "ACTT TGGG",
1017
+ "TATT CC",
1018
+ "TC AGG",
1019
+ "AAC AG",
1020
+ "TTC TTC",
1021
+ "TGTG GC",
1022
+ "AT ATGC",
1023
+ "ATTAC AGGC",
1024
+ "AGGG GC",
1025
+ "AGGG CC",
1026
+ "TT ATTC",
1027
+ "AT ATCC",
1028
+ "TGTAA TCCCAGC",
1029
+ "TAC G",
1030
+ "AGAA AC",
1031
+ "TG TCC",
1032
+ "AG ATGG",
1033
+ "TGTG CC",
1034
+ "TTTC TC",
1035
+ "TG AAC",
1036
+ "AG TCTC",
1037
+ "TG TTG",
1038
+ "ATT TTTT",
1039
+ "AAG AAG",
1040
+ "TGGG GC",
1041
+ "AGC AGC",
1042
+ "G CCC",
1043
+ "TTTG GC",
1044
+ "AGGCTG AGGC",
1045
+ "TGGG CC",
1046
+ "TTC TCC",
1047
+ "TAG AA",
1048
+ "TGGAG TGC",
1049
+ "ATT AA",
1050
+ "AGTG CC",
1051
+ "TG TCTC",
1052
+ "AT ATGG",
1053
+ "AC ATC",
1054
+ "TGGG GG",
1055
+ "TG ACC",
1056
+ "AC TCC",
1057
+ "TAA AAC",
1058
+ "AG ATAA",
1059
+ "TAA TTTT",
1060
+ "TC AAC",
1061
+ "TC TAC",
1062
+ "TC TAG",
1063
+ "G AG",
1064
+ "TAA ATG",
1065
+ "AGC AA",
1066
+ "TAT ATG",
1067
+ "ATAT ATAT",
1068
+ "ATT TGC",
1069
+ "TCC TCC",
1070
+ "CCC AC",
1071
+ "ATT TATT",
1072
+ "TC TGCC",
1073
+ "ATGG CC",
1074
+ "TC GC",
1075
+ "AG TATT",
1076
+ "AGAA CC",
1077
+ "TT AAAC",
1078
+ "AA ATTC",
1079
+ "AG AGAC",
1080
+ "ATT TAC",
1081
+ "ATTG CC",
1082
+ "AAC AAC",
1083
+ "TT TAAC",
1084
+ "AC GG",
1085
+ "AAG AAAA",
1086
+ "TCTG GC",
1087
+ "ATTC TCC",
1088
+ "AGG TGG",
1089
+ "TGC TGC",
1090
+ "TTC AAG",
1091
+ "AG AGGG",
1092
+ "AC ACC",
1093
+ "TC TTTT",
1094
+ "AG AGGC",
1095
+ "ATC ACC",
1096
+ "TAA ATT",
1097
+ "AAGG CC",
1098
+ "TTGC AGTG",
1099
+ "TG TAC",
1100
+ "AA TTTC",
1101
+ "ATCC CC",
1102
+ "AC AAG",
1103
+ "AC AGG",
1104
+ "AC AAC",
1105
+ "TGCC CC",
1106
+ "AG ATTC",
1107
+ "TT AGAA",
1108
+ "TTGG GG",
1109
+ "AG ACAC",
1110
+ "TGG AAG",
1111
+ "ACC TCC",
1112
+ "ATG GGG",
1113
+ "AGCC TCC",
1114
+ "TT ATTG",
1115
+ "TAA AAG",
1116
+ "ATC TTC",
1117
+ "ATC TCC",
1118
+ "TGAA GC",
1119
+ "TAA TC",
1120
+ "AA ATGC",
1121
+ "TTG TTG",
1122
+ "ATT CCC",
1123
+ "TAC TAAAA",
1124
+ "AT AGTG",
1125
+ "AA ATAC",
1126
+ "TTGG GC",
1127
+ "TAG AGAC",
1128
+ "TG TTTT",
1129
+ "TTC TGC",
1130
+ "TGG CCC",
1131
+ "TCTG TC",
1132
+ "AGC TCC",
1133
+ "AAC TCC",
1134
+ "TT AGCC",
1135
+ "AAAG TGCTGGG",
1136
+ "AT AGAC",
1137
+ "TATT TTTAG",
1138
+ "AC TTG",
1139
+ "ACC ACC",
1140
+ "AA ACAC",
1141
+ "G TGG",
1142
+ "ATT TAG",
1143
+ "AGG AGC",
1144
+ "AGGC TGGAGTGC",
1145
+ "AT ACCC",
1146
+ "ATG TAA",
1147
+ "AC GC",
1148
+ "AG TAT",
1149
+ "TT TACC",
1150
+ "AC TAA",
1151
+ "AGG CCC",
1152
+ "AAGG GG",
1153
+ "TCTC G",
1154
+ "ATG AAG",
1155
+ "AA AGAC",
1156
+ "TG AAAA",
1157
+ "AAGG GC",
1158
+ "AT AGGC",
1159
+ "AG AGTG",
1160
+ "AGC TGC",
1161
+ "ATG TTC",
1162
+ "TATT TC",
1163
+ "TG ATC",
1164
+ "AG TTTG",
1165
+ "AGC TAA",
1166
+ "AG AGCC",
1167
+ "TGC TTC",
1168
+ "ATC ATC",
1169
+ "AAC ATGG",
1170
+ "AGC TTC",
1171
+ "AAG AAC",
1172
+ "TTTT TTG",
1173
+ "AGGG GG",
1174
+ "ATAA GC",
1175
+ "TAAG CC",
1176
+ "AC TGG",
1177
+ "AC AAAA",
1178
+ "ATC ATT",
1179
+ "TC TTTC",
1180
+ "ATG ATG",
1181
+ "TGC AA",
1182
+ "AGG TTC",
1183
+ "AAC ATT",
1184
+ "ATG GGC",
1185
+ "AT AGAG",
1186
+ "AA ATGG",
1187
+ "AG TTCC",
1188
+ "TT TAGC",
1189
+ "AAC TTC",
1190
+ "AGC AAG",
1191
+ "AT AAAAC",
1192
+ "AAAA TC",
1193
+ "AGCC AC",
1194
+ "AGG AAC",
1195
+ "TTAA CC",
1196
+ "TATT TATT",
1197
+ "TTTC TG",
1198
+ "ATAA GG",
1199
+ "AGCC ACC",
1200
+ "AG ATGC",
1201
+ "TTAA GC",
1202
+ "TTG TAA",
1203
+ "AG TGTG",
1204
+ "AACC CC",
1205
+ "TTC ATT",
1206
+ "ATC ATG",
1207
+ "AA TGAA",
1208
+ "AGG TGC",
1209
+ "AAAAAAAA AAAAAAAA",
1210
+ "AGG ATG",
1211
+ "AGCC G",
1212
+ "TGG TGG",
1213
+ "AG TGGG",
1214
+ "TGCAC TCCAGCC",
1215
+ "TATT GC",
1216
+ "TAG TC",
1217
+ "CCC G",
1218
+ "AAG TAA",
1219
+ "TAG TG",
1220
+ "TTTTTTTT TTTTTTTT",
1221
+ "AGC ATT",
1222
+ "ATC TGC",
1223
+ "TCTC AC",
1224
+ "AA ATTG",
1225
+ "TT TAGG",
1226
+ "AG ACCC",
1227
+ "GGG CC",
1228
+ "TCC TTC",
1229
+ "AT AGGG",
1230
+ "AA TATG",
1231
+ "TT ATAC",
1232
+ "TAG AAG",
1233
+ "AA AGTG",
1234
+ "AA ATCC",
1235
+ "TTCC TC",
1236
+ "TTTC AC",
1237
+ "AG TATG",
1238
+ "TACTAAAA ATAC",
1239
+ "ATG TGC",
1240
+ "AGG AGGC",
1241
+ "TAT ATC",
1242
+ "TTC TAA",
1243
+ "TG AGGC",
1244
+ "ACAC AC",
1245
+ "TCC CCC",
1246
+ "AAC ATC",
1247
+ "AAGC G",
1248
+ "AA TGGC",
1249
+ "ACC CCC",
1250
+ "AG ATAC",
1251
+ "AT AAAAG",
1252
+ "ATG ATT",
1253
+ "TGG AGG",
1254
+ "AG TTAA"
1255
+ ]
1256
+ }
1257
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, "A": 5, "C": 6, "G": 7, "T": 8, "TT": 9, "AA": 10, "TG": 11, "AG": 12, "CC": 13, "TC": 14, "AC": 15, "GG": 16, "ATT": 17, "AT": 18, "ATG": 19, "GC": 20, "TAA": 21, "TCC": 22, "ACC": 23, "AAAA": 24, "AGG": 25, "ATC": 26, "AGC": 27, "TTC": 28, "AAG": 29, "TTTT": 30, "TGC": 31, "TGG": 32, "AAC": 33, "TTG": 34, "TAG": 35, "TAC": 36, "CCC": 37, "TATT": 38, "TGGG": 39, "TAT": 40, "AGAA": 41, "AGGG": 42, "TTTC": 43, "AGGC": 44, "AGCC": 45, "ATAA": 46, "TGTG": 47, "TTGG": 48, "ATTC": 49, "AAGG": 50, "ACAC": 51, "TCCC": 52, "TCTC": 53, "TATG": 54, "TTTG": 55, "TTCC": 56, "AGTG": 57, "ATGG": 58, "AGAC": 59, "AAAC": 60, "ACCC": 61, "TGCC": 62, "ATTG": 63, "ATCC": 64, "AGAG": 65, "ATGC": 66, "ATAC": 67, "TCTG": 68, "TTAA": 69, "TCAC": 70, "TGAA": 71, "TGGC": 72, "TTGC": 73, "TAAG": 74, "TATC": 75, "TAAC": 76, "AAAG": 77, "TTAC": 78, "AAGC": 79, "GGG": 80, "TAGC": 81, "GGC": 82, "ATAT": 83, "TACC": 84, "AACC": 85, "AATG": 86, "TAGG": 87, "GCC": 88, "ATATT": 89, "AGTC": 90, "TTTTC": 91, "AAAAC": 92, "TGAC": 93, "TTTAA": 94, "AAAAG": 95, "AATC": 96, "TGTC": 97, "TTATT": 98, "ATAG": 99, "TGAG": 100, "TTTTG": 101, "AAATT": 102, "AATT": 103, "AATAA": 104, "TTTCC": 105, "ACAG": 106, "TCAG": 107, "AAATG": 108, "TGGGC": 109, "ACTC": 110, "AGGCC": 111, "TTAG": 112, "ACTG": 113, "ACG": 114, "ATATG": 115, "TGGCC": 116, "ATTTC": 117, "ACAA": 118, "ATCTC": 119, "TATTC": 120, "TGTAA": 121, "ACTT": 122, "ATGCC": 123, "TAAAA": 124, "AAAAAAAA": 125, "ATTCC": 126, "TTTAG": 127, "TCCCC": 128, "TTTGC": 129, "TTCCC": 130, "TGGGG": 131, "TTCTC": 132, "ATAAAA": 133, "AGAAG": 134, "TTTTTTTT": 135, "ACCCC": 136, "AGGGC": 137, "ACCTC": 138, "AGATG": 139, "ATTAC": 140, "AAGCC": 141, "GGCC": 142, "AGGAG": 143, "TCAA": 144, "ATTGC": 145, "TATTG": 146, "ATAAC": 147, "ATATC": 148, "TTTAC": 149, "ATGGC": 150, "AAGGC": 151, "ACCAC": 152, "GTG": 153, "ATCCC": 154, "AGAAC": 155, "ATTTT": 156, "TTGCC": 157, "AAATC": 158, "ATAAG": 159, "TTGGC": 160, "TGGAG": 161, "ATGGG": 162, "AAAGC": 163, "AGGGG": 164, "ATCAC": 165, "ATTTG": 166, "AATTC": 167, "TGCAC": 168, "TTTGG": 169, "TCG": 170, "AGAGC": 171, "AAAGG": 172, "GGGC": 173, "TTGGG": 174, "AGAAAA": 175, "TATCC": 176, "TCTCC": 177, "ATAGC": 178, "TGAGG": 179, "TTTATT": 180, "AGTAA": 181, "AGAGG": 182, "TCTTC": 183, "ACATT": 184, "TCCTG": 185, "AGCCC": 186, "TATGC": 187, "TTAAAA": 188, "AGATT": 189, "TTAAC": 190, "GGGG": 191, "AAGAC": 192, "TCATT": 193, "TTCTG": 194, "AGACC": 195, "AAGGG": 196, "ATACC": 197, "TTTAT": 198, "AAGTG": 199, "TTATG": 200, "AAGAA": 201, "TAGCC": 202, "TTCAC": 203, "AGGTG": 204, "TTGAA": 205, "ATCTG": 206, "AGCAC": 207, "TGCTG": 208, "AAACC": 209, "ATGTG": 210, "TTTTCC": 211, "AGTTC": 212, "TCCTC": 213, "TATGG": 214, "AATAC": 215, "AGTGG": 216, "TAGGC": 217, "AGCTC": 218, "ATAGG": 219, "TTATC": 220, "TTAAG": 221, "TACCC": 222, "TTTTTG": 223, "AACAC": 224, "TGCTC": 225, "AGATC": 226, "TCCCAGC": 227, "AGCTG": 228, "AATAG": 229, "TCTTG": 230, "AGTGGC": 231, "ATTGG": 232, "TACTC": 233, "TAAAC": 234, "AATGG": 235, "AGGTC": 236, "AGGAC": 237, "TTGTG": 238, "TATAC": 239, "ATTTTC": 240, "ATATAA": 241, "AGGCTG": 242, "ATTTAA": 243, "AGTT": 244, "AGTAG": 245, "ATGAC": 246, "AATGC": 247, "TCCAC": 248, "CCCC": 249, "ATGTC": 250, "AACTC": 251, "TTTTTC": 252, "TAAGC": 253, "AAGTC": 254, "TGGTG": 255, "TATAA": 256, "AGTGC": 257, "TAAGG": 258, "ACCTG": 259, "TTAGC": 260, "AAATAA": 261, "TGCCTC": 262, "AATCC": 263, "TTGGCC": 264, "TAGGG": 265, "TGGAC": 266, "TTGTC": 267, "AACCC": 268, "TTACC": 269, "TAACC": 270, "AATTTT": 271, "AAAGAA": 272, "ATTATT": 273, "AGCG": 274, "AAAAAC": 275, "TAATG": 276, "TTGAC": 277, "AGTCC": 278, "AACTG": 279, "AGTTG": 280, "AATTG": 281, "TCTGC": 282, "TTAGG": 283, "TACAC": 284, "AGAAGG": 285, "ATATTC": 286, "AAAACC": 287, "AAAAGC": 288, "TGCCC": 289, "ACTGC": 290, "AGAAGC": 291, "TAATAA": 292, "AATATT": 293, "ACCATG": 294, "TGGTC": 295, "TTTTGC": 296, "AACG": 297, "TACTG": 298, "ACACACAC": 299, "ATTTTG": 300, "TCCG": 301, "TGCG": 302, "AAAATG": 303, "ACATG": 304, "TCAGC": 305, "ATCG": 306, "AGTAC": 307, "TTTTGG": 308, "AATAT": 309, "AGAGAA": 310, "TTCG": 311, "TCCAGCC": 312, "ATATAC": 313, "TCACC": 314, "AAAAGG": 315, "TGTGTGTG": 316, "TCATC": 317, "TGCTGGG": 318, "TGAAG": 319, "TGTAG": 320, "TGTGG": 321, "AAAAATT": 322, "ACTTC": 323, "TTCCCC": 324, "ATAGAA": 325, "TTGCCC": 326, "AGGAGG": 327, "TTTCCC": 328, "TATATT": 329, "ACCG": 330, "ACTAC": 331, "TCACTGC": 332, "GCG": 333, "TTTGTG": 334, "ACAGC": 335, "TCATG": 336, "AGTTTT": 337, "AGGAA": 338, "TTTATG": 339, "ATATTG": 340, "TGATG": 341, "TCTAA": 342, "TGTGC": 343, "AGGAAG": 344, "TTTGGG": 345, "TGTTC": 346, "AGCCCC": 347, "AGTTTC": 348, "AGGCTGG": 349, "TTTGCC": 350, "ATTTCC": 351, "ATACAC": 352, "AAAATAA": 353, "TAGAC": 354, "AGGAGAA": 355, "TGAGC": 356, "TGGAA": 357, "TTTTTAA": 358, "AGCCTCCC": 359, "ATGAA": 360, "TTTAAG": 361, "TCTGG": 362, "TTTATC": 363, "TTATAA": 364, "TGATT": 365, "AACAA": 366, "TAGCTGGG": 367, "TCAAG": 368, "AAAAAA": 369, "ACTTTGGG": 370, "TATTCC": 371, "TCAGG": 372, "AACAG": 373, "TTCTTC": 374, "TGTGGC": 375, "ATATGC": 376, "ATTACAGGC": 377, "AGGGGC": 378, "AGGGCC": 379, "TTATTC": 380, "ATATCC": 381, "TGTAATCCCAGC": 382, "TACG": 383, "AGAAAC": 384, "TGTCC": 385, "AGATGG": 386, "TGTGCC": 387, "TTTCTC": 388, "TGAAC": 389, "AGTCTC": 390, "TGTTG": 391, "ATTTTTT": 392, "AAGAAG": 393, "TGGGGC": 394, "AGCAGC": 395, "GCCC": 396, "TTTGGC": 397, "AGGCTGAGGC": 398, "TGGGCC": 399, "TTCTCC": 400, "TAGAA": 401, "TGGAGTGC": 402, "ATTAA": 403, "AGTGCC": 404, "TGTCTC": 405, "ATATGG": 406, "ACATC": 407, "TGGGGG": 408, "TGACC": 409, "ACTCC": 410, "TAAAAC": 411, "AGATAA": 412, "TAATTTT": 413, "TCAAC": 414, "TCTAC": 415, "TCTAG": 416, "GAG": 417, "TAAATG": 418, "AGCAA": 419, "TATATG": 420, "ATATATAT": 421, "ATTTGC": 422, "TCCTCC": 423, "CCCAC": 424, "ATTTATT": 425, "TCTGCC": 426, "ATGGCC": 427, "TCGC": 428, "AGTATT": 429, "AGAACC": 430, "TTAAAC": 431, "AAATTC": 432, "AGAGAC": 433, "ATTTAC": 434, "ATTGCC": 435, "AACAAC": 436, "TTTAAC": 437, "ACGG": 438, "AAGAAAA": 439, "TCTGGC": 440, "ATTCTCC": 441, "AGGTGG": 442, "TGCTGC": 443, "TTCAAG": 444, "AGAGGG": 445, "ACACC": 446, "TCTTTT": 447, "AGAGGC": 448, "ATCACC": 449, "TAAATT": 450, "AAGGCC": 451, "TTGCAGTG": 452, "TGTAC": 453, "AATTTC": 454, "ATCCCC": 455, "ACAAG": 456, "ACAGG": 457, "ACAAC": 458, "TGCCCC": 459, "AGATTC": 460, "TTAGAA": 461, "TTGGGG": 462, "AGACAC": 463, "TGGAAG": 464, "ACCTCC": 465, "ATGGGG": 466, "AGCCTCC": 467, "TTATTG": 468, "TAAAAG": 469, "ATCTTC": 470, "ATCTCC": 471, "TGAAGC": 472, "TAATC": 473, "AAATGC": 474, "TTGTTG": 475, "ATTCCC": 476, "TACTAAAA": 477, "ATAGTG": 478, "AAATAC": 479, "TTGGGC": 480, "TAGAGAC": 481, "TGTTTT": 482, "TTCTGC": 483, "TGGCCC": 484, "TCTGTC": 485, "AGCTCC": 486, "AACTCC": 487, "TTAGCC": 488, "AAAGTGCTGGG": 489, "ATAGAC": 490, "TATTTTTAG": 491, "ACTTG": 492, "ACCACC": 493, "AAACAC": 494, "GTGG": 495, "ATTTAG": 496, "AGGAGC": 497, "AGGCTGGAGTGC": 498, "ATACCC": 499, "ATGTAA": 500, "ACGC": 501, "AGTAT": 502, "TTTACC": 503, "ACTAA": 504, "AGGCCC": 505, "AAGGGG": 506, "TCTCG": 507, "ATGAAG": 508, "AAAGAC": 509, "TGAAAA": 510, "AAGGGC": 511, "ATAGGC": 512, "AGAGTG": 513, "AGCTGC": 514, "ATGTTC": 515, "TATTTC": 516, "TGATC": 517, "AGTTTG": 518, "AGCTAA": 519, "AGAGCC": 520, "TGCTTC": 521, "ATCATC": 522, "AACATGG": 523, "AGCTTC": 524, "AAGAAC": 525, "TTTTTTG": 526, "AGGGGG": 527, "ATAAGC": 528, "TAAGCC": 529, "ACTGG": 530, "ACAAAA": 531, "ATCATT": 532, "TCTTTC": 533, "ATGATG": 534, "TGCAA": 535, "AGGTTC": 536, "AACATT": 537, "ATGGGC": 538, "ATAGAG": 539, "AAATGG": 540, "AGTTCC": 541, "TTTAGC": 542, "AACTTC": 543, "AGCAAG": 544, "ATAAAAC": 545, "AAAATC": 546, "AGCCAC": 547, "AGGAAC": 548, "TTAACC": 549, "TATTTATT": 550, "TTTCTG": 551, "ATAAGG": 552, "AGCCACC": 553, "AGATGC": 554, "TTAAGC": 555, "TTGTAA": 556, "AGTGTG": 557, "AACCCC": 558, "TTCATT": 559, "ATCATG": 560, "AATGAA": 561, "AGGTGC": 562, "AAAAAAAAAAAAAAAA": 563, "AGGATG": 564, "AGCCG": 565, "TGGTGG": 566, "AGTGGG": 567, "TGCACTCCAGCC": 568, "TATTGC": 569, "TAGTC": 570, "CCCG": 571, "AAGTAA": 572, "TAGTG": 573, "TTTTTTTTTTTTTTTT": 574, "AGCATT": 575, "ATCTGC": 576, "TCTCAC": 577, "AAATTG": 578, "TTTAGG": 579, "AGACCC": 580, "GGGCC": 581, "TCCTTC": 582, "ATAGGG": 583, "AATATG": 584, "TTATAC": 585, "TAGAAG": 586, "AAAGTG": 587, "AAATCC": 588, "TTCCTC": 589, "TTTCAC": 590, "AGTATG": 591, "TACTAAAAATAC": 592, "ATGTGC": 593, "AGGAGGC": 594, "TATATC": 595, "TTCTAA": 596, "TGAGGC": 597, "ACACAC": 598, "TCCCCC": 599, "AACATC": 600, "AAGCG": 601, "AATGGC": 602, "ACCCCC": 603, "AGATAC": 604, "ATAAAAG": 605, "ATGATT": 606, "TGGAGG": 607, "AGTTAA": 608, "": 609}