diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,100355 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 50264, + "content": "", + "single_word": false, + "lstrip": true, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "RobertaProcessing", + "sep": [ + "", + 2 + ], + "cls": [ + "", + 0 + ], + "trim_offsets": true, + "add_prefix_space": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + ".": 4, + "Ġthe": 5, + ",": 6, + "Ġto": 7, + "Ġand": 8, + "Ġof": 9, + "Ġa": 10, + "Ġin": 11, + "-": 12, + "Ġfor": 13, + "Ġthat": 14, + "Ġon": 15, + "Ġis": 16, + "âĢ": 17, + "'s": 18, + "Ġwith": 19, + "ĠThe": 20, + "Ġwas": 21, + "Ġ\"": 22, + "Ġat": 23, + "Ġit": 24, + "Ġas": 25, + "Ġsaid": 26, + "Ļ": 27, + "Ġbe": 28, + "s": 29, + "Ġby": 30, + "Ġfrom": 31, + "Ġare": 32, + "Ġhave": 33, + "Ġhas": 34, + ":": 35, + "Ġ(": 36, + "Ġhe": 37, + "ĠI": 38, + "Ġhis": 39, + "Ġwill": 40, + "Ġan": 41, + "Ġthis": 42, + ")": 43, + "ĠâĢ": 44, + "Ġnot": 45, + "Ŀ": 46, + "Ġyou": 47, + "ľ": 48, + "Ġtheir": 49, + "Ġor": 50, + "Ġthey": 51, + "Ġwe": 52, + "Ġbut": 53, + "Ġwho": 54, + "Ġmore": 55, + "Ġhad": 56, + "Ġbeen": 57, + "Ġwere": 58, + "Ġabout": 59, + ",\"": 60, + "Ġwhich": 61, + "Ġup": 62, + "Ġits": 63, + "Ġcan": 64, + "Ġone": 65, + "Ġout": 66, + "Ġalso": 67, + "Ġ$": 68, + "Ġher": 69, + "Ġall": 70, + "Ġafter": 71, + ".\"": 72, + "/": 73, + "Ġwould": 74, + "'t": 75, + "Ġyear": 76, + "Ġwhen": 77, + "Ġfirst": 78, + "Ġshe": 79, + "Ġtwo": 80, + "Ġover": 81, + "Ġpeople": 82, + "ĠA": 83, + "Ġour": 84, + "ĠIt": 85, + "Ġtime": 86, + "Ġthan": 87, + "Ġinto": 88, + "Ġthere": 89, + "t": 90, + "ĠHe": 91, + "Ġnew": 92, + "ĠâĢĶ": 93, + "Ġlast": 94, + "Ġjust": 95, + "ĠIn": 96, + "Ġother": 97, + "Ġso": 98, + "Ġwhat": 99, + "I": 100, + "Ġlike": 101, + "a": 102, + "Ġsome": 103, + "S": 104, + "ë": 105, + "Ġthem": 106, + "Ġyears": 107, + "'": 108, + "Ġdo": 109, + "Ġyour": 110, + "Ġ-": 111, + "Ġ1": 112, + "\"": 113, + "Ġif": 114, + "Ġcould": 115, + "?": 116, + "Ġno": 117, + "i": 118, + "m": 119, + "Ġget": 120, + "ĠU": 121, + "Ġnow": 122, + "Ġhim": 123, + "Ġback": 124, + "ĠBut": 125, + "ĠâĢĵ": 126, + "Ġmy": 127, + "Ġ'": 128, + "Ġonly": 129, + "Ġthree": 130, + ";": 131, + "Ġ2": 132, + "The": 133, + "1": 134, + "Ġpercent": 135, + "Ġagainst": 136, + "Ġbefore": 137, + "Ġcompany": 138, + "o": 139, + "ĠTrump": 140, + "Ġhow": 141, + "Ġbecause": 142, + "Ġany": 143, + "Ġmost": 144, + "Ġbeing": 145, + "Ġmake": 146, + "Ġwhere": 147, + "Ġduring": 148, + "Ġthrough": 149, + "Ġwhile": 150, + "000": 151, + "ĠThis": 152, + "Ġmillion": 153, + "ing": 154, + "Ġ3": 155, + "Ġmade": 156, + "Ġwell": 157, + "Ġ10": 158, + "Ġdown": 159, + "Ġoff": 160, + "Ġsays": 161, + "Ġme": 162, + "ĠB": 163, + "Ġgoing": 164, + "Ġteam": 165, + "ĠWe": 166, + "Ġthose": 167, + "Ġgovernment": 168, + "Ġway": 169, + "We": 170, + "Ġmany": 171, + "Ġthen": 172, + "Ġwork": 173, + "Ġtold": 174, + "com": 175, + "2": 176, + "Ġgame": 177, + "ĠAnd": 178, + "in": 179, + "year": 180, + "Ġp": 181, + "Ġvery": 182, + "Ġday": 183, + "Ġhome": 184, + "Ġtake": 185, + "Ġweek": 186, + "Ġsince": 187, + "ĠNew": 188, + "Ġmay": 189, + "Ġeven": 190, + "Ġseason": 191, + "Ġsee": 192, + "Ġ2017": 193, + "Ġstate": 194, + "Ġ5": 195, + "ed": 196, + "Ġshould": 197, + "Ġaround": 198, + "Ġ2018": 199, + "Ġsecond": 200, + "Ġus": 201, + "Ġstill": 202, + "Ġmuch": 203, + "Ġ4": 204, + "Ġgood": 205, + "Ġthink": 206, + "%": 207, + "ĠS": 208, + "Ġthese": 209, + "Ġmarket": 210, + "ĠD": 211, + "th": 212, + "Ġgo": 213, + "'re": 214, + "Ġsuch": 215, + "Ġknow": 216, + "Ġincluding": 217, + "Ġdon": 218, + "y": 219, + "Ġnext": 220, + "ĠP": 221, + "Ġdid": 222, + "Ġunder": 223, + "Ġsay": 224, + "en": 225, + "ĠL": 226, + "Ġbetween": 227, + "Ġper": 228, + "ĠK": 229, + "ĠC": 230, + "Ġ6": 231, + "Ġworld": 232, + "Ġpart": 233, + "ĠN": 234, + "Ġright": 235, + "Ġwant": 236, + "Ġfour": 237, + "),": 238, + "Ġhigh": 239, + "Ġneed": 240, + "re": 241, + "e": 242, + "It": 243, + "Ġhelp": 244, + "5": 245, + "3": 246, + "Ġcountry": 247, + "ĠR": 248, + "Ġpolice": 249, + "A": 250, + "Ġlong": 251, + "ĠThey": 252, + "Ġend": 253, + "er": 254, + "ĠT": 255, + "ĠM": 256, + "u": 257, + "Ġboth": 258, + "Ġhere": 259, + "an": 260, + "on": 261, + "Ġ7": 262, + "Ġde": 263, + "ĠShe": 264, + "Ġbusiness": 265, + "Ġreport": 266, + "j": 267, + "ers": 268, + "Ġreally": 269, + "ĠPresident": 270, + "ar": 271, + "ĠG": 272, + "ĠFriday": 273, + "ĠF": 274, + "Ġbest": 275, + "Ġsame": 276, + "Ġanother": 277, + "Ġset": 278, + "old": 279, + "ĠThat": 280, + "as": 281, + "n": 282, + "Ġcome": 283, + "Ġfamily": 284, + "Ġpublic": 285, + "ĠFor": 286, + "ĠAs": 287, + "0": 288, + "ĠH": 289, + "Ġ8": 290, + "Ġ20": 291, + "Ġfive": 292, + "es": 293, + "ĠTuesday": 294, + "Ġn": 295, + "ĠThursday": 296, + "Ġquarter": 297, + "h": 298, + "Ġtop": 299, + "Ġgot": 300, + "Ġlife": 301, + "ĠMonday": 302, + "Ġfound": 303, + "Ġuse": 304, + "ĠW": 305, + "4": 306, + "ĠWednesday": 307, + "Ġown": 308, + "Ġaccording": 309, + "Ġplay": 310, + "Ġshow": 311, + "ĠSt": 312, + "Ġman": 313, + "Ġleft": 314, + "ĠUnited": 315, + "Ġ12": 316, + "Ġplace": 317, + "ĠIf": 318, + "Ġlot": 319, + "Ġformer": 320, + "Ġ0": 321, + ").": 322, + "Ġsupport": 323, + "ie": 324, + "Ġbillion": 325, + "Ġt": 326, + "Ġshares": 327, + "!": 328, + "z": 329, + "k": 330, + "ĠState": 331, + "Ġpoints": 332, + "Ġgroup": 333, + "Ġschool": 334, + "Ġinformation": 335, + "Ġ2016": 336, + "al": 337, + "r": 338, + "Ġwin": 339, + "Ġnews": 340, + "Ġused": 341, + "Ġput": 342, + "Ġcity": 343, + "ĠJ": 344, + "ĠThere": 345, + "Ġnumber": 346, + "C": 347, + "'ve": 348, + "Ġeach": 349, + "Ġtoo": 350, + "Ġwon": 351, + "ly": 352, + "Ġmonth": 353, + "is": 354, + "Ġadded": 355, + "Ġlook": 356, + "Ġbetter": 357, + "Ġevery": 358, + "Ġ&": 359, + "Ġdays": 360, + "Ġ9": 361, + "Ġtook": 362, + "Ġnight": 363, + "Ġe": 364, + "Ġ11": 365, + "os": 366, + "Ġfew": 367, + "or": 368, + "ĠNorth": 369, + "ĠYou": 370, + "Ġthird": 371, + "Ġgreat": 372, + "Ġcalled": 373, + "ĠOn": 374, + "Ġpast": 375, + "Ġcame": 376, + "Ġmonths": 377, + "ĠSaturday": 378, + "Ġ15": 379, + "Ġbig": 380, + "ĠE": 381, + "ĠUS": 382, + "Ġthings": 383, + "ĠO": 384, + "Ġd": 385, + "Ġstart": 386, + "B": 387, + "Ġstock": 388, + "Ġ30": 389, + "Ġwomen": 390, + "ĠSouth": 391, + "ĠMay": 392, + "Ġnever": 393, + "Ġpresident": 394, + "ĠSunday": 395, + "Ġwithout": 396, + "man": 397, + "8": 398, + "Ġdidn": 399, + "Ġlocal": 400, + "6": 401, + "Ġsomething": 402, + "Ġcase": 403, + "ĠAll": 404, + "it": 405, + "7": 406, + "ĠSo": 407, + "Ġchildren": 408, + "Ġaway": 409, + "Ġlittle": 410, + "Ġsix": 411, + "ĠCity": 412, + "ĠCounty": 413, + "Ġdata": 414, + "at": 415, + "Ġalready": 416, + "d": 417, + "Ġmoney": 418, + "Ġearly": 419, + "Ġacross": 420, + "Ġexpected": 421, + "Ġrun": 422, + "Ġlater": 423, + "am": 424, + "Ġprice": 425, + "Ġgames": 426, + "ĠMr": 427, + "b": 428, + "Ġmight": 429, + "Ġdifferent": 430, + "Ġreported": 431, + "Ġdeal": 432, + "Ġmedia": 433, + "Ġgrowth": 434, + "Ġcommunity": 435, + "ĠChina": 436, + "'m": 437, + "c": 438, + "Ġwent": 439, + "ĠNo": 440, + "Ġable": 441, + "Ġmaking": 442, + "Ġarea": 443, + "Ġfar": 444, + "Ġstatement": 445, + "ĠHouse": 446, + "Ġworking": 447, + "M": 448, + "Ġk": 449, + "Ġseen": 450, + "Ġcompanies": 451, + "Ġtoday": 452, + "Ġmembers": 453, + "Ġuntil": 454, + "Ġfull": 455, + "Ġagain": 456, + "Ġhalf": 457, + "Ġshare": 458, + "le": 459, + "Ġalways": 460, + "Ġcourt": 461, + "l": 462, + "and": 463, + "Ġchange": 464, + "Ġfind": 465, + "9": 466, + "Ġsystem": 467, + "ĠV": 468, + "ĠYork": 469, + "ĠAmerican": 470, + "Ġhead": 471, + "Ġplayers": 472, + "Ġdoes": 473, + "Ġhealth": 474, + "Ġm": 475, + "Ġpower": 476, + "Ġpoint": 477, + "Ġhit": 478, + "Ġ.": 479, + "Ġ--": 480, + "Ġfree": 481, + ".,": 482, + "Ġlead": 483, + "Ġseveral": 484, + "Ġrecent": 485, + "Ġcall": 486, + "N": 487, + "Ġlaw": 488, + "Ġkeep": 489, + "Ġopen": 490, + "ĠNews": 491, + "Ġgive": 492, + "ia": 493, + "ĠMarch": 494, + "D": 495, + "ĠNational": 496, + "ĠAt": 497, + "Ġtimes": 498, + "Ġfuture": 499, + "R": 500, + "Ġ14": 501, + "ĠJune": 502, + "Ġofficials": 503, + "Ġ18": 504, + "Ġimportant": 505, + "f": 506, + "Ġfinal": 507, + "Ġ13": 508, + "ĠOne": 509, + "P": 510, + "Ġfollowing": 511, + "Ġcar": 512, + "Ġleast": 513, + "Ġwater": 514, + "Ġevent": 515, + "Ġline": 516, + "Ġmove": 517, + "Ġservices": 518, + "Ġhaving": 519, + "ĠWhen": 520, + "Ġstudents": 521, + "ĠPolice": 522, + "el": 523, + "Ġam": 524, + "ĠZ": 525, + "Ġside": 526, + "Ġstory": 527, + "Ġdue": 528, + "Ġmeeting": 529, + "K": 530, + "Ġmust": 531, + "ĠStates": 532, + "Ġlikely": 533, + "G": 534, + "Ġcontinue": 535, + "Ġago": 536, + "Ġparty": 537, + "Ġmajor": 538, + "Ġindustry": 539, + "Ġless": 540, + "30": 541, + "Ġun": 542, + "Ġhard": 543, + "Ġservice": 544, + "Ġ16": 545, + "Ġlooking": 546, + "Ġheld": 547, + "ve": 548, + "Ġwhether": 549, + "ĠJuly": 550, + "Ġtaken": 551, + "Ġalong": 552, + "Ġasked": 553, + "Ġstarted": 554, + "Ġbecome": 555, + "Ġforward": 556, + "Ġresearch": 557, + "Ġoffice": 558, + "Ġpolitical": 559, + "to": 560, + "Ġtogether": 561, + "Ġgetting": 562, + "Ġplan": 563, + "Ġ25": 564, + "T": 565, + "Ġamong": 566, + "Ġcoming": 567, + "Ġdecision": 568, + "Ġvideo": 569, + "Ġ2015": 570, + "g": 571, + "ĠAfter": 572, + "Ġsecurity": 573, + "L": 574, + "Ġcare": 575, + "Ġgiven": 576, + "Ġavailable": 577, + "âĢĶ": 578, + "Ġs": 579, + "ĠWest": 580, + "'ll": 581, + "Ġpay": 582, + "Ġnear": 583, + "Ġsaying": 584, + "Ġannounced": 585, + "Ġprogram": 586, + "ĠApril": 587, + "Ġreal": 588, + "ĠUniversity": 589, + "ĠWith": 590, + "AP": 591, + "Ġsocial": 592, + "Ġclose": 593, + "et": 594, + "Ġcurrent": 595, + "Ġwhy": 596, + "F": 597, + "ĠTo": 598, + "ĠTwitter": 599, + "Ġthough": 600, + "Ġ17": 601, + "Ġtaking": 602, + "ĠInc": 603, + "Ġmen": 604, + "w": 605, + "Ġcomes": 606, + "ley": 607, + "Ġdoing": 608, + "Ġprocess": 609, + "ĠJohn": 610, + "ch": 611, + "00": 612, + "Ġfinancial": 613, + "Ġlow": 614, + "Ġenough": 615, + "ĠWhile": 616, + "Ġfurther": 617, + "Ġpost": 618, + "Ġfeel": 619, + "st": 620, + "Ġperson": 621, + "ĠFacebook": 622, + "ĠWorld": 623, + "Ġwithin": 624, + "ad": 625, + "Ġdone": 626, + "the": 627, + "Ġlate": 628, + "Ġtax": 629, + "Ġdoesn": 630, + "Ġthing": 631, + "Ġnational": 632, + "Ġjob": 633, + "Ġusing": 634, + "ĠHowever": 635, + "ic": 636, + "Ġcampaign": 637, + "Ġrecord": 638, + "Ġbehind": 639, + "://": 640, + "ĠDepartment": 641, + "p": 642, + "Ġothers": 643, + "ĠJanuary": 644, + "Ġorder": 645, + "Ġ[": 646, + "Ġsales": 647, + "Ġyet": 648, + "Ä": 649, + "Ġsmall": 650, + "Ġseries": 651, + "Ġface": 652, + "ĠWhat": 653, + "Ġ50": 654, + "Ġever": 655, + "Ġearlier": 656, + "Ġlove": 657, + "up": 658, + "Ġrights": 659, + "ĠAn": 660, + "ist": 661, + "Ġmorning": 662, + "ĠWashington": 663, + "Ġyoung": 664, + "Ġlatest": 665, + "ĠIndia": 666, + "Ġtrying": 667, + "Ġfire": 668, + "Ġled": 669, + "Ġstrong": 670, + "Ġreturn": 671, + "Ġlevel": 672, + "O": 673, + "Ġaverage": 674, + "Ġperiod": 675, + "Ġexperience": 676, + "ak": 677, + "Ġpossible": 678, + "Ġbelieve": 679, + "Ġinclude": 680, + "Ġoil": 681, + "Ġrecently": 682, + "Ġonce": 683, + "Ġknown": 684, + "Ġlost": 685, + "Ġsure": 686, + "us": 687, + "Ġweeks": 688, + "Ġfood": 689, + "Ġreports": 690, + "Ġrating": 691, + "ĠMinister": 692, + "Ġwoman": 693, + "Ġprovide": 694, + "Ġproject": 695, + "Ġissue": 696, + "Ġlive": 697, + "10": 698, + "Ġclear": 699, + "he": 700, + "Ġcost": 701, + "Ġplayed": 702, + "Ġreleased": 703, + "Ġcoach": 704, + "v": 705, + "Ġ24": 706, + "Ġseven": 707, + "Ġplans": 708, + "Ġdevelopment": 709, + "ur": 710, + "ĺ": 711, + "Ġincrease": 712, + "This": 713, + "Ġpolicy": 714, + "Ġcent": 715, + "Ġbased": 716, + "E": 717, + "il": 718, + "ĠDecember": 719, + "Ġglobal": 720, + "Ġtrade": 721, + "Ġhours": 722, + "Ġhigher": 723, + "Ġgoal": 724, + "H": 725, + "ĠAl": 726, + "Ġ100": 727, + "Ġminutes": 728, + "Ġelection": 729, + "ĠAmerica": 730, + "Ġrate": 731, + "ĠCh": 732, + "Ġ21": 733, + "...": 734, + "ĠWhite": 735, + "Ġdirector": 736, + "Ġposition": 737, + "Ġshot": 738, + "Ġlarge": 739, + "Ġc": 740, + "Ġb": 741, + "]": 742, + "Ġissues": 743, + "Ġdeath": 744, + "Ġbuilding": 745, + "Ġtotal": 746, + "Ġoften": 747, + "Ġv": 748, + "Ġcountries": 749, + "Ġhistory": 750, + "Ġoutside": 751, + "Ġfederal": 752, + "Ġ19": 753, + "Ġfact": 754, + "ĠHigh": 755, + "Ġcareer": 756, + "im": 757, + "Ġinternational": 758, + "ĠNovember": 759, + "Ġfront": 760, + "Ġkind": 761, + "Ġkey": 762, + "ra": 763, + "ĠSan": 764, + "Ġshort": 765, + "Ġname": 766, + "ĠAccording": 767, + "Ġcourse": 768, + "Ġre": 769, + "Ġwanted": 770, + "W": 771, + "ĠSeptember": 772, + "Ġinterest": 773, + "Ġrole": 774, + "Ġresults": 775, + "Ġeconomic": 776, + "Ġ2014": 777, + "Ġchance": 778, + "ĠOctober": 779, + "Ġspecial": 780, + "Ġofficial": 781, + "Ġneeds": 782, + "um": 783, + "Ġl": 784, + "Ġproducts": 785, + "Ġnon": 786, + "Ġ@": 787, + "ĠBank": 788, + "Ġahead": 789, + "Ġhouse": 790, + "U": 791, + "Ġboard": 792, + "Ġold": 793, + "Ġsaw": 794, + "Ġlower": 795, + "ĠEuropean": 796, + "Ġcontrol": 797, + "ĠRussia": 798, + "Ġeight": 799, + "Ġrelease": 800, + "Ġpotential": 801, + "Ġthought": 802, + "Ġinvestigation": 803, + "Ġonline": 804, + "based": 805, + "Ġtechnology": 806, + "ĠDonald": 807, + "id": 808, + "Ġbody": 809, + "Ġrisk": 810, + "ian": 811, + "Ġcapital": 812, + "Ġstaff": 813, + "Ġaction": 814, + "ĠLeague": 815, + "Ġplaying": 816, + "Ġmakes": 817, + "Ġalmost": 818, + "Ġperformance": 819, + "Ġ22": 820, + "Ġg": 821, + "Ġfilm": 822, + "Ġnearly": 823, + "ĠCenter": 824, + "Ġvisit": 825, + "ĠGroup": 826, + "Ġbank": 827, + "Ġbit": 828, + "Ġreceived": 829, + "ĠAugust": 830, + "Ġmilitary": 831, + "ĠHis": 832, + "ine": 833, + "Ġchief": 834, + "ĠSchool": 835, + "Ġbring": 836, + "ĠCourt": 837, + "Ġ(@": 838, + "Ġmeans": 839, + "ĠSh": 840, + "Ġfans": 841, + "Ġse": 842, + "Ġ40": 843, + "20": 844, + "\".": 845, + "V": 846, + "Ġcut": 847, + "Ġkilled": 848, + "Ġ#": 849, + "Ġprices": 850, + "Ġgave": 851, + "ĠStreet": 852, + "ir": 853, + "ĠY": 854, + "Ġcurrently": 855, + "Ġf": 856, + "ay": 857, + "ne": 858, + "te": 859, + "Ġtry": 860, + "ĠPark": 861, + "ĥ": 862, + "J": 863, + "Ġquestion": 864, + "Ġhand": 865, + "Ġeconomy": 866, + "Ġinvestors": 867, + "able": 868, + "Ġplayer": 869, + "ĠBy": 870, + "ĠDavid": 871, + "Ġloss": 872, + "ab": 873, + "Ġbelow": 874, + "Ġwr