{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009741248097412482, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.8706240487062404e-05, "grad_norm": 5.851158142089844, "learning_rate": 1.0000000000000002e-06, "loss": 2.1245, "step": 1 }, { "epoch": 9.741248097412481e-05, "grad_norm": 18.33625602722168, "learning_rate": 2.0000000000000003e-06, "loss": 3.869, "step": 2 }, { "epoch": 0.00014611872146118722, "grad_norm": 15.931517601013184, "learning_rate": 3e-06, "loss": 3.1648, "step": 3 }, { "epoch": 0.00019482496194824962, "grad_norm": 26.00968360900879, "learning_rate": 4.000000000000001e-06, "loss": 2.3093, "step": 4 }, { "epoch": 0.000243531202435312, "grad_norm": 27.92300033569336, "learning_rate": 5e-06, "loss": 2.221, "step": 5 }, { "epoch": 0.00029223744292237444, "grad_norm": 9.341499328613281, "learning_rate": 6e-06, "loss": 2.2322, "step": 6 }, { "epoch": 0.0003409436834094368, "grad_norm": 13.460984230041504, "learning_rate": 7.000000000000001e-06, "loss": 2.5544, "step": 7 }, { "epoch": 0.00038964992389649923, "grad_norm": 12.86645793914795, "learning_rate": 8.000000000000001e-06, "loss": 2.537, "step": 8 }, { "epoch": 0.00043835616438356166, "grad_norm": 15.244658470153809, "learning_rate": 9e-06, "loss": 2.7317, "step": 9 }, { "epoch": 0.000487062404870624, "grad_norm": 11.827373504638672, "learning_rate": 1e-05, "loss": 2.2533, "step": 10 }, { "epoch": 0.0005357686453576865, "grad_norm": 10.726211547851562, "learning_rate": 1.1000000000000001e-05, "loss": 2.2206, "step": 11 }, { "epoch": 0.0005844748858447489, "grad_norm": 13.430566787719727, "learning_rate": 1.2e-05, "loss": 2.4726, "step": 12 }, { "epoch": 0.0006331811263318112, "grad_norm": 15.749054908752441, "learning_rate": 1.3000000000000001e-05, "loss": 2.3328, "step": 13 }, { "epoch": 0.0006818873668188736, "grad_norm": 13.098236083984375, "learning_rate": 1.4000000000000001e-05, "loss": 2.3669, "step": 14 }, { "epoch": 0.0007305936073059361, "grad_norm": 14.811712265014648, "learning_rate": 1.5e-05, "loss": 2.3254, "step": 15 }, { "epoch": 0.0007792998477929985, "grad_norm": 13.960674285888672, "learning_rate": 1.6000000000000003e-05, "loss": 2.3478, "step": 16 }, { "epoch": 0.0008280060882800608, "grad_norm": 13.510299682617188, "learning_rate": 1.7000000000000003e-05, "loss": 2.3439, "step": 17 }, { "epoch": 0.0008767123287671233, "grad_norm": 21.18514060974121, "learning_rate": 1.8e-05, "loss": 2.6721, "step": 18 }, { "epoch": 0.0009254185692541857, "grad_norm": 17.1453800201416, "learning_rate": 1.9e-05, "loss": 2.1923, "step": 19 }, { "epoch": 0.000974124809741248, "grad_norm": 15.410392761230469, "learning_rate": 2e-05, "loss": 2.4277, "step": 20 }, { "epoch": 0.0010228310502283105, "grad_norm": 17.85428810119629, "learning_rate": 2.1e-05, "loss": 2.1419, "step": 21 }, { "epoch": 0.001071537290715373, "grad_norm": 16.322954177856445, "learning_rate": 2.2000000000000003e-05, "loss": 2.4326, "step": 22 }, { "epoch": 0.0011202435312024353, "grad_norm": 18.91599464416504, "learning_rate": 2.3000000000000003e-05, "loss": 2.1728, "step": 23 }, { "epoch": 0.0011689497716894977, "grad_norm": 14.737173080444336, "learning_rate": 2.4e-05, "loss": 2.2486, "step": 24 }, { "epoch": 0.0012176560121765602, "grad_norm": 14.996482849121094, "learning_rate": 2.5e-05, "loss": 1.9959, "step": 25 }, { "epoch": 0.0012663622526636225, "grad_norm": 12.736573219299316, "learning_rate": 2.6000000000000002e-05, "loss": 1.8652, "step": 26 }, { "epoch": 0.001315068493150685, "grad_norm": 13.891550064086914, "learning_rate": 2.7000000000000002e-05, "loss": 2.3333, "step": 27 }, { "epoch": 0.0013637747336377472, "grad_norm": 11.90274429321289, "learning_rate": 2.8000000000000003e-05, "loss": 1.8707, "step": 28 }, { "epoch": 0.0014124809741248097, "grad_norm": 16.16830062866211, "learning_rate": 2.9e-05, "loss": 1.9199, "step": 29 }, { "epoch": 0.0014611872146118722, "grad_norm": 12.124021530151367, "learning_rate": 3e-05, "loss": 1.8475, "step": 30 }, { "epoch": 0.0015098934550989344, "grad_norm": 13.702016830444336, "learning_rate": 3.1e-05, "loss": 1.6096, "step": 31 }, { "epoch": 0.001558599695585997, "grad_norm": 16.256940841674805, "learning_rate": 3.2000000000000005e-05, "loss": 2.0673, "step": 32 }, { "epoch": 0.0016073059360730594, "grad_norm": 11.912320137023926, "learning_rate": 3.3e-05, "loss": 1.5261, "step": 33 }, { "epoch": 0.0016560121765601217, "grad_norm": 12.158217430114746, "learning_rate": 3.4000000000000007e-05, "loss": 1.566, "step": 34 }, { "epoch": 0.0017047184170471841, "grad_norm": 11.484949111938477, "learning_rate": 3.5e-05, "loss": 2.0858, "step": 35 }, { "epoch": 0.0017534246575342466, "grad_norm": 12.884050369262695, "learning_rate": 3.6e-05, "loss": 1.0508, "step": 36 }, { "epoch": 0.0018021308980213089, "grad_norm": 14.054976463317871, "learning_rate": 3.7e-05, "loss": 1.2539, "step": 37 }, { "epoch": 0.0018508371385083714, "grad_norm": 13.9093599319458, "learning_rate": 3.8e-05, "loss": 1.5044, "step": 38 }, { "epoch": 0.0018995433789954338, "grad_norm": 42.75831604003906, "learning_rate": 3.9000000000000006e-05, "loss": 1.2133, "step": 39 }, { "epoch": 0.001948249619482496, "grad_norm": 13.14990234375, "learning_rate": 4e-05, "loss": 1.4692, "step": 40 }, { "epoch": 0.0019969558599695586, "grad_norm": 11.442179679870605, "learning_rate": 4.1e-05, "loss": 1.1273, "step": 41 }, { "epoch": 0.002045662100456621, "grad_norm": 9.139272689819336, "learning_rate": 4.2e-05, "loss": 0.6377, "step": 42 }, { "epoch": 0.0020943683409436835, "grad_norm": 9.682751655578613, "learning_rate": 4.3e-05, "loss": 0.8092, "step": 43 }, { "epoch": 0.002143074581430746, "grad_norm": 10.700410842895508, "learning_rate": 4.4000000000000006e-05, "loss": 0.9342, "step": 44 }, { "epoch": 0.002191780821917808, "grad_norm": 8.799379348754883, "learning_rate": 4.5e-05, "loss": 0.6458, "step": 45 }, { "epoch": 0.0022404870624048705, "grad_norm": 7.537528038024902, "learning_rate": 4.600000000000001e-05, "loss": 0.3726, "step": 46 }, { "epoch": 0.002289193302891933, "grad_norm": 9.551314353942871, "learning_rate": 4.7e-05, "loss": 0.6441, "step": 47 }, { "epoch": 0.0023378995433789955, "grad_norm": 13.743586540222168, "learning_rate": 4.8e-05, "loss": 1.0114, "step": 48 }, { "epoch": 0.002386605783866058, "grad_norm": 9.87704849243164, "learning_rate": 4.9e-05, "loss": 0.5438, "step": 49 }, { "epoch": 0.0024353120243531205, "grad_norm": 20.906862258911133, "learning_rate": 5e-05, "loss": 1.8715, "step": 50 }, { "epoch": 0.0024840182648401825, "grad_norm": 8.490747451782227, "learning_rate": 5.1000000000000006e-05, "loss": 2.5756, "step": 51 }, { "epoch": 0.002532724505327245, "grad_norm": 12.285213470458984, "learning_rate": 5.2000000000000004e-05, "loss": 2.3924, "step": 52 }, { "epoch": 0.0025814307458143075, "grad_norm": 15.3045015335083, "learning_rate": 5.300000000000001e-05, "loss": 1.8956, "step": 53 }, { "epoch": 0.00263013698630137, "grad_norm": 15.969414710998535, "learning_rate": 5.4000000000000005e-05, "loss": 1.8218, "step": 54 }, { "epoch": 0.0026788432267884324, "grad_norm": 13.07938003540039, "learning_rate": 5.500000000000001e-05, "loss": 1.8747, "step": 55 }, { "epoch": 0.0027275494672754945, "grad_norm": 9.398449897766113, "learning_rate": 5.6000000000000006e-05, "loss": 1.8304, "step": 56 }, { "epoch": 0.002776255707762557, "grad_norm": 7.938607215881348, "learning_rate": 5.6999999999999996e-05, "loss": 1.9411, "step": 57 }, { "epoch": 0.0028249619482496194, "grad_norm": 9.889793395996094, "learning_rate": 5.8e-05, "loss": 2.0788, "step": 58 }, { "epoch": 0.002873668188736682, "grad_norm": 7.4011101722717285, "learning_rate": 5.9e-05, "loss": 1.954, "step": 59 }, { "epoch": 0.0029223744292237444, "grad_norm": 7.340896129608154, "learning_rate": 6e-05, "loss": 1.9234, "step": 60 }, { "epoch": 0.002971080669710807, "grad_norm": 13.956856727600098, "learning_rate": 6.1e-05, "loss": 2.3137, "step": 61 }, { "epoch": 0.003019786910197869, "grad_norm": 12.846822738647461, "learning_rate": 6.2e-05, "loss": 1.5154, "step": 62 }, { "epoch": 0.0030684931506849314, "grad_norm": 10.943364143371582, "learning_rate": 6.3e-05, "loss": 1.9186, "step": 63 }, { "epoch": 0.003117199391171994, "grad_norm": 9.687166213989258, "learning_rate": 6.400000000000001e-05, "loss": 1.5523, "step": 64 }, { "epoch": 0.0031659056316590563, "grad_norm": 9.057082176208496, "learning_rate": 6.500000000000001e-05, "loss": 1.9528, "step": 65 }, { "epoch": 0.003214611872146119, "grad_norm": 12.989787101745605, "learning_rate": 6.6e-05, "loss": 1.9981, "step": 66 }, { "epoch": 0.0032633181126331813, "grad_norm": 12.194509506225586, "learning_rate": 6.7e-05, "loss": 2.0458, "step": 67 }, { "epoch": 0.0033120243531202433, "grad_norm": 14.83133316040039, "learning_rate": 6.800000000000001e-05, "loss": 2.1763, "step": 68 }, { "epoch": 0.003360730593607306, "grad_norm": 12.523411750793457, "learning_rate": 6.9e-05, "loss": 2.1116, "step": 69 }, { "epoch": 0.0034094368340943683, "grad_norm": 10.275344848632812, "learning_rate": 7e-05, "loss": 2.2137, "step": 70 }, { "epoch": 0.0034581430745814308, "grad_norm": 11.111023902893066, "learning_rate": 7.1e-05, "loss": 2.3323, "step": 71 }, { "epoch": 0.0035068493150684932, "grad_norm": 11.215889930725098, "learning_rate": 7.2e-05, "loss": 2.2838, "step": 72 }, { "epoch": 0.0035555555555555557, "grad_norm": 11.466020584106445, "learning_rate": 7.3e-05, "loss": 2.3188, "step": 73 }, { "epoch": 0.0036042617960426178, "grad_norm": 12.254678726196289, "learning_rate": 7.4e-05, "loss": 1.899, "step": 74 }, { "epoch": 0.0036529680365296802, "grad_norm": 12.059733390808105, "learning_rate": 7.500000000000001e-05, "loss": 1.8779, "step": 75 }, { "epoch": 0.0037016742770167427, "grad_norm": 14.260489463806152, "learning_rate": 7.6e-05, "loss": 1.941, "step": 76 }, { "epoch": 0.003750380517503805, "grad_norm": 11.856407165527344, "learning_rate": 7.7e-05, "loss": 1.8031, "step": 77 }, { "epoch": 0.0037990867579908677, "grad_norm": 13.23192024230957, "learning_rate": 7.800000000000001e-05, "loss": 1.8466, "step": 78 }, { "epoch": 0.00384779299847793, "grad_norm": 14.03378677368164, "learning_rate": 7.900000000000001e-05, "loss": 2.0481, "step": 79 }, { "epoch": 0.003896499238964992, "grad_norm": 12.832358360290527, "learning_rate": 8e-05, "loss": 2.223, "step": 80 }, { "epoch": 0.003945205479452055, "grad_norm": 11.655765533447266, "learning_rate": 8.1e-05, "loss": 1.6975, "step": 81 }, { "epoch": 0.003993911719939117, "grad_norm": 13.00943660736084, "learning_rate": 8.2e-05, "loss": 1.6346, "step": 82 }, { "epoch": 0.00404261796042618, "grad_norm": 13.812478065490723, "learning_rate": 8.3e-05, "loss": 1.818, "step": 83 }, { "epoch": 0.004091324200913242, "grad_norm": 10.820585250854492, "learning_rate": 8.4e-05, "loss": 1.4317, "step": 84 }, { "epoch": 0.004140030441400305, "grad_norm": 11.540773391723633, "learning_rate": 8.5e-05, "loss": 1.6072, "step": 85 }, { "epoch": 0.004188736681887367, "grad_norm": 12.939353942871094, "learning_rate": 8.6e-05, "loss": 1.5664, "step": 86 }, { "epoch": 0.0042374429223744296, "grad_norm": 13.942463874816895, "learning_rate": 8.7e-05, "loss": 1.6459, "step": 87 }, { "epoch": 0.004286149162861492, "grad_norm": 10.26823902130127, "learning_rate": 8.800000000000001e-05, "loss": 1.2702, "step": 88 }, { "epoch": 0.004334855403348554, "grad_norm": 11.739928245544434, "learning_rate": 8.900000000000001e-05, "loss": 1.4725, "step": 89 }, { "epoch": 0.004383561643835616, "grad_norm": 9.966146469116211, "learning_rate": 9e-05, "loss": 0.7986, "step": 90 }, { "epoch": 0.004432267884322679, "grad_norm": 12.615833282470703, "learning_rate": 9.1e-05, "loss": 1.2814, "step": 91 }, { "epoch": 0.004480974124809741, "grad_norm": 10.074495315551758, "learning_rate": 9.200000000000001e-05, "loss": 0.6983, "step": 92 }, { "epoch": 0.0045296803652968036, "grad_norm": 14.656659126281738, "learning_rate": 9.300000000000001e-05, "loss": 0.8942, "step": 93 }, { "epoch": 0.004578386605783866, "grad_norm": 8.69896411895752, "learning_rate": 9.4e-05, "loss": 0.6231, "step": 94 }, { "epoch": 0.0046270928462709285, "grad_norm": 9.49130630493164, "learning_rate": 9.5e-05, "loss": 0.5908, "step": 95 }, { "epoch": 0.004675799086757991, "grad_norm": 10.93470287322998, "learning_rate": 9.6e-05, "loss": 0.9713, "step": 96 }, { "epoch": 0.0047245053272450535, "grad_norm": 14.361600875854492, "learning_rate": 9.7e-05, "loss": 1.1417, "step": 97 }, { "epoch": 0.004773211567732116, "grad_norm": 10.079813003540039, "learning_rate": 9.8e-05, "loss": 0.8555, "step": 98 }, { "epoch": 0.004821917808219178, "grad_norm": 10.335041999816895, "learning_rate": 9.900000000000001e-05, "loss": 0.7722, "step": 99 }, { "epoch": 0.004870624048706241, "grad_norm": 16.433164596557617, "learning_rate": 0.0001, "loss": 1.3996, "step": 100 }, { "epoch": 0.0049193302891933025, "grad_norm": 10.94658374786377, "learning_rate": 9.997532801828658e-05, "loss": 2.4347, "step": 101 }, { "epoch": 0.004968036529680365, "grad_norm": 17.193456649780273, "learning_rate": 9.990133642141359e-05, "loss": 3.2929, "step": 102 }, { "epoch": 0.0050167427701674275, "grad_norm": 12.511935234069824, "learning_rate": 9.977809823015401e-05, "loss": 1.2897, "step": 103 }, { "epoch": 0.00506544901065449, "grad_norm": 10.709287643432617, "learning_rate": 9.96057350657239e-05, "loss": 1.7389, "step": 104 }, { "epoch": 0.0051141552511415524, "grad_norm": 12.061141014099121, "learning_rate": 9.938441702975689e-05, "loss": 1.0578, "step": 105 }, { "epoch": 0.005162861491628615, "grad_norm": 8.664139747619629, "learning_rate": 9.911436253643445e-05, "loss": 1.5287, "step": 106 }, { "epoch": 0.005211567732115677, "grad_norm": 8.517009735107422, "learning_rate": 9.879583809693738e-05, "loss": 1.5803, "step": 107 }, { "epoch": 0.00526027397260274, "grad_norm": 7.50566291809082, "learning_rate": 9.842915805643155e-05, "loss": 1.6515, "step": 108 }, { "epoch": 0.005308980213089802, "grad_norm": 7.030799388885498, "learning_rate": 9.801468428384716e-05, "loss": 1.6422, "step": 109 }, { "epoch": 0.005357686453576865, "grad_norm": 8.954855918884277, "learning_rate": 9.755282581475769e-05, "loss": 2.0945, "step": 110 }, { "epoch": 0.005406392694063927, "grad_norm": 11.065245628356934, "learning_rate": 9.704403844771128e-05, "loss": 1.7066, "step": 111 }, { "epoch": 0.005455098934550989, "grad_norm": 8.86803150177002, "learning_rate": 9.648882429441257e-05, "loss": 1.2714, "step": 112 }, { "epoch": 0.005503805175038051, "grad_norm": 8.038043022155762, "learning_rate": 9.588773128419906e-05, "loss": 1.3595, "step": 113 }, { "epoch": 0.005552511415525114, "grad_norm": 7.1317877769470215, "learning_rate": 9.524135262330098e-05, "loss": 1.012, "step": 114 }, { "epoch": 0.005601217656012176, "grad_norm": 9.007568359375, "learning_rate": 9.45503262094184e-05, "loss": 1.7679, "step": 115 }, { "epoch": 0.005649923896499239, "grad_norm": 9.552806854248047, "learning_rate": 9.381533400219318e-05, "loss": 1.6309, "step": 116 }, { "epoch": 0.005698630136986301, "grad_norm": 13.193597793579102, "learning_rate": 9.30371013501972e-05, "loss": 1.8146, "step": 117 }, { "epoch": 0.005747336377473364, "grad_norm": 10.52649974822998, "learning_rate": 9.221639627510076e-05, "loss": 2.1017, "step": 118 }, { "epoch": 0.005796042617960426, "grad_norm": 13.295526504516602, "learning_rate": 9.135402871372808e-05, "loss": 1.8865, "step": 119 }, { "epoch": 0.005844748858447489, "grad_norm": 11.041900634765625, "learning_rate": 9.045084971874738e-05, "loss": 1.8419, "step": 120 }, { "epoch": 0.005893455098934551, "grad_norm": 12.388705253601074, "learning_rate": 8.950775061878453e-05, "loss": 2.1317, "step": 121 }, { "epoch": 0.005942161339421614, "grad_norm": 10.880697250366211, "learning_rate": 8.852566213878947e-05, "loss": 2.2668, "step": 122 }, { "epoch": 0.005990867579908676, "grad_norm": 9.947246551513672, "learning_rate": 8.750555348152298e-05, "loss": 2.0057, "step": 123 }, { "epoch": 0.006039573820395738, "grad_norm": 11.419881820678711, "learning_rate": 8.644843137107059e-05, "loss": 2.2503, "step": 124 }, { "epoch": 0.0060882800608828, "grad_norm": 17.00235939025879, "learning_rate": 8.535533905932738e-05, "loss": 1.7933, "step": 125 }, { "epoch": 0.006136986301369863, "grad_norm": 12.406261444091797, "learning_rate": 8.422735529643444e-05, "loss": 2.0982, "step": 126 }, { "epoch": 0.006185692541856925, "grad_norm": 11.130414009094238, "learning_rate": 8.306559326618259e-05, "loss": 2.1638, "step": 127 }, { "epoch": 0.006234398782343988, "grad_norm": 12.033727645874023, "learning_rate": 8.18711994874345e-05, "loss": 2.3324, "step": 128 }, { "epoch": 0.00628310502283105, "grad_norm": 10.567495346069336, "learning_rate": 8.064535268264883e-05, "loss": 2.0464, "step": 129 }, { "epoch": 0.006331811263318113, "grad_norm": 11.884528160095215, "learning_rate": 7.938926261462366e-05, "loss": 1.7445, "step": 130 }, { "epoch": 0.006380517503805175, "grad_norm": 11.059581756591797, "learning_rate": 7.810416889260653e-05, "loss": 1.9192, "step": 131 }, { "epoch": 0.006429223744292238, "grad_norm": 11.430746078491211, "learning_rate": 7.679133974894983e-05, "loss": 1.9024, "step": 132 }, { "epoch": 0.0064779299847793, "grad_norm": 10.843358039855957, "learning_rate": 7.545207078751857e-05, "loss": 1.882, "step": 133 }, { "epoch": 0.006526636225266363, "grad_norm": 12.441071510314941, "learning_rate": 7.408768370508576e-05, "loss": 1.2063, "step": 134 }, { "epoch": 0.006575342465753425, "grad_norm": 12.600017547607422, "learning_rate": 7.269952498697734e-05, "loss": 1.7018, "step": 135 }, { "epoch": 0.006624048706240487, "grad_norm": 10.350911140441895, "learning_rate": 7.128896457825364e-05, "loss": 1.0953, "step": 136 }, { "epoch": 0.006672754946727549, "grad_norm": 11.342569351196289, "learning_rate": 6.985739453173903e-05, "loss": 1.2846, "step": 137 }, { "epoch": 0.006721461187214612, "grad_norm": 9.398454666137695, "learning_rate": 6.840622763423391e-05, "loss": 1.0346, "step": 138 }, { "epoch": 0.006770167427701674, "grad_norm": 8.416460990905762, "learning_rate": 6.693689601226458e-05, "loss": 0.8632, "step": 139 }, { "epoch": 0.006818873668188737, "grad_norm": 9.184191703796387, "learning_rate": 6.545084971874738e-05, "loss": 1.0037, "step": 140 }, { "epoch": 0.006867579908675799, "grad_norm": 11.087357521057129, "learning_rate": 6.394955530196147e-05, "loss": 1.1403, "step": 141 }, { "epoch": 0.0069162861491628615, "grad_norm": 11.673155784606934, "learning_rate": 6.243449435824276e-05, "loss": 1.3567, "step": 142 }, { "epoch": 0.006964992389649924, "grad_norm": 9.134658813476562, "learning_rate": 6.090716206982714e-05, "loss": 0.9909, "step": 143 }, { "epoch": 0.0070136986301369865, "grad_norm": 7.556820869445801, "learning_rate": 5.9369065729286245e-05, "loss": 0.6842, "step": 144 }, { "epoch": 0.007062404870624049, "grad_norm": 8.291131973266602, "learning_rate": 5.782172325201155e-05, "loss": 0.682, "step": 145 }, { "epoch": 0.0071111111111111115, "grad_norm": 7.839075088500977, "learning_rate": 5.6266661678215216e-05, "loss": 0.6249, "step": 146 }, { "epoch": 0.007159817351598174, "grad_norm": 8.085784912109375, "learning_rate": 5.470541566592573e-05, "loss": 0.7631, "step": 147 }, { "epoch": 0.0072085235920852355, "grad_norm": 10.273898124694824, "learning_rate": 5.313952597646568e-05, "loss": 1.0338, "step": 148 }, { "epoch": 0.007257229832572298, "grad_norm": 7.386903285980225, "learning_rate": 5.157053795390642e-05, "loss": 0.4861, "step": 149 }, { "epoch": 0.0073059360730593605, "grad_norm": 14.1038236618042, "learning_rate": 5e-05, "loss": 1.4195, "step": 150 }, { "epoch": 0.007354642313546423, "grad_norm": 6.475687026977539, "learning_rate": 4.8429462046093585e-05, "loss": 2.2988, "step": 151 }, { "epoch": 0.0074033485540334855, "grad_norm": 12.853471755981445, "learning_rate": 4.6860474023534335e-05, "loss": 2.8033, "step": 152 }, { "epoch": 0.007452054794520548, "grad_norm": 9.609049797058105, "learning_rate": 4.529458433407429e-05, "loss": 1.9798, "step": 153 }, { "epoch": 0.00750076103500761, "grad_norm": 6.795541286468506, "learning_rate": 4.373333832178478e-05, "loss": 1.5672, "step": 154 }, { "epoch": 0.007549467275494673, "grad_norm": 6.115726470947266, "learning_rate": 4.2178276747988446e-05, "loss": 1.0559, "step": 155 }, { "epoch": 0.007598173515981735, "grad_norm": 6.5415239334106445, "learning_rate": 4.063093427071376e-05, "loss": 1.0537, "step": 156 }, { "epoch": 0.007646879756468798, "grad_norm": 10.223687171936035, "learning_rate": 3.9092837930172884e-05, "loss": 1.9506, "step": 157 }, { "epoch": 0.00769558599695586, "grad_norm": 11.674057006835938, "learning_rate": 3.756550564175727e-05, "loss": 1.8997, "step": 158 }, { "epoch": 0.007744292237442923, "grad_norm": 7.180853843688965, "learning_rate": 3.605044469803854e-05, "loss": 1.6581, "step": 159 }, { "epoch": 0.007792998477929984, "grad_norm": 9.071992874145508, "learning_rate": 3.4549150281252636e-05, "loss": 1.4177, "step": 160 }, { "epoch": 0.007841704718417048, "grad_norm": 7.320943355560303, "learning_rate": 3.3063103987735433e-05, "loss": 1.5026, "step": 161 }, { "epoch": 0.00789041095890411, "grad_norm": 5.533285617828369, "learning_rate": 3.1593772365766105e-05, "loss": 0.8148, "step": 162 }, { "epoch": 0.007939117199391173, "grad_norm": 7.4456610679626465, "learning_rate": 3.0142605468260978e-05, "loss": 1.2012, "step": 163 }, { "epoch": 0.007987823439878234, "grad_norm": 7.7538862228393555, "learning_rate": 2.8711035421746367e-05, "loss": 2.0698, "step": 164 }, { "epoch": 0.008036529680365296, "grad_norm": 8.773526191711426, "learning_rate": 2.7300475013022663e-05, "loss": 2.6465, "step": 165 }, { "epoch": 0.00808523592085236, "grad_norm": 9.068692207336426, "learning_rate": 2.591231629491423e-05, "loss": 2.0785, "step": 166 }, { "epoch": 0.008133942161339421, "grad_norm": 11.246406555175781, "learning_rate": 2.4547929212481435e-05, "loss": 2.3037, "step": 167 }, { "epoch": 0.008182648401826484, "grad_norm": 10.597480773925781, "learning_rate": 2.3208660251050158e-05, "loss": 2.0094, "step": 168 }, { "epoch": 0.008231354642313546, "grad_norm": 9.89315128326416, "learning_rate": 2.1895831107393484e-05, "loss": 1.8189, "step": 169 }, { "epoch": 0.00828006088280061, "grad_norm": 9.914687156677246, "learning_rate": 2.061073738537635e-05, "loss": 1.7016, "step": 170 }, { "epoch": 0.00832876712328767, "grad_norm": 9.6238431930542, "learning_rate": 1.9354647317351188e-05, "loss": 2.3516, "step": 171 }, { "epoch": 0.008377473363774734, "grad_norm": 8.912046432495117, "learning_rate": 1.8128800512565513e-05, "loss": 2.2398, "step": 172 }, { "epoch": 0.008426179604261796, "grad_norm": 9.197094917297363, "learning_rate": 1.6934406733817414e-05, "loss": 2.3229, "step": 173 }, { "epoch": 0.008474885844748859, "grad_norm": 9.847735404968262, "learning_rate": 1.5772644703565565e-05, "loss": 2.0305, "step": 174 }, { "epoch": 0.00852359208523592, "grad_norm": 10.200972557067871, "learning_rate": 1.4644660940672627e-05, "loss": 1.656, "step": 175 }, { "epoch": 0.008572298325722984, "grad_norm": 10.505123138427734, "learning_rate": 1.3551568628929434e-05, "loss": 2.147, "step": 176 }, { "epoch": 0.008621004566210046, "grad_norm": 8.955830574035645, "learning_rate": 1.2494446518477022e-05, "loss": 1.4854, "step": 177 }, { "epoch": 0.008669710806697107, "grad_norm": 12.1397066116333, "learning_rate": 1.1474337861210543e-05, "loss": 2.39, "step": 178 }, { "epoch": 0.00871841704718417, "grad_norm": 12.608519554138184, "learning_rate": 1.049224938121548e-05, "loss": 2.3129, "step": 179 }, { "epoch": 0.008767123287671232, "grad_norm": 9.86025619506836, "learning_rate": 9.549150281252633e-06, "loss": 1.6276, "step": 180 }, { "epoch": 0.008815829528158296, "grad_norm": 12.893600463867188, "learning_rate": 8.645971286271904e-06, "loss": 2.4241, "step": 181 }, { "epoch": 0.008864535768645357, "grad_norm": 11.86868667602539, "learning_rate": 7.783603724899257e-06, "loss": 2.2967, "step": 182 }, { "epoch": 0.00891324200913242, "grad_norm": 11.338330268859863, "learning_rate": 6.962898649802823e-06, "loss": 1.6438, "step": 183 }, { "epoch": 0.008961948249619482, "grad_norm": 11.070121765136719, "learning_rate": 6.184665997806832e-06, "loss": 1.3556, "step": 184 }, { "epoch": 0.009010654490106546, "grad_norm": 10.048547744750977, "learning_rate": 5.449673790581611e-06, "loss": 1.6426, "step": 185 }, { "epoch": 0.009059360730593607, "grad_norm": 14.009288787841797, "learning_rate": 4.758647376699032e-06, "loss": 1.7888, "step": 186 }, { "epoch": 0.00910806697108067, "grad_norm": 10.863405227661133, "learning_rate": 4.112268715800943e-06, "loss": 1.5613, "step": 187 }, { "epoch": 0.009156773211567732, "grad_norm": 11.333900451660156, "learning_rate": 3.511175705587433e-06, "loss": 1.4206, "step": 188 }, { "epoch": 0.009205479452054794, "grad_norm": 10.498023986816406, "learning_rate": 2.9559615522887273e-06, "loss": 1.8235, "step": 189 }, { "epoch": 0.009254185692541857, "grad_norm": 9.978139877319336, "learning_rate": 2.4471741852423237e-06, "loss": 1.4594, "step": 190 }, { "epoch": 0.009302891933028919, "grad_norm": 10.376583099365234, "learning_rate": 1.985315716152847e-06, "loss": 1.3848, "step": 191 }, { "epoch": 0.009351598173515982, "grad_norm": 9.732892036437988, "learning_rate": 1.5708419435684462e-06, "loss": 1.0441, "step": 192 }, { "epoch": 0.009400304414003044, "grad_norm": 10.896247863769531, "learning_rate": 1.2041619030626284e-06, "loss": 1.2706, "step": 193 }, { "epoch": 0.009449010654490107, "grad_norm": 9.157602310180664, "learning_rate": 8.856374635655695e-07, "loss": 0.7031, "step": 194 }, { "epoch": 0.009497716894977169, "grad_norm": 7.2205047607421875, "learning_rate": 6.15582970243117e-07, "loss": 0.4888, "step": 195 }, { "epoch": 0.009546423135464232, "grad_norm": 6.679050922393799, "learning_rate": 3.9426493427611177e-07, "loss": 0.4518, "step": 196 }, { "epoch": 0.009595129375951294, "grad_norm": 8.90599250793457, "learning_rate": 2.219017698460002e-07, "loss": 0.8044, "step": 197 }, { "epoch": 0.009643835616438357, "grad_norm": 11.641607284545898, "learning_rate": 9.866357858642205e-08, "loss": 1.0654, "step": 198 }, { "epoch": 0.009692541856925418, "grad_norm": 10.517057418823242, "learning_rate": 2.467198171342e-08, "loss": 0.7411, "step": 199 }, { "epoch": 0.009741248097412482, "grad_norm": 12.261541366577148, "learning_rate": 0.0, "loss": 1.2423, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5236987211022336.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }