|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.009741248097412482, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 4.8706240487062404e-05, |
|
"grad_norm": 5.851158142089844, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 2.1245, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 9.741248097412481e-05, |
|
"grad_norm": 18.33625602722168, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 3.869, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00014611872146118722, |
|
"grad_norm": 15.931517601013184, |
|
"learning_rate": 3e-06, |
|
"loss": 3.1648, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00019482496194824962, |
|
"grad_norm": 26.00968360900879, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.3093, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.000243531202435312, |
|
"grad_norm": 27.92300033569336, |
|
"learning_rate": 5e-06, |
|
"loss": 2.221, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00029223744292237444, |
|
"grad_norm": 9.341499328613281, |
|
"learning_rate": 6e-06, |
|
"loss": 2.2322, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0003409436834094368, |
|
"grad_norm": 13.460984230041504, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 2.5544, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00038964992389649923, |
|
"grad_norm": 12.86645793914795, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.537, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00043835616438356166, |
|
"grad_norm": 15.244658470153809, |
|
"learning_rate": 9e-06, |
|
"loss": 2.7317, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.000487062404870624, |
|
"grad_norm": 11.827373504638672, |
|
"learning_rate": 1e-05, |
|
"loss": 2.2533, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005357686453576865, |
|
"grad_norm": 10.726211547851562, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 2.2206, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0005844748858447489, |
|
"grad_norm": 13.430566787719727, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.4726, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0006331811263318112, |
|
"grad_norm": 15.749054908752441, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 2.3328, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0006818873668188736, |
|
"grad_norm": 13.098236083984375, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 2.3669, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0007305936073059361, |
|
"grad_norm": 14.811712265014648, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.3254, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0007792998477929985, |
|
"grad_norm": 13.960674285888672, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.3478, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0008280060882800608, |
|
"grad_norm": 13.510299682617188, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 2.3439, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0008767123287671233, |
|
"grad_norm": 21.18514060974121, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.6721, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0009254185692541857, |
|
"grad_norm": 17.1453800201416, |
|
"learning_rate": 1.9e-05, |
|
"loss": 2.1923, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.000974124809741248, |
|
"grad_norm": 15.410392761230469, |
|
"learning_rate": 2e-05, |
|
"loss": 2.4277, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0010228310502283105, |
|
"grad_norm": 17.85428810119629, |
|
"learning_rate": 2.1e-05, |
|
"loss": 2.1419, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.001071537290715373, |
|
"grad_norm": 16.322954177856445, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 2.4326, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0011202435312024353, |
|
"grad_norm": 18.91599464416504, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 2.1728, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0011689497716894977, |
|
"grad_norm": 14.737173080444336, |
|
"learning_rate": 2.4e-05, |
|
"loss": 2.2486, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0012176560121765602, |
|
"grad_norm": 14.996482849121094, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.9959, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0012663622526636225, |
|
"grad_norm": 12.736573219299316, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.8652, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.001315068493150685, |
|
"grad_norm": 13.891550064086914, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 2.3333, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0013637747336377472, |
|
"grad_norm": 11.90274429321289, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.8707, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0014124809741248097, |
|
"grad_norm": 16.16830062866211, |
|
"learning_rate": 2.9e-05, |
|
"loss": 1.9199, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0014611872146118722, |
|
"grad_norm": 12.124021530151367, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8475, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0015098934550989344, |
|
"grad_norm": 13.702016830444336, |
|
"learning_rate": 3.1e-05, |
|
"loss": 1.6096, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.001558599695585997, |
|
"grad_norm": 16.256940841674805, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 2.0673, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0016073059360730594, |
|
"grad_norm": 11.912320137023926, |
|
"learning_rate": 3.3e-05, |
|
"loss": 1.5261, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0016560121765601217, |
|
"grad_norm": 12.158217430114746, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.566, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0017047184170471841, |
|
"grad_norm": 11.484949111938477, |
|
"learning_rate": 3.5e-05, |
|
"loss": 2.0858, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0017534246575342466, |
|
"grad_norm": 12.884050369262695, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.0508, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0018021308980213089, |
|
"grad_norm": 14.054976463317871, |
|
"learning_rate": 3.7e-05, |
|
"loss": 1.2539, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0018508371385083714, |
|
"grad_norm": 13.9093599319458, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.5044, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0018995433789954338, |
|
"grad_norm": 42.75831604003906, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 1.2133, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.001948249619482496, |
|
"grad_norm": 13.14990234375, |
|
"learning_rate": 4e-05, |
|
"loss": 1.4692, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0019969558599695586, |
|
"grad_norm": 11.442179679870605, |
|
"learning_rate": 4.1e-05, |
|
"loss": 1.1273, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.002045662100456621, |
|
"grad_norm": 9.139272689819336, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.6377, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0020943683409436835, |
|
"grad_norm": 9.682751655578613, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.8092, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.002143074581430746, |
|
"grad_norm": 10.700410842895508, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.9342, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.002191780821917808, |
|
"grad_norm": 8.799379348754883, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.6458, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0022404870624048705, |
|
"grad_norm": 7.537528038024902, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.3726, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.002289193302891933, |
|
"grad_norm": 9.551314353942871, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.6441, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0023378995433789955, |
|
"grad_norm": 13.743586540222168, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.0114, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.002386605783866058, |
|
"grad_norm": 9.87704849243164, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.5438, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0024353120243531205, |
|
"grad_norm": 20.906862258911133, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8715, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0024840182648401825, |
|
"grad_norm": 8.490747451782227, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 2.5756, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.002532724505327245, |
|
"grad_norm": 12.285213470458984, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 2.3924, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0025814307458143075, |
|
"grad_norm": 15.3045015335083, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 1.8956, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.00263013698630137, |
|
"grad_norm": 15.969414710998535, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.8218, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0026788432267884324, |
|
"grad_norm": 13.07938003540039, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 1.8747, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0027275494672754945, |
|
"grad_norm": 9.398449897766113, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.8304, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.002776255707762557, |
|
"grad_norm": 7.938607215881348, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 1.9411, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0028249619482496194, |
|
"grad_norm": 9.889793395996094, |
|
"learning_rate": 5.8e-05, |
|
"loss": 2.0788, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.002873668188736682, |
|
"grad_norm": 7.4011101722717285, |
|
"learning_rate": 5.9e-05, |
|
"loss": 1.954, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0029223744292237444, |
|
"grad_norm": 7.340896129608154, |
|
"learning_rate": 6e-05, |
|
"loss": 1.9234, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.002971080669710807, |
|
"grad_norm": 13.956856727600098, |
|
"learning_rate": 6.1e-05, |
|
"loss": 2.3137, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.003019786910197869, |
|
"grad_norm": 12.846822738647461, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.5154, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0030684931506849314, |
|
"grad_norm": 10.943364143371582, |
|
"learning_rate": 6.3e-05, |
|
"loss": 1.9186, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.003117199391171994, |
|
"grad_norm": 9.687166213989258, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.5523, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0031659056316590563, |
|
"grad_norm": 9.057082176208496, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.9528, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.003214611872146119, |
|
"grad_norm": 12.989787101745605, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.9981, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0032633181126331813, |
|
"grad_norm": 12.194509506225586, |
|
"learning_rate": 6.7e-05, |
|
"loss": 2.0458, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0033120243531202433, |
|
"grad_norm": 14.83133316040039, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 2.1763, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.003360730593607306, |
|
"grad_norm": 12.523411750793457, |
|
"learning_rate": 6.9e-05, |
|
"loss": 2.1116, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0034094368340943683, |
|
"grad_norm": 10.275344848632812, |
|
"learning_rate": 7e-05, |
|
"loss": 2.2137, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0034581430745814308, |
|
"grad_norm": 11.111023902893066, |
|
"learning_rate": 7.1e-05, |
|
"loss": 2.3323, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0035068493150684932, |
|
"grad_norm": 11.215889930725098, |
|
"learning_rate": 7.2e-05, |
|
"loss": 2.2838, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0035555555555555557, |
|
"grad_norm": 11.466020584106445, |
|
"learning_rate": 7.3e-05, |
|
"loss": 2.3188, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0036042617960426178, |
|
"grad_norm": 12.254678726196289, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.899, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0036529680365296802, |
|
"grad_norm": 12.059733390808105, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.8779, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0037016742770167427, |
|
"grad_norm": 14.260489463806152, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.941, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.003750380517503805, |
|
"grad_norm": 11.856407165527344, |
|
"learning_rate": 7.7e-05, |
|
"loss": 1.8031, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0037990867579908677, |
|
"grad_norm": 13.23192024230957, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.8466, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.00384779299847793, |
|
"grad_norm": 14.03378677368164, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 2.0481, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.003896499238964992, |
|
"grad_norm": 12.832358360290527, |
|
"learning_rate": 8e-05, |
|
"loss": 2.223, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.003945205479452055, |
|
"grad_norm": 11.655765533447266, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.6975, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.003993911719939117, |
|
"grad_norm": 13.00943660736084, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.6346, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.00404261796042618, |
|
"grad_norm": 13.812478065490723, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.818, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.004091324200913242, |
|
"grad_norm": 10.820585250854492, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.4317, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.004140030441400305, |
|
"grad_norm": 11.540773391723633, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.6072, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.004188736681887367, |
|
"grad_norm": 12.939353942871094, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.5664, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.0042374429223744296, |
|
"grad_norm": 13.942463874816895, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.6459, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.004286149162861492, |
|
"grad_norm": 10.26823902130127, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.2702, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.004334855403348554, |
|
"grad_norm": 11.739928245544434, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 1.4725, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.004383561643835616, |
|
"grad_norm": 9.966146469116211, |
|
"learning_rate": 9e-05, |
|
"loss": 0.7986, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004432267884322679, |
|
"grad_norm": 12.615833282470703, |
|
"learning_rate": 9.1e-05, |
|
"loss": 1.2814, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.004480974124809741, |
|
"grad_norm": 10.074495315551758, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.6983, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0045296803652968036, |
|
"grad_norm": 14.656659126281738, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.8942, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.004578386605783866, |
|
"grad_norm": 8.69896411895752, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.6231, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.0046270928462709285, |
|
"grad_norm": 9.49130630493164, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.5908, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.004675799086757991, |
|
"grad_norm": 10.93470287322998, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.9713, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0047245053272450535, |
|
"grad_norm": 14.361600875854492, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.1417, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.004773211567732116, |
|
"grad_norm": 10.079813003540039, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.8555, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.004821917808219178, |
|
"grad_norm": 10.335041999816895, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 0.7722, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.004870624048706241, |
|
"grad_norm": 16.433164596557617, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3996, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0049193302891933025, |
|
"grad_norm": 10.94658374786377, |
|
"learning_rate": 9.997532801828658e-05, |
|
"loss": 2.4347, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.004968036529680365, |
|
"grad_norm": 17.193456649780273, |
|
"learning_rate": 9.990133642141359e-05, |
|
"loss": 3.2929, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.0050167427701674275, |
|
"grad_norm": 12.511935234069824, |
|
"learning_rate": 9.977809823015401e-05, |
|
"loss": 1.2897, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.00506544901065449, |
|
"grad_norm": 10.709287643432617, |
|
"learning_rate": 9.96057350657239e-05, |
|
"loss": 1.7389, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.0051141552511415524, |
|
"grad_norm": 12.061141014099121, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 1.0578, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.005162861491628615, |
|
"grad_norm": 8.664139747619629, |
|
"learning_rate": 9.911436253643445e-05, |
|
"loss": 1.5287, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.005211567732115677, |
|
"grad_norm": 8.517009735107422, |
|
"learning_rate": 9.879583809693738e-05, |
|
"loss": 1.5803, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.00526027397260274, |
|
"grad_norm": 7.50566291809082, |
|
"learning_rate": 9.842915805643155e-05, |
|
"loss": 1.6515, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.005308980213089802, |
|
"grad_norm": 7.030799388885498, |
|
"learning_rate": 9.801468428384716e-05, |
|
"loss": 1.6422, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.005357686453576865, |
|
"grad_norm": 8.954855918884277, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 2.0945, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.005406392694063927, |
|
"grad_norm": 11.065245628356934, |
|
"learning_rate": 9.704403844771128e-05, |
|
"loss": 1.7066, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.005455098934550989, |
|
"grad_norm": 8.86803150177002, |
|
"learning_rate": 9.648882429441257e-05, |
|
"loss": 1.2714, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.005503805175038051, |
|
"grad_norm": 8.038043022155762, |
|
"learning_rate": 9.588773128419906e-05, |
|
"loss": 1.3595, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.005552511415525114, |
|
"grad_norm": 7.1317877769470215, |
|
"learning_rate": 9.524135262330098e-05, |
|
"loss": 1.012, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.005601217656012176, |
|
"grad_norm": 9.007568359375, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 1.7679, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.005649923896499239, |
|
"grad_norm": 9.552806854248047, |
|
"learning_rate": 9.381533400219318e-05, |
|
"loss": 1.6309, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.005698630136986301, |
|
"grad_norm": 13.193597793579102, |
|
"learning_rate": 9.30371013501972e-05, |
|
"loss": 1.8146, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.005747336377473364, |
|
"grad_norm": 10.52649974822998, |
|
"learning_rate": 9.221639627510076e-05, |
|
"loss": 2.1017, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.005796042617960426, |
|
"grad_norm": 13.295526504516602, |
|
"learning_rate": 9.135402871372808e-05, |
|
"loss": 1.8865, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.005844748858447489, |
|
"grad_norm": 11.041900634765625, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 1.8419, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.005893455098934551, |
|
"grad_norm": 12.388705253601074, |
|
"learning_rate": 8.950775061878453e-05, |
|
"loss": 2.1317, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.005942161339421614, |
|
"grad_norm": 10.880697250366211, |
|
"learning_rate": 8.852566213878947e-05, |
|
"loss": 2.2668, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.005990867579908676, |
|
"grad_norm": 9.947246551513672, |
|
"learning_rate": 8.750555348152298e-05, |
|
"loss": 2.0057, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.006039573820395738, |
|
"grad_norm": 11.419881820678711, |
|
"learning_rate": 8.644843137107059e-05, |
|
"loss": 2.2503, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.0060882800608828, |
|
"grad_norm": 17.00235939025879, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 1.7933, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.006136986301369863, |
|
"grad_norm": 12.406261444091797, |
|
"learning_rate": 8.422735529643444e-05, |
|
"loss": 2.0982, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.006185692541856925, |
|
"grad_norm": 11.130414009094238, |
|
"learning_rate": 8.306559326618259e-05, |
|
"loss": 2.1638, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.006234398782343988, |
|
"grad_norm": 12.033727645874023, |
|
"learning_rate": 8.18711994874345e-05, |
|
"loss": 2.3324, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.00628310502283105, |
|
"grad_norm": 10.567495346069336, |
|
"learning_rate": 8.064535268264883e-05, |
|
"loss": 2.0464, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.006331811263318113, |
|
"grad_norm": 11.884528160095215, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 1.7445, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.006380517503805175, |
|
"grad_norm": 11.059581756591797, |
|
"learning_rate": 7.810416889260653e-05, |
|
"loss": 1.9192, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.006429223744292238, |
|
"grad_norm": 11.430746078491211, |
|
"learning_rate": 7.679133974894983e-05, |
|
"loss": 1.9024, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0064779299847793, |
|
"grad_norm": 10.843358039855957, |
|
"learning_rate": 7.545207078751857e-05, |
|
"loss": 1.882, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.006526636225266363, |
|
"grad_norm": 12.441071510314941, |
|
"learning_rate": 7.408768370508576e-05, |
|
"loss": 1.2063, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.006575342465753425, |
|
"grad_norm": 12.600017547607422, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 1.7018, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.006624048706240487, |
|
"grad_norm": 10.350911140441895, |
|
"learning_rate": 7.128896457825364e-05, |
|
"loss": 1.0953, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.006672754946727549, |
|
"grad_norm": 11.342569351196289, |
|
"learning_rate": 6.985739453173903e-05, |
|
"loss": 1.2846, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.006721461187214612, |
|
"grad_norm": 9.398454666137695, |
|
"learning_rate": 6.840622763423391e-05, |
|
"loss": 1.0346, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.006770167427701674, |
|
"grad_norm": 8.416460990905762, |
|
"learning_rate": 6.693689601226458e-05, |
|
"loss": 0.8632, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.006818873668188737, |
|
"grad_norm": 9.184191703796387, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 1.0037, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.006867579908675799, |
|
"grad_norm": 11.087357521057129, |
|
"learning_rate": 6.394955530196147e-05, |
|
"loss": 1.1403, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.0069162861491628615, |
|
"grad_norm": 11.673155784606934, |
|
"learning_rate": 6.243449435824276e-05, |
|
"loss": 1.3567, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.006964992389649924, |
|
"grad_norm": 9.134658813476562, |
|
"learning_rate": 6.090716206982714e-05, |
|
"loss": 0.9909, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.0070136986301369865, |
|
"grad_norm": 7.556820869445801, |
|
"learning_rate": 5.9369065729286245e-05, |
|
"loss": 0.6842, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.007062404870624049, |
|
"grad_norm": 8.291131973266602, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 0.682, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0071111111111111115, |
|
"grad_norm": 7.839075088500977, |
|
"learning_rate": 5.6266661678215216e-05, |
|
"loss": 0.6249, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.007159817351598174, |
|
"grad_norm": 8.085784912109375, |
|
"learning_rate": 5.470541566592573e-05, |
|
"loss": 0.7631, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.0072085235920852355, |
|
"grad_norm": 10.273898124694824, |
|
"learning_rate": 5.313952597646568e-05, |
|
"loss": 1.0338, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.007257229832572298, |
|
"grad_norm": 7.386903285980225, |
|
"learning_rate": 5.157053795390642e-05, |
|
"loss": 0.4861, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0073059360730593605, |
|
"grad_norm": 14.1038236618042, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4195, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.007354642313546423, |
|
"grad_norm": 6.475687026977539, |
|
"learning_rate": 4.8429462046093585e-05, |
|
"loss": 2.2988, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.0074033485540334855, |
|
"grad_norm": 12.853471755981445, |
|
"learning_rate": 4.6860474023534335e-05, |
|
"loss": 2.8033, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.007452054794520548, |
|
"grad_norm": 9.609049797058105, |
|
"learning_rate": 4.529458433407429e-05, |
|
"loss": 1.9798, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.00750076103500761, |
|
"grad_norm": 6.795541286468506, |
|
"learning_rate": 4.373333832178478e-05, |
|
"loss": 1.5672, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.007549467275494673, |
|
"grad_norm": 6.115726470947266, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 1.0559, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.007598173515981735, |
|
"grad_norm": 6.5415239334106445, |
|
"learning_rate": 4.063093427071376e-05, |
|
"loss": 1.0537, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.007646879756468798, |
|
"grad_norm": 10.223687171936035, |
|
"learning_rate": 3.9092837930172884e-05, |
|
"loss": 1.9506, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.00769558599695586, |
|
"grad_norm": 11.674057006835938, |
|
"learning_rate": 3.756550564175727e-05, |
|
"loss": 1.8997, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.007744292237442923, |
|
"grad_norm": 7.180853843688965, |
|
"learning_rate": 3.605044469803854e-05, |
|
"loss": 1.6581, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.007792998477929984, |
|
"grad_norm": 9.071992874145508, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 1.4177, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.007841704718417048, |
|
"grad_norm": 7.320943355560303, |
|
"learning_rate": 3.3063103987735433e-05, |
|
"loss": 1.5026, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.00789041095890411, |
|
"grad_norm": 5.533285617828369, |
|
"learning_rate": 3.1593772365766105e-05, |
|
"loss": 0.8148, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.007939117199391173, |
|
"grad_norm": 7.4456610679626465, |
|
"learning_rate": 3.0142605468260978e-05, |
|
"loss": 1.2012, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.007987823439878234, |
|
"grad_norm": 7.7538862228393555, |
|
"learning_rate": 2.8711035421746367e-05, |
|
"loss": 2.0698, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.008036529680365296, |
|
"grad_norm": 8.773526191711426, |
|
"learning_rate": 2.7300475013022663e-05, |
|
"loss": 2.6465, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.00808523592085236, |
|
"grad_norm": 9.068692207336426, |
|
"learning_rate": 2.591231629491423e-05, |
|
"loss": 2.0785, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.008133942161339421, |
|
"grad_norm": 11.246406555175781, |
|
"learning_rate": 2.4547929212481435e-05, |
|
"loss": 2.3037, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.008182648401826484, |
|
"grad_norm": 10.597480773925781, |
|
"learning_rate": 2.3208660251050158e-05, |
|
"loss": 2.0094, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.008231354642313546, |
|
"grad_norm": 9.89315128326416, |
|
"learning_rate": 2.1895831107393484e-05, |
|
"loss": 1.8189, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.00828006088280061, |
|
"grad_norm": 9.914687156677246, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 1.7016, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.00832876712328767, |
|
"grad_norm": 9.6238431930542, |
|
"learning_rate": 1.9354647317351188e-05, |
|
"loss": 2.3516, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.008377473363774734, |
|
"grad_norm": 8.912046432495117, |
|
"learning_rate": 1.8128800512565513e-05, |
|
"loss": 2.2398, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.008426179604261796, |
|
"grad_norm": 9.197094917297363, |
|
"learning_rate": 1.6934406733817414e-05, |
|
"loss": 2.3229, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.008474885844748859, |
|
"grad_norm": 9.847735404968262, |
|
"learning_rate": 1.5772644703565565e-05, |
|
"loss": 2.0305, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.00852359208523592, |
|
"grad_norm": 10.200972557067871, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 1.656, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.008572298325722984, |
|
"grad_norm": 10.505123138427734, |
|
"learning_rate": 1.3551568628929434e-05, |
|
"loss": 2.147, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.008621004566210046, |
|
"grad_norm": 8.955830574035645, |
|
"learning_rate": 1.2494446518477022e-05, |
|
"loss": 1.4854, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.008669710806697107, |
|
"grad_norm": 12.1397066116333, |
|
"learning_rate": 1.1474337861210543e-05, |
|
"loss": 2.39, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.00871841704718417, |
|
"grad_norm": 12.608519554138184, |
|
"learning_rate": 1.049224938121548e-05, |
|
"loss": 2.3129, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.008767123287671232, |
|
"grad_norm": 9.86025619506836, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 1.6276, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.008815829528158296, |
|
"grad_norm": 12.893600463867188, |
|
"learning_rate": 8.645971286271904e-06, |
|
"loss": 2.4241, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.008864535768645357, |
|
"grad_norm": 11.86868667602539, |
|
"learning_rate": 7.783603724899257e-06, |
|
"loss": 2.2967, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.00891324200913242, |
|
"grad_norm": 11.338330268859863, |
|
"learning_rate": 6.962898649802823e-06, |
|
"loss": 1.6438, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.008961948249619482, |
|
"grad_norm": 11.070121765136719, |
|
"learning_rate": 6.184665997806832e-06, |
|
"loss": 1.3556, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.009010654490106546, |
|
"grad_norm": 10.048547744750977, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 1.6426, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.009059360730593607, |
|
"grad_norm": 14.009288787841797, |
|
"learning_rate": 4.758647376699032e-06, |
|
"loss": 1.7888, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.00910806697108067, |
|
"grad_norm": 10.863405227661133, |
|
"learning_rate": 4.112268715800943e-06, |
|
"loss": 1.5613, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.009156773211567732, |
|
"grad_norm": 11.333900451660156, |
|
"learning_rate": 3.511175705587433e-06, |
|
"loss": 1.4206, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.009205479452054794, |
|
"grad_norm": 10.498023986816406, |
|
"learning_rate": 2.9559615522887273e-06, |
|
"loss": 1.8235, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.009254185692541857, |
|
"grad_norm": 9.978139877319336, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 1.4594, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.009302891933028919, |
|
"grad_norm": 10.376583099365234, |
|
"learning_rate": 1.985315716152847e-06, |
|
"loss": 1.3848, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.009351598173515982, |
|
"grad_norm": 9.732892036437988, |
|
"learning_rate": 1.5708419435684462e-06, |
|
"loss": 1.0441, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.009400304414003044, |
|
"grad_norm": 10.896247863769531, |
|
"learning_rate": 1.2041619030626284e-06, |
|
"loss": 1.2706, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.009449010654490107, |
|
"grad_norm": 9.157602310180664, |
|
"learning_rate": 8.856374635655695e-07, |
|
"loss": 0.7031, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.009497716894977169, |
|
"grad_norm": 7.2205047607421875, |
|
"learning_rate": 6.15582970243117e-07, |
|
"loss": 0.4888, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.009546423135464232, |
|
"grad_norm": 6.679050922393799, |
|
"learning_rate": 3.9426493427611177e-07, |
|
"loss": 0.4518, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.009595129375951294, |
|
"grad_norm": 8.90599250793457, |
|
"learning_rate": 2.219017698460002e-07, |
|
"loss": 0.8044, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.009643835616438357, |
|
"grad_norm": 11.641607284545898, |
|
"learning_rate": 9.866357858642205e-08, |
|
"loss": 1.0654, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.009692541856925418, |
|
"grad_norm": 10.517057418823242, |
|
"learning_rate": 2.467198171342e-08, |
|
"loss": 0.7411, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.009741248097412482, |
|
"grad_norm": 12.261541366577148, |
|
"learning_rate": 0.0, |
|
"loss": 1.2423, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 239, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5236987211022336.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|