|
{ |
|
"best_metric": 0.04876178875565529, |
|
"best_model_checkpoint": "/content/train/Qwen2-VL-7B-Instruct-unsloth-r4-rslora-bf16-tuned/checkpoint-220", |
|
"epoch": 1.6629001883239172, |
|
"eval_steps": 10, |
|
"global_step": 220, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007532956685499058, |
|
"grad_norm": 0.29860490560531616, |
|
"learning_rate": 1e-05, |
|
"loss": 1.462, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015065913370998116, |
|
"grad_norm": 0.5057499408721924, |
|
"learning_rate": 2e-05, |
|
"loss": 1.5835, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.022598870056497175, |
|
"grad_norm": 0.32325854897499084, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4388, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.030131826741996232, |
|
"grad_norm": 0.357746422290802, |
|
"learning_rate": 4e-05, |
|
"loss": 1.5551, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03766478342749529, |
|
"grad_norm": 0.6203590035438538, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6868, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04519774011299435, |
|
"grad_norm": 0.3148535192012787, |
|
"learning_rate": 6e-05, |
|
"loss": 1.2584, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05273069679849341, |
|
"grad_norm": 0.8858975768089294, |
|
"learning_rate": 7e-05, |
|
"loss": 1.4839, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.060263653483992465, |
|
"grad_norm": 0.41307175159454346, |
|
"learning_rate": 8e-05, |
|
"loss": 1.465, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06779661016949153, |
|
"grad_norm": 0.711910605430603, |
|
"learning_rate": 9e-05, |
|
"loss": 1.4146, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07532956685499058, |
|
"grad_norm": 0.6907219886779785, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5244, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07532956685499058, |
|
"eval_loss": 1.4021275043487549, |
|
"eval_runtime": 60.9671, |
|
"eval_samples_per_second": 1.23, |
|
"eval_steps_per_second": 0.623, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08286252354048965, |
|
"grad_norm": 0.6071083545684814, |
|
"learning_rate": 9.999834399079165e-05, |
|
"loss": 1.5861, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0903954802259887, |
|
"grad_norm": 0.4335213601589203, |
|
"learning_rate": 9.99933760728612e-05, |
|
"loss": 1.3366, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09792843691148775, |
|
"grad_norm": 0.43898504972457886, |
|
"learning_rate": 9.99850965752854e-05, |
|
"loss": 1.2906, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.10546139359698682, |
|
"grad_norm": 0.400079607963562, |
|
"learning_rate": 9.997350604650123e-05, |
|
"loss": 1.2677, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.11299435028248588, |
|
"grad_norm": 0.41405242681503296, |
|
"learning_rate": 9.995860525426954e-05, |
|
"loss": 1.285, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12052730696798493, |
|
"grad_norm": 0.7896008491516113, |
|
"learning_rate": 9.994039518562432e-05, |
|
"loss": 1.1366, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.128060263653484, |
|
"grad_norm": 0.45816895365715027, |
|
"learning_rate": 9.991887704680724e-05, |
|
"loss": 1.2656, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.13559322033898305, |
|
"grad_norm": 0.5465638637542725, |
|
"learning_rate": 9.989405226318772e-05, |
|
"loss": 1.362, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1431261770244821, |
|
"grad_norm": 0.46899810433387756, |
|
"learning_rate": 9.986592247916858e-05, |
|
"loss": 1.1479, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.15065913370998116, |
|
"grad_norm": 0.5480923056602478, |
|
"learning_rate": 9.983448955807708e-05, |
|
"loss": 1.1834, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15065913370998116, |
|
"eval_loss": 1.1339441537857056, |
|
"eval_runtime": 50.1262, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15819209039548024, |
|
"grad_norm": 0.6135687232017517, |
|
"learning_rate": 9.979975558204147e-05, |
|
"loss": 1.1895, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1657250470809793, |
|
"grad_norm": 0.6562265753746033, |
|
"learning_rate": 9.976172285185314e-05, |
|
"loss": 1.1213, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.17325800376647835, |
|
"grad_norm": 0.7003594040870667, |
|
"learning_rate": 9.972039388681413e-05, |
|
"loss": 1.1117, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1807909604519774, |
|
"grad_norm": 0.6009169220924377, |
|
"learning_rate": 9.967577142457032e-05, |
|
"loss": 1.0146, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.18832391713747645, |
|
"grad_norm": 0.9375618696212769, |
|
"learning_rate": 9.962785842093003e-05, |
|
"loss": 1.0485, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1958568738229755, |
|
"grad_norm": 0.7120280861854553, |
|
"learning_rate": 9.957665804966829e-05, |
|
"loss": 0.8638, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2033898305084746, |
|
"grad_norm": 0.9676161408424377, |
|
"learning_rate": 9.952217370231653e-05, |
|
"loss": 0.9569, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.21092278719397364, |
|
"grad_norm": 0.8478333353996277, |
|
"learning_rate": 9.946440898793801e-05, |
|
"loss": 0.8989, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2184557438794727, |
|
"grad_norm": 0.9060878753662109, |
|
"learning_rate": 9.940336773288865e-05, |
|
"loss": 0.8083, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.22598870056497175, |
|
"grad_norm": 0.9778569936752319, |
|
"learning_rate": 9.933905398056372e-05, |
|
"loss": 0.7268, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.22598870056497175, |
|
"eval_loss": 0.7190647125244141, |
|
"eval_runtime": 50.1226, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2335216572504708, |
|
"grad_norm": 0.9858911633491516, |
|
"learning_rate": 9.92714719911298e-05, |
|
"loss": 0.6885, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.24105461393596986, |
|
"grad_norm": 1.0453399419784546, |
|
"learning_rate": 9.920062624124282e-05, |
|
"loss": 0.6971, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.24858757062146894, |
|
"grad_norm": 0.9439508318901062, |
|
"learning_rate": 9.912652142375132e-05, |
|
"loss": 0.7523, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.256120527306968, |
|
"grad_norm": 0.8011656999588013, |
|
"learning_rate": 9.904916244738571e-05, |
|
"loss": 0.6094, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.263653483992467, |
|
"grad_norm": 0.8053231835365295, |
|
"learning_rate": 9.896855443643308e-05, |
|
"loss": 0.5719, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2711864406779661, |
|
"grad_norm": 0.7347720265388489, |
|
"learning_rate": 9.888470273039775e-05, |
|
"loss": 0.5913, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2787193973634652, |
|
"grad_norm": 0.8015826344490051, |
|
"learning_rate": 9.879761288364766e-05, |
|
"loss": 0.5636, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2862523540489642, |
|
"grad_norm": 0.4674820303916931, |
|
"learning_rate": 9.870729066504629e-05, |
|
"loss": 0.4962, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2937853107344633, |
|
"grad_norm": 0.3237204849720001, |
|
"learning_rate": 9.861374205757068e-05, |
|
"loss": 0.5201, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.3013182674199623, |
|
"grad_norm": 0.3314957320690155, |
|
"learning_rate": 9.851697325791505e-05, |
|
"loss": 0.5162, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3013182674199623, |
|
"eval_loss": 0.4833310544490814, |
|
"eval_runtime": 50.1791, |
|
"eval_samples_per_second": 1.495, |
|
"eval_steps_per_second": 0.757, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3088512241054614, |
|
"grad_norm": 0.44042736291885376, |
|
"learning_rate": 9.841699067608033e-05, |
|
"loss": 0.4832, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.3163841807909605, |
|
"grad_norm": 0.5744893550872803, |
|
"learning_rate": 9.831380093494957e-05, |
|
"loss": 0.4676, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3239171374764595, |
|
"grad_norm": 0.6231745481491089, |
|
"learning_rate": 9.820741086984924e-05, |
|
"loss": 0.4778, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3314500941619586, |
|
"grad_norm": 0.652641773223877, |
|
"learning_rate": 9.809782752809644e-05, |
|
"loss": 0.4618, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.3389830508474576, |
|
"grad_norm": 0.34044504165649414, |
|
"learning_rate": 9.798505816853208e-05, |
|
"loss": 0.5053, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3465160075329567, |
|
"grad_norm": 0.3873239755630493, |
|
"learning_rate": 9.786911026104007e-05, |
|
"loss": 0.4319, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3540489642184557, |
|
"grad_norm": 0.34049129486083984, |
|
"learning_rate": 9.774999148605251e-05, |
|
"loss": 0.4713, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3615819209039548, |
|
"grad_norm": 0.39110735058784485, |
|
"learning_rate": 9.762770973404094e-05, |
|
"loss": 0.4605, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3691148775894539, |
|
"grad_norm": 0.3248206675052643, |
|
"learning_rate": 9.750227310499366e-05, |
|
"loss": 0.4259, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3766478342749529, |
|
"grad_norm": 0.624284029006958, |
|
"learning_rate": 9.737368990787916e-05, |
|
"loss": 0.3921, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3766478342749529, |
|
"eval_loss": 0.42401236295700073, |
|
"eval_runtime": 50.1482, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.384180790960452, |
|
"grad_norm": 0.3228764533996582, |
|
"learning_rate": 9.72419686600958e-05, |
|
"loss": 0.4269, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.391713747645951, |
|
"grad_norm": 0.2958717346191406, |
|
"learning_rate": 9.710711808690754e-05, |
|
"loss": 0.4804, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3992467043314501, |
|
"grad_norm": 0.5469731092453003, |
|
"learning_rate": 9.696914712086603e-05, |
|
"loss": 0.4463, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.4067796610169492, |
|
"grad_norm": 0.35308223962783813, |
|
"learning_rate": 9.682806490121885e-05, |
|
"loss": 0.3708, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.4143126177024482, |
|
"grad_norm": 0.4369649589061737, |
|
"learning_rate": 9.668388077330421e-05, |
|
"loss": 0.4226, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.4218455743879473, |
|
"grad_norm": 0.7406517863273621, |
|
"learning_rate": 9.653660428793188e-05, |
|
"loss": 0.4082, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.4293785310734463, |
|
"grad_norm": 0.4191276729106903, |
|
"learning_rate": 9.638624520075046e-05, |
|
"loss": 0.3736, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.4369114877589454, |
|
"grad_norm": 0.3738877475261688, |
|
"learning_rate": 9.623281347160127e-05, |
|
"loss": 0.3987, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.3371034562587738, |
|
"learning_rate": 9.607631926385859e-05, |
|
"loss": 0.4276, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.4519774011299435, |
|
"grad_norm": 0.3625945746898651, |
|
"learning_rate": 9.591677294375636e-05, |
|
"loss": 0.3952, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4519774011299435, |
|
"eval_loss": 0.3758457601070404, |
|
"eval_runtime": 50.0723, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4595103578154426, |
|
"grad_norm": 0.4314848780632019, |
|
"learning_rate": 9.575418507970161e-05, |
|
"loss": 0.4264, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4670433145009416, |
|
"grad_norm": 0.3458515703678131, |
|
"learning_rate": 9.558856644157432e-05, |
|
"loss": 0.3845, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4745762711864407, |
|
"grad_norm": 0.30270910263061523, |
|
"learning_rate": 9.541992800001409e-05, |
|
"loss": 0.3984, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4821092278719397, |
|
"grad_norm": 0.3931046426296234, |
|
"learning_rate": 9.52482809256934e-05, |
|
"loss": 0.4019, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.4896421845574388, |
|
"grad_norm": 0.49052339792251587, |
|
"learning_rate": 9.507363658857768e-05, |
|
"loss": 0.4353, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4971751412429379, |
|
"grad_norm": 0.3206973075866699, |
|
"learning_rate": 9.489600655717217e-05, |
|
"loss": 0.4599, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.504708097928437, |
|
"grad_norm": 0.36185964941978455, |
|
"learning_rate": 9.471540259775554e-05, |
|
"loss": 0.3684, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.512241054613936, |
|
"grad_norm": 0.4438144266605377, |
|
"learning_rate": 9.453183667360062e-05, |
|
"loss": 0.3924, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.519774011299435, |
|
"grad_norm": 0.408000111579895, |
|
"learning_rate": 9.43453209441818e-05, |
|
"loss": 0.3165, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.527306967984934, |
|
"grad_norm": 0.5293291211128235, |
|
"learning_rate": 9.415586776436973e-05, |
|
"loss": 0.3696, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.527306967984934, |
|
"eval_loss": 0.3250181972980499, |
|
"eval_runtime": 50.0681, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5348399246704332, |
|
"grad_norm": 0.7923517227172852, |
|
"learning_rate": 9.396348968361281e-05, |
|
"loss": 0.388, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5423728813559322, |
|
"grad_norm": 0.5696645975112915, |
|
"learning_rate": 9.376819944510598e-05, |
|
"loss": 0.3383, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5499058380414312, |
|
"grad_norm": 0.8812904357910156, |
|
"learning_rate": 9.357000998494656e-05, |
|
"loss": 0.2872, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5574387947269304, |
|
"grad_norm": 0.4949493110179901, |
|
"learning_rate": 9.336893443127738e-05, |
|
"loss": 0.3039, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5649717514124294, |
|
"grad_norm": 0.40088051557540894, |
|
"learning_rate": 9.31649861034172e-05, |
|
"loss": 0.3246, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5725047080979284, |
|
"grad_norm": 0.40099695324897766, |
|
"learning_rate": 9.295817851097837e-05, |
|
"loss": 0.3544, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5800376647834274, |
|
"grad_norm": 0.4039609432220459, |
|
"learning_rate": 9.274852535297198e-05, |
|
"loss": 0.333, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5875706214689266, |
|
"grad_norm": 0.46073368191719055, |
|
"learning_rate": 9.253604051690046e-05, |
|
"loss": 0.2505, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5951035781544256, |
|
"grad_norm": 0.3831747770309448, |
|
"learning_rate": 9.232073807783759e-05, |
|
"loss": 0.2545, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6026365348399246, |
|
"grad_norm": 0.668875515460968, |
|
"learning_rate": 9.210263229749626e-05, |
|
"loss": 0.2892, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6026365348399246, |
|
"eval_loss": 0.25938624143600464, |
|
"eval_runtime": 50.0589, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6101694915254238, |
|
"grad_norm": 0.4702552556991577, |
|
"learning_rate": 9.188173762328367e-05, |
|
"loss": 0.259, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6177024482109228, |
|
"grad_norm": 0.6220546364784241, |
|
"learning_rate": 9.165806868734444e-05, |
|
"loss": 0.2636, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6252354048964218, |
|
"grad_norm": 0.526050329208374, |
|
"learning_rate": 9.143164030559122e-05, |
|
"loss": 0.2903, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.632768361581921, |
|
"grad_norm": 0.48076707124710083, |
|
"learning_rate": 9.120246747672347e-05, |
|
"loss": 0.2847, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.64030131826742, |
|
"grad_norm": 0.5487018823623657, |
|
"learning_rate": 9.097056538123376e-05, |
|
"loss": 0.2625, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.647834274952919, |
|
"grad_norm": 0.6256678700447083, |
|
"learning_rate": 9.073594938040231e-05, |
|
"loss": 0.267, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.655367231638418, |
|
"grad_norm": 0.6361900568008423, |
|
"learning_rate": 9.049863501527947e-05, |
|
"loss": 0.2927, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6629001883239172, |
|
"grad_norm": 0.5942044258117676, |
|
"learning_rate": 9.025863800565613e-05, |
|
"loss": 0.2995, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.6704331450094162, |
|
"grad_norm": 0.5061842203140259, |
|
"learning_rate": 9.001597424902267e-05, |
|
"loss": 0.232, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6779661016949152, |
|
"grad_norm": 0.826519250869751, |
|
"learning_rate": 8.977065981951566e-05, |
|
"loss": 0.2039, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6779661016949152, |
|
"eval_loss": 0.2045535147190094, |
|
"eval_runtime": 50.1117, |
|
"eval_samples_per_second": 1.497, |
|
"eval_steps_per_second": 0.758, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6854990583804144, |
|
"grad_norm": 0.5888376832008362, |
|
"learning_rate": 8.952271096685332e-05, |
|
"loss": 0.1997, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6930320150659134, |
|
"grad_norm": 0.7269095182418823, |
|
"learning_rate": 8.927214411525895e-05, |
|
"loss": 0.2589, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7005649717514124, |
|
"grad_norm": 0.8657869696617126, |
|
"learning_rate": 8.90189758623731e-05, |
|
"loss": 0.1956, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7080979284369114, |
|
"grad_norm": 0.7702244520187378, |
|
"learning_rate": 8.876322297815405e-05, |
|
"loss": 0.2218, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7156308851224106, |
|
"grad_norm": 0.6147786974906921, |
|
"learning_rate": 8.850490240376711e-05, |
|
"loss": 0.212, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7231638418079096, |
|
"grad_norm": 0.5774387717247009, |
|
"learning_rate": 8.824403125046225e-05, |
|
"loss": 0.1908, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7306967984934086, |
|
"grad_norm": 0.75771164894104, |
|
"learning_rate": 8.798062679844077e-05, |
|
"loss": 0.1696, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7382297551789078, |
|
"grad_norm": 0.6371806859970093, |
|
"learning_rate": 8.771470649571056e-05, |
|
"loss": 0.2085, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7457627118644068, |
|
"grad_norm": 0.5286451578140259, |
|
"learning_rate": 8.744628795693047e-05, |
|
"loss": 0.1982, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.7532956685499058, |
|
"grad_norm": 0.5060010552406311, |
|
"learning_rate": 8.717538896224332e-05, |
|
"loss": 0.2027, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7532956685499058, |
|
"eval_loss": 0.1616896241903305, |
|
"eval_runtime": 50.0794, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7608286252354048, |
|
"grad_norm": 0.7836325764656067, |
|
"learning_rate": 8.690202745609835e-05, |
|
"loss": 0.1358, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.768361581920904, |
|
"grad_norm": 0.675710141658783, |
|
"learning_rate": 8.662622154606237e-05, |
|
"loss": 0.1846, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.775894538606403, |
|
"grad_norm": 0.8798868656158447, |
|
"learning_rate": 8.634798950162048e-05, |
|
"loss": 0.1966, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.783427495291902, |
|
"grad_norm": 0.5943530201911926, |
|
"learning_rate": 8.606734975296578e-05, |
|
"loss": 0.1632, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7909604519774012, |
|
"grad_norm": 0.4987153708934784, |
|
"learning_rate": 8.578432088977859e-05, |
|
"loss": 0.1113, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7984934086629002, |
|
"grad_norm": 0.7684951424598694, |
|
"learning_rate": 8.549892165999505e-05, |
|
"loss": 0.1434, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8060263653483992, |
|
"grad_norm": 0.624474823474884, |
|
"learning_rate": 8.521117096856528e-05, |
|
"loss": 0.1754, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8135593220338984, |
|
"grad_norm": 1.0357481241226196, |
|
"learning_rate": 8.492108787620105e-05, |
|
"loss": 0.1761, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8210922787193974, |
|
"grad_norm": 0.7299608588218689, |
|
"learning_rate": 8.462869159811327e-05, |
|
"loss": 0.1145, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8286252354048964, |
|
"grad_norm": 0.7747260332107544, |
|
"learning_rate": 8.433400150273906e-05, |
|
"loss": 0.1494, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8286252354048964, |
|
"eval_loss": 0.12168504297733307, |
|
"eval_runtime": 50.0938, |
|
"eval_samples_per_second": 1.497, |
|
"eval_steps_per_second": 0.759, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8361581920903954, |
|
"grad_norm": 0.7779563665390015, |
|
"learning_rate": 8.403703711045892e-05, |
|
"loss": 0.159, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8436911487758946, |
|
"grad_norm": 1.1017848253250122, |
|
"learning_rate": 8.373781809230355e-05, |
|
"loss": 0.2012, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.8512241054613936, |
|
"grad_norm": 0.4848591983318329, |
|
"learning_rate": 8.343636426865096e-05, |
|
"loss": 0.1215, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8587570621468926, |
|
"grad_norm": 0.8865452408790588, |
|
"learning_rate": 8.313269560791342e-05, |
|
"loss": 0.0923, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8662900188323918, |
|
"grad_norm": 0.699765145778656, |
|
"learning_rate": 8.28268322252149e-05, |
|
"loss": 0.1587, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8738229755178908, |
|
"grad_norm": 0.5934118628501892, |
|
"learning_rate": 8.251879438105854e-05, |
|
"loss": 0.1288, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8813559322033898, |
|
"grad_norm": 0.7120998501777649, |
|
"learning_rate": 8.220860247998456e-05, |
|
"loss": 0.101, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 1.0435795783996582, |
|
"learning_rate": 8.189627706921877e-05, |
|
"loss": 0.1067, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.896421845574388, |
|
"grad_norm": 1.4066674709320068, |
|
"learning_rate": 8.15818388373114e-05, |
|
"loss": 0.1313, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.903954802259887, |
|
"grad_norm": 0.7823694348335266, |
|
"learning_rate": 8.126530861276677e-05, |
|
"loss": 0.0863, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.903954802259887, |
|
"eval_loss": 0.09002000838518143, |
|
"eval_runtime": 50.1325, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.911487758945386, |
|
"grad_norm": 0.617436945438385, |
|
"learning_rate": 8.094670736266353e-05, |
|
"loss": 0.0868, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9190207156308852, |
|
"grad_norm": 0.602743923664093, |
|
"learning_rate": 8.062605619126584e-05, |
|
"loss": 0.0983, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.9265536723163842, |
|
"grad_norm": 1.1149927377700806, |
|
"learning_rate": 8.030337633862542e-05, |
|
"loss": 0.1078, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9340866290018832, |
|
"grad_norm": 0.688164472579956, |
|
"learning_rate": 7.997868917917453e-05, |
|
"loss": 0.0722, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9416195856873822, |
|
"grad_norm": 0.6507720351219177, |
|
"learning_rate": 7.965201622031021e-05, |
|
"loss": 0.0824, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9491525423728814, |
|
"grad_norm": 0.5873366594314575, |
|
"learning_rate": 7.932337910096961e-05, |
|
"loss": 0.0633, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9566854990583804, |
|
"grad_norm": 0.48810163140296936, |
|
"learning_rate": 7.899279959019654e-05, |
|
"loss": 0.0579, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9642184557438794, |
|
"grad_norm": 0.7808175086975098, |
|
"learning_rate": 7.866029958569956e-05, |
|
"loss": 0.0959, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.9717514124293786, |
|
"grad_norm": 0.7213129997253418, |
|
"learning_rate": 7.832590111240145e-05, |
|
"loss": 0.1074, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.9792843691148776, |
|
"grad_norm": 0.7429501414299011, |
|
"learning_rate": 7.798962632098024e-05, |
|
"loss": 0.0883, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9792843691148776, |
|
"eval_loss": 0.07171642780303955, |
|
"eval_runtime": 50.1218, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9868173258003766, |
|
"grad_norm": 1.2204382419586182, |
|
"learning_rate": 7.765149748640197e-05, |
|
"loss": 0.0543, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9943502824858758, |
|
"grad_norm": 0.4847294092178345, |
|
"learning_rate": 7.73115370064452e-05, |
|
"loss": 0.0646, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.0075329566854991, |
|
"grad_norm": 0.7584130764007568, |
|
"learning_rate": 7.696976740021733e-05, |
|
"loss": 0.1122, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.015065913370998, |
|
"grad_norm": 0.4241285026073456, |
|
"learning_rate": 7.6626211306663e-05, |
|
"loss": 0.0382, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.0225988700564972, |
|
"grad_norm": 0.6364421844482422, |
|
"learning_rate": 7.628089148306434e-05, |
|
"loss": 0.0707, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.0301318267419963, |
|
"grad_norm": 0.6198002099990845, |
|
"learning_rate": 7.59338308035337e-05, |
|
"loss": 0.0942, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.0376647834274952, |
|
"grad_norm": 0.43464764952659607, |
|
"learning_rate": 7.558505225749827e-05, |
|
"loss": 0.086, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.0451977401129944, |
|
"grad_norm": 0.3746040463447571, |
|
"learning_rate": 7.523457894817745e-05, |
|
"loss": 0.0426, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.0527306967984935, |
|
"grad_norm": 0.4349992573261261, |
|
"learning_rate": 7.488243409105233e-05, |
|
"loss": 0.0665, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.0602636534839924, |
|
"grad_norm": 0.44264963269233704, |
|
"learning_rate": 7.452864101232798e-05, |
|
"loss": 0.0854, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0602636534839924, |
|
"eval_loss": 0.061161428689956665, |
|
"eval_runtime": 50.0795, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0677966101694916, |
|
"grad_norm": 0.535400390625, |
|
"learning_rate": 7.417322314738822e-05, |
|
"loss": 0.0502, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0753295668549905, |
|
"grad_norm": 0.4334566295146942, |
|
"learning_rate": 7.381620403924333e-05, |
|
"loss": 0.0547, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.0828625235404896, |
|
"grad_norm": 0.6763977408409119, |
|
"learning_rate": 7.345760733697055e-05, |
|
"loss": 0.0576, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.0903954802259888, |
|
"grad_norm": 0.6230652928352356, |
|
"learning_rate": 7.30974567941475e-05, |
|
"loss": 0.0746, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0979284369114877, |
|
"grad_norm": 0.5115144848823547, |
|
"learning_rate": 7.273577626727884e-05, |
|
"loss": 0.0707, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.1054613935969868, |
|
"grad_norm": 0.4081382751464844, |
|
"learning_rate": 7.237258971421587e-05, |
|
"loss": 0.0417, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.112994350282486, |
|
"grad_norm": 0.45929810404777527, |
|
"learning_rate": 7.20079211925696e-05, |
|
"loss": 0.0402, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.1205273069679849, |
|
"grad_norm": 0.41027042269706726, |
|
"learning_rate": 7.164179485811727e-05, |
|
"loss": 0.057, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.128060263653484, |
|
"grad_norm": 0.732723593711853, |
|
"learning_rate": 7.127423496320212e-05, |
|
"loss": 0.0686, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.1355932203389831, |
|
"grad_norm": 0.4201294481754303, |
|
"learning_rate": 7.090526585512696e-05, |
|
"loss": 0.075, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1355932203389831, |
|
"eval_loss": 0.056692853569984436, |
|
"eval_runtime": 50.061, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.143126177024482, |
|
"grad_norm": 0.3101370930671692, |
|
"learning_rate": 7.053491197454142e-05, |
|
"loss": 0.0366, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.1506591337099812, |
|
"grad_norm": 0.4567885100841522, |
|
"learning_rate": 7.016319785382296e-05, |
|
"loss": 0.0778, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.1581920903954803, |
|
"grad_norm": 0.3678460419178009, |
|
"learning_rate": 6.979014811545189e-05, |
|
"loss": 0.0495, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.1657250470809792, |
|
"grad_norm": 0.5862255096435547, |
|
"learning_rate": 6.941578747038023e-05, |
|
"loss": 0.0735, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.1732580037664784, |
|
"grad_norm": 0.5297702550888062, |
|
"learning_rate": 6.904014071639503e-05, |
|
"loss": 0.034, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.1807909604519775, |
|
"grad_norm": 0.3692375123500824, |
|
"learning_rate": 6.866323273647563e-05, |
|
"loss": 0.0587, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.1883239171374764, |
|
"grad_norm": 0.3478219509124756, |
|
"learning_rate": 6.828508849714546e-05, |
|
"loss": 0.0676, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.1958568738229756, |
|
"grad_norm": 0.5066402554512024, |
|
"learning_rate": 6.79057330468182e-05, |
|
"loss": 0.0362, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.2033898305084745, |
|
"grad_norm": 0.38956108689308167, |
|
"learning_rate": 6.752519151413861e-05, |
|
"loss": 0.0475, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.2109227871939736, |
|
"grad_norm": 0.3372087776660919, |
|
"learning_rate": 6.7143489106318e-05, |
|
"loss": 0.0835, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2109227871939736, |
|
"eval_loss": 0.054186370223760605, |
|
"eval_runtime": 50.1057, |
|
"eval_samples_per_second": 1.497, |
|
"eval_steps_per_second": 0.758, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2184557438794728, |
|
"grad_norm": 0.4884163737297058, |
|
"learning_rate": 6.676065110746444e-05, |
|
"loss": 0.057, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.2259887005649717, |
|
"grad_norm": 0.42741522192955017, |
|
"learning_rate": 6.637670287690799e-05, |
|
"loss": 0.0493, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.2335216572504708, |
|
"grad_norm": 0.37808606028556824, |
|
"learning_rate": 6.599166984752087e-05, |
|
"loss": 0.0324, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.24105461393597, |
|
"grad_norm": 0.6114926934242249, |
|
"learning_rate": 6.560557752403277e-05, |
|
"loss": 0.0975, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.2485875706214689, |
|
"grad_norm": 0.2513774633407593, |
|
"learning_rate": 6.52184514813414e-05, |
|
"loss": 0.0323, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.256120527306968, |
|
"grad_norm": 0.319553941488266, |
|
"learning_rate": 6.483031736281843e-05, |
|
"loss": 0.0246, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.2636534839924671, |
|
"grad_norm": 0.6170870661735535, |
|
"learning_rate": 6.444120087861081e-05, |
|
"loss": 0.05, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.271186440677966, |
|
"grad_norm": 0.4168320596218109, |
|
"learning_rate": 6.40511278039378e-05, |
|
"loss": 0.0586, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.2787193973634652, |
|
"grad_norm": 0.3265027403831482, |
|
"learning_rate": 6.366012397738355e-05, |
|
"loss": 0.0666, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.286252354048964, |
|
"grad_norm": 0.609319806098938, |
|
"learning_rate": 6.326821529918553e-05, |
|
"loss": 0.085, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.286252354048964, |
|
"eval_loss": 0.053176406770944595, |
|
"eval_runtime": 50.1048, |
|
"eval_samples_per_second": 1.497, |
|
"eval_steps_per_second": 0.758, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2937853107344632, |
|
"grad_norm": 0.5599777698516846, |
|
"learning_rate": 6.287542772951897e-05, |
|
"loss": 0.0723, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.3013182674199624, |
|
"grad_norm": 0.4176930785179138, |
|
"learning_rate": 6.248178728677711e-05, |
|
"loss": 0.0728, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.3088512241054615, |
|
"grad_norm": 0.3369368314743042, |
|
"learning_rate": 6.208732004584791e-05, |
|
"loss": 0.0499, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.3163841807909604, |
|
"grad_norm": 0.45373421907424927, |
|
"learning_rate": 6.16920521363867e-05, |
|
"loss": 0.0792, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.3239171374764596, |
|
"grad_norm": 0.3415266275405884, |
|
"learning_rate": 6.129600974108538e-05, |
|
"loss": 0.048, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.3314500941619585, |
|
"grad_norm": 0.5422535538673401, |
|
"learning_rate": 6.089921909393812e-05, |
|
"loss": 0.0883, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.3389830508474576, |
|
"grad_norm": 0.25546208024024963, |
|
"learning_rate": 6.050170647850351e-05, |
|
"loss": 0.0473, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.3465160075329567, |
|
"grad_norm": 0.5453774929046631, |
|
"learning_rate": 6.0103498226163603e-05, |
|
"loss": 0.0984, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.3540489642184557, |
|
"grad_norm": 0.340578556060791, |
|
"learning_rate": 5.970462071437973e-05, |
|
"loss": 0.0354, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3615819209039548, |
|
"grad_norm": 0.4705830216407776, |
|
"learning_rate": 5.93051003649452e-05, |
|
"loss": 0.054, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3615819209039548, |
|
"eval_loss": 0.051415782421827316, |
|
"eval_runtime": 50.1031, |
|
"eval_samples_per_second": 1.497, |
|
"eval_steps_per_second": 0.758, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.369114877589454, |
|
"grad_norm": 0.5732293128967285, |
|
"learning_rate": 5.890496364223509e-05, |
|
"loss": 0.0626, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.3766478342749529, |
|
"grad_norm": 0.4315062463283539, |
|
"learning_rate": 5.850423705145334e-05, |
|
"loss": 0.0613, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.384180790960452, |
|
"grad_norm": 0.5873361825942993, |
|
"learning_rate": 5.8102947136876876e-05, |
|
"loss": 0.0915, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.3917137476459511, |
|
"grad_norm": 0.2784740626811981, |
|
"learning_rate": 5.770112048009747e-05, |
|
"loss": 0.0259, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.39924670433145, |
|
"grad_norm": 0.36659327149391174, |
|
"learning_rate": 5.7298783698260874e-05, |
|
"loss": 0.0643, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.4067796610169492, |
|
"grad_norm": 0.31498560309410095, |
|
"learning_rate": 5.68959634423037e-05, |
|
"loss": 0.0345, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.414312617702448, |
|
"grad_norm": 0.4332311451435089, |
|
"learning_rate": 5.64926863951881e-05, |
|
"loss": 0.0461, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.4218455743879472, |
|
"grad_norm": 0.35596826672554016, |
|
"learning_rate": 5.60889792701342e-05, |
|
"loss": 0.032, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.4293785310734464, |
|
"grad_norm": 0.43455907702445984, |
|
"learning_rate": 5.568486880885068e-05, |
|
"loss": 0.0283, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.4369114877589455, |
|
"grad_norm": 2.2231392860412598, |
|
"learning_rate": 5.52803817797633e-05, |
|
"loss": 0.0337, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4369114877589455, |
|
"eval_loss": 0.050475846976041794, |
|
"eval_runtime": 50.1291, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 0.3397436738014221, |
|
"learning_rate": 5.487554497624189e-05, |
|
"loss": 0.0481, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.4519774011299436, |
|
"grad_norm": 0.3190496265888214, |
|
"learning_rate": 5.4470385214825416e-05, |
|
"loss": 0.0606, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.4595103578154425, |
|
"grad_norm": 0.3939787447452545, |
|
"learning_rate": 5.406492933344571e-05, |
|
"loss": 0.1776, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.4670433145009416, |
|
"grad_norm": 0.33525291085243225, |
|
"learning_rate": 5.365920418964973e-05, |
|
"loss": 0.0516, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.4745762711864407, |
|
"grad_norm": 0.32538145780563354, |
|
"learning_rate": 5.3253236658820396e-05, |
|
"loss": 0.0465, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.4821092278719397, |
|
"grad_norm": 0.3278179466724396, |
|
"learning_rate": 5.28470536323965e-05, |
|
"loss": 0.067, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.4896421845574388, |
|
"grad_norm": 0.41677576303482056, |
|
"learning_rate": 5.244068201609133e-05, |
|
"loss": 0.0519, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.497175141242938, |
|
"grad_norm": 0.44194525480270386, |
|
"learning_rate": 5.2034148728110424e-05, |
|
"loss": 0.0686, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.5047080979284368, |
|
"grad_norm": 0.3261134922504425, |
|
"learning_rate": 5.162748069736851e-05, |
|
"loss": 0.0534, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.512241054613936, |
|
"grad_norm": 0.4644230008125305, |
|
"learning_rate": 5.1220704861705774e-05, |
|
"loss": 0.0421, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.512241054613936, |
|
"eval_loss": 0.04965050518512726, |
|
"eval_runtime": 50.0829, |
|
"eval_samples_per_second": 1.498, |
|
"eval_steps_per_second": 0.759, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5197740112994351, |
|
"grad_norm": 0.3440283238887787, |
|
"learning_rate": 5.081384816610336e-05, |
|
"loss": 0.0621, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.527306967984934, |
|
"grad_norm": 0.2635885179042816, |
|
"learning_rate": 5.0406937560898646e-05, |
|
"loss": 0.0443, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.5348399246704332, |
|
"grad_norm": 0.6189826726913452, |
|
"learning_rate": 5e-05, |
|
"loss": 0.084, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.542372881355932, |
|
"grad_norm": 0.3221701383590698, |
|
"learning_rate": 4.9593062439101365e-05, |
|
"loss": 0.0348, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.5499058380414312, |
|
"grad_norm": 0.30998164415359497, |
|
"learning_rate": 4.918615183389665e-05, |
|
"loss": 0.0529, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.5574387947269304, |
|
"grad_norm": 0.47885751724243164, |
|
"learning_rate": 4.877929513829424e-05, |
|
"loss": 0.0454, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.5649717514124295, |
|
"grad_norm": 0.5070298910140991, |
|
"learning_rate": 4.8372519302631486e-05, |
|
"loss": 0.0473, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.5725047080979284, |
|
"grad_norm": 0.3871539533138275, |
|
"learning_rate": 4.796585127188958e-05, |
|
"loss": 0.056, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.5800376647834273, |
|
"grad_norm": 0.4367206394672394, |
|
"learning_rate": 4.755931798390867e-05, |
|
"loss": 0.0734, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.5875706214689265, |
|
"grad_norm": 0.2854059338569641, |
|
"learning_rate": 4.715294636760352e-05, |
|
"loss": 0.0563, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5875706214689265, |
|
"eval_loss": 0.04947199299931526, |
|
"eval_runtime": 50.1478, |
|
"eval_samples_per_second": 1.496, |
|
"eval_steps_per_second": 0.758, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5951035781544256, |
|
"grad_norm": 0.3172830641269684, |
|
"learning_rate": 4.674676334117962e-05, |
|
"loss": 0.0356, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.6026365348399247, |
|
"grad_norm": 0.5930951237678528, |
|
"learning_rate": 4.634079581035029e-05, |
|
"loss": 0.0687, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.6101694915254239, |
|
"grad_norm": 0.3214702010154724, |
|
"learning_rate": 4.59350706665543e-05, |
|
"loss": 0.0433, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.6177024482109228, |
|
"grad_norm": 0.3200390636920929, |
|
"learning_rate": 4.55296147851746e-05, |
|
"loss": 0.0518, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.6252354048964217, |
|
"grad_norm": 0.32656821608543396, |
|
"learning_rate": 4.512445502375813e-05, |
|
"loss": 0.0391, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.6327683615819208, |
|
"grad_norm": 0.45625510811805725, |
|
"learning_rate": 4.471961822023671e-05, |
|
"loss": 0.0244, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.64030131826742, |
|
"grad_norm": 0.32512399554252625, |
|
"learning_rate": 4.431513119114934e-05, |
|
"loss": 0.1139, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.6478342749529191, |
|
"grad_norm": 0.3270999491214752, |
|
"learning_rate": 4.391102072986581e-05, |
|
"loss": 0.0988, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.655367231638418, |
|
"grad_norm": 0.44575634598731995, |
|
"learning_rate": 4.350731360481191e-05, |
|
"loss": 0.0525, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.6629001883239172, |
|
"grad_norm": 0.3585101366043091, |
|
"learning_rate": 4.3104036557696295e-05, |
|
"loss": 0.186, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6629001883239172, |
|
"eval_loss": 0.04876178875565529, |
|
"eval_runtime": 50.0932, |
|
"eval_samples_per_second": 1.497, |
|
"eval_steps_per_second": 0.759, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6629001883239172, |
|
"step": 220, |
|
"total_flos": 6.808102961998848e+16, |
|
"train_loss": 0.33265126400034534, |
|
"train_runtime": 4751.8019, |
|
"train_samples_per_second": 0.67, |
|
"train_steps_per_second": 0.083 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 396, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 4, |
|
"early_stopping_threshold": 0.0015 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 4 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.808102961998848e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|