|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5028013216491883, |
|
"eval_steps": 500, |
|
"global_step": 7000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007182876023559833, |
|
"grad_norm": 102.55565643310547, |
|
"learning_rate": 3.339317773788151e-07, |
|
"loss": 4.51, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.014365752047119667, |
|
"grad_norm": 20.8671875, |
|
"learning_rate": 6.929982046678636e-07, |
|
"loss": 1.3058, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0215486280706795, |
|
"grad_norm": 16.238826751708984, |
|
"learning_rate": 1.0520646319569122e-06, |
|
"loss": 0.7671, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.028731504094239333, |
|
"grad_norm": 12.669879913330078, |
|
"learning_rate": 1.4111310592459606e-06, |
|
"loss": 0.639, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03591438011779917, |
|
"grad_norm": 13.375199317932129, |
|
"learning_rate": 1.770197486535009e-06, |
|
"loss": 0.5396, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.043097256141359, |
|
"grad_norm": 36.680416107177734, |
|
"learning_rate": 2.1292639138240576e-06, |
|
"loss": 0.4649, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.050280132164918834, |
|
"grad_norm": 15.553112983703613, |
|
"learning_rate": 2.488330341113106e-06, |
|
"loss": 0.4561, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.057463008188478666, |
|
"grad_norm": 28.032434463500977, |
|
"learning_rate": 2.847396768402155e-06, |
|
"loss": 0.4152, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0646458842120385, |
|
"grad_norm": 40.55122756958008, |
|
"learning_rate": 3.2064631956912027e-06, |
|
"loss": 0.412, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.07182876023559834, |
|
"grad_norm": 48.380340576171875, |
|
"learning_rate": 3.5655296229802514e-06, |
|
"loss": 0.4159, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07901163625915816, |
|
"grad_norm": 18.708255767822266, |
|
"learning_rate": 3.9245960502693e-06, |
|
"loss": 0.3397, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.086194512282718, |
|
"grad_norm": 10.165841102600098, |
|
"learning_rate": 4.283662477558348e-06, |
|
"loss": 0.355, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09337738830627783, |
|
"grad_norm": 17.107013702392578, |
|
"learning_rate": 4.6427289048473974e-06, |
|
"loss": 0.3312, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.10056026432983767, |
|
"grad_norm": 38.87083053588867, |
|
"learning_rate": 5.001795332136446e-06, |
|
"loss": 0.2452, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.1077431403533975, |
|
"grad_norm": 17.743247985839844, |
|
"learning_rate": 5.360861759425494e-06, |
|
"loss": 0.2204, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.11492601637695733, |
|
"grad_norm": 35.18050765991211, |
|
"learning_rate": 5.719928186714543e-06, |
|
"loss": 0.2338, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.12210889240051717, |
|
"grad_norm": 8.888273239135742, |
|
"learning_rate": 6.078994614003591e-06, |
|
"loss": 0.2269, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.129291768424077, |
|
"grad_norm": 6.807362079620361, |
|
"learning_rate": 6.4380610412926396e-06, |
|
"loss": 0.233, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.13647464444763682, |
|
"grad_norm": 3.313542366027832, |
|
"learning_rate": 6.797127468581688e-06, |
|
"loss": 0.2033, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.14365752047119668, |
|
"grad_norm": 4.184378623962402, |
|
"learning_rate": 7.156193895870737e-06, |
|
"loss": 0.1904, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1508403964947565, |
|
"grad_norm": 29.64693832397461, |
|
"learning_rate": 7.515260323159785e-06, |
|
"loss": 0.2063, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.15802327251831633, |
|
"grad_norm": 19.15960121154785, |
|
"learning_rate": 7.874326750448834e-06, |
|
"loss": 0.2206, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.16520614854187618, |
|
"grad_norm": 12.115074157714844, |
|
"learning_rate": 8.233393177737883e-06, |
|
"loss": 0.1949, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.172389024565436, |
|
"grad_norm": 9.53875732421875, |
|
"learning_rate": 8.59245960502693e-06, |
|
"loss": 0.2, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.17957190058899583, |
|
"grad_norm": 3.5382916927337646, |
|
"learning_rate": 8.951526032315979e-06, |
|
"loss": 0.1875, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.18675477661255566, |
|
"grad_norm": 1.2547463178634644, |
|
"learning_rate": 9.310592459605027e-06, |
|
"loss": 0.2033, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.1939376526361155, |
|
"grad_norm": 1.9213370084762573, |
|
"learning_rate": 9.669658886894077e-06, |
|
"loss": 0.1946, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.20112052865967533, |
|
"grad_norm": 35.58647155761719, |
|
"learning_rate": 9.996807534219244e-06, |
|
"loss": 0.2231, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.20830340468323516, |
|
"grad_norm": 3.3178770542144775, |
|
"learning_rate": 9.956901711959775e-06, |
|
"loss": 0.1868, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.215486280706795, |
|
"grad_norm": 22.328224182128906, |
|
"learning_rate": 9.916995889700309e-06, |
|
"loss": 0.2107, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22266915673035484, |
|
"grad_norm": 28.90311050415039, |
|
"learning_rate": 9.877090067440841e-06, |
|
"loss": 0.2068, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.22985203275391466, |
|
"grad_norm": 4.699060916900635, |
|
"learning_rate": 9.837184245181373e-06, |
|
"loss": 0.1987, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2370349087774745, |
|
"grad_norm": 16.968143463134766, |
|
"learning_rate": 9.797278422921906e-06, |
|
"loss": 0.1688, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.24421778480103434, |
|
"grad_norm": 8.749013900756836, |
|
"learning_rate": 9.757372600662436e-06, |
|
"loss": 0.1803, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.25140066082459417, |
|
"grad_norm": 16.449424743652344, |
|
"learning_rate": 9.71746677840297e-06, |
|
"loss": 0.2231, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.258583536848154, |
|
"grad_norm": 0.8674511313438416, |
|
"learning_rate": 9.677560956143502e-06, |
|
"loss": 0.1853, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.2657664128717138, |
|
"grad_norm": 6.190998554229736, |
|
"learning_rate": 9.637655133884035e-06, |
|
"loss": 0.1669, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.27294928889527365, |
|
"grad_norm": 10.102142333984375, |
|
"learning_rate": 9.597749311624567e-06, |
|
"loss": 0.2075, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.2801321649188335, |
|
"grad_norm": 22.571224212646484, |
|
"learning_rate": 9.557843489365099e-06, |
|
"loss": 0.2019, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.28731504094239335, |
|
"grad_norm": 29.87354278564453, |
|
"learning_rate": 9.517937667105631e-06, |
|
"loss": 0.2022, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2944979169659532, |
|
"grad_norm": 4.910442352294922, |
|
"learning_rate": 9.478031844846164e-06, |
|
"loss": 0.1816, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.301680792989513, |
|
"grad_norm": 18.66014289855957, |
|
"learning_rate": 9.438126022586697e-06, |
|
"loss": 0.1998, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.30886366901307283, |
|
"grad_norm": 42.844818115234375, |
|
"learning_rate": 9.398220200327228e-06, |
|
"loss": 0.193, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.31604654503663265, |
|
"grad_norm": 8.83779525756836, |
|
"learning_rate": 9.35831437806776e-06, |
|
"loss": 0.1681, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3232294210601925, |
|
"grad_norm": 3.474717140197754, |
|
"learning_rate": 9.318408555808294e-06, |
|
"loss": 0.1608, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.33041229708375236, |
|
"grad_norm": 4.746242046356201, |
|
"learning_rate": 9.278502733548825e-06, |
|
"loss": 0.1552, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.3375951731073122, |
|
"grad_norm": 4.5783915519714355, |
|
"learning_rate": 9.238596911289359e-06, |
|
"loss": 0.1646, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.344778049130872, |
|
"grad_norm": 9.975970268249512, |
|
"learning_rate": 9.198691089029889e-06, |
|
"loss": 0.1791, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.35196092515443184, |
|
"grad_norm": 1.2798261642456055, |
|
"learning_rate": 9.158785266770423e-06, |
|
"loss": 0.1842, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.35914380117799166, |
|
"grad_norm": 6.015448570251465, |
|
"learning_rate": 9.118879444510955e-06, |
|
"loss": 0.1751, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3663266772015515, |
|
"grad_norm": 7.062718391418457, |
|
"learning_rate": 9.078973622251488e-06, |
|
"loss": 0.1751, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.3735095532251113, |
|
"grad_norm": 0.9326837062835693, |
|
"learning_rate": 9.03906779999202e-06, |
|
"loss": 0.1646, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.3806924292486712, |
|
"grad_norm": 1.695087194442749, |
|
"learning_rate": 8.999161977732552e-06, |
|
"loss": 0.1577, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.387875305272231, |
|
"grad_norm": 19.320249557495117, |
|
"learning_rate": 8.959256155473084e-06, |
|
"loss": 0.1729, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.39505818129579084, |
|
"grad_norm": 3.8522274494171143, |
|
"learning_rate": 8.919350333213616e-06, |
|
"loss": 0.1781, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.40224105731935067, |
|
"grad_norm": 2.3610100746154785, |
|
"learning_rate": 8.879444510954149e-06, |
|
"loss": 0.1975, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.4094239333429105, |
|
"grad_norm": 6.998334884643555, |
|
"learning_rate": 8.839538688694681e-06, |
|
"loss": 0.1629, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.4166068093664703, |
|
"grad_norm": 15.58069896697998, |
|
"learning_rate": 8.799632866435213e-06, |
|
"loss": 0.1764, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.42378968539003015, |
|
"grad_norm": 40.70027160644531, |
|
"learning_rate": 8.759727044175747e-06, |
|
"loss": 0.1923, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.43097256141359, |
|
"grad_norm": 8.034761428833008, |
|
"learning_rate": 8.719821221916278e-06, |
|
"loss": 0.1795, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.43815543743714985, |
|
"grad_norm": 1.1275875568389893, |
|
"learning_rate": 8.679915399656812e-06, |
|
"loss": 0.1477, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.4453383134607097, |
|
"grad_norm": 8.35661792755127, |
|
"learning_rate": 8.640009577397342e-06, |
|
"loss": 0.1687, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.4525211894842695, |
|
"grad_norm": 19.17778968811035, |
|
"learning_rate": 8.600103755137876e-06, |
|
"loss": 0.1409, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.45970406550782933, |
|
"grad_norm": 22.45587730407715, |
|
"learning_rate": 8.560197932878408e-06, |
|
"loss": 0.1676, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.46688694153138915, |
|
"grad_norm": 27.29585075378418, |
|
"learning_rate": 8.520292110618939e-06, |
|
"loss": 0.1656, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.474069817554949, |
|
"grad_norm": 2.3996095657348633, |
|
"learning_rate": 8.480386288359473e-06, |
|
"loss": 0.1622, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.48125269357850886, |
|
"grad_norm": 15.577359199523926, |
|
"learning_rate": 8.440480466100005e-06, |
|
"loss": 0.1748, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.4884355696020687, |
|
"grad_norm": 16.934419631958008, |
|
"learning_rate": 8.400574643840537e-06, |
|
"loss": 0.1689, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.4956184456256285, |
|
"grad_norm": 2.9546115398406982, |
|
"learning_rate": 8.36066882158107e-06, |
|
"loss": 0.1695, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5028013216491883, |
|
"grad_norm": 2.370490550994873, |
|
"learning_rate": 8.320762999321602e-06, |
|
"loss": 0.1396, |
|
"step": 7000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 27844, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|