{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5028013216491883, "eval_steps": 500, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007182876023559833, "grad_norm": 102.55565643310547, "learning_rate": 3.339317773788151e-07, "loss": 4.51, "step": 100 }, { "epoch": 0.014365752047119667, "grad_norm": 20.8671875, "learning_rate": 6.929982046678636e-07, "loss": 1.3058, "step": 200 }, { "epoch": 0.0215486280706795, "grad_norm": 16.238826751708984, "learning_rate": 1.0520646319569122e-06, "loss": 0.7671, "step": 300 }, { "epoch": 0.028731504094239333, "grad_norm": 12.669879913330078, "learning_rate": 1.4111310592459606e-06, "loss": 0.639, "step": 400 }, { "epoch": 0.03591438011779917, "grad_norm": 13.375199317932129, "learning_rate": 1.770197486535009e-06, "loss": 0.5396, "step": 500 }, { "epoch": 0.043097256141359, "grad_norm": 36.680416107177734, "learning_rate": 2.1292639138240576e-06, "loss": 0.4649, "step": 600 }, { "epoch": 0.050280132164918834, "grad_norm": 15.553112983703613, "learning_rate": 2.488330341113106e-06, "loss": 0.4561, "step": 700 }, { "epoch": 0.057463008188478666, "grad_norm": 28.032434463500977, "learning_rate": 2.847396768402155e-06, "loss": 0.4152, "step": 800 }, { "epoch": 0.0646458842120385, "grad_norm": 40.55122756958008, "learning_rate": 3.2064631956912027e-06, "loss": 0.412, "step": 900 }, { "epoch": 0.07182876023559834, "grad_norm": 48.380340576171875, "learning_rate": 3.5655296229802514e-06, "loss": 0.4159, "step": 1000 }, { "epoch": 0.07901163625915816, "grad_norm": 18.708255767822266, "learning_rate": 3.9245960502693e-06, "loss": 0.3397, "step": 1100 }, { "epoch": 0.086194512282718, "grad_norm": 10.165841102600098, "learning_rate": 4.283662477558348e-06, "loss": 0.355, "step": 1200 }, { "epoch": 0.09337738830627783, "grad_norm": 17.107013702392578, "learning_rate": 4.6427289048473974e-06, "loss": 0.3312, "step": 1300 }, { "epoch": 0.10056026432983767, "grad_norm": 38.87083053588867, "learning_rate": 5.001795332136446e-06, "loss": 0.2452, "step": 1400 }, { "epoch": 0.1077431403533975, "grad_norm": 17.743247985839844, "learning_rate": 5.360861759425494e-06, "loss": 0.2204, "step": 1500 }, { "epoch": 0.11492601637695733, "grad_norm": 35.18050765991211, "learning_rate": 5.719928186714543e-06, "loss": 0.2338, "step": 1600 }, { "epoch": 0.12210889240051717, "grad_norm": 8.888273239135742, "learning_rate": 6.078994614003591e-06, "loss": 0.2269, "step": 1700 }, { "epoch": 0.129291768424077, "grad_norm": 6.807362079620361, "learning_rate": 6.4380610412926396e-06, "loss": 0.233, "step": 1800 }, { "epoch": 0.13647464444763682, "grad_norm": 3.313542366027832, "learning_rate": 6.797127468581688e-06, "loss": 0.2033, "step": 1900 }, { "epoch": 0.14365752047119668, "grad_norm": 4.184378623962402, "learning_rate": 7.156193895870737e-06, "loss": 0.1904, "step": 2000 }, { "epoch": 0.1508403964947565, "grad_norm": 29.64693832397461, "learning_rate": 7.515260323159785e-06, "loss": 0.2063, "step": 2100 }, { "epoch": 0.15802327251831633, "grad_norm": 19.15960121154785, "learning_rate": 7.874326750448834e-06, "loss": 0.2206, "step": 2200 }, { "epoch": 0.16520614854187618, "grad_norm": 12.115074157714844, "learning_rate": 8.233393177737883e-06, "loss": 0.1949, "step": 2300 }, { "epoch": 0.172389024565436, "grad_norm": 9.53875732421875, "learning_rate": 8.59245960502693e-06, "loss": 0.2, "step": 2400 }, { "epoch": 0.17957190058899583, "grad_norm": 3.5382916927337646, "learning_rate": 8.951526032315979e-06, "loss": 0.1875, "step": 2500 }, { "epoch": 0.18675477661255566, "grad_norm": 1.2547463178634644, "learning_rate": 9.310592459605027e-06, "loss": 0.2033, "step": 2600 }, { "epoch": 0.1939376526361155, "grad_norm": 1.9213370084762573, "learning_rate": 9.669658886894077e-06, "loss": 0.1946, "step": 2700 }, { "epoch": 0.20112052865967533, "grad_norm": 35.58647155761719, "learning_rate": 9.996807534219244e-06, "loss": 0.2231, "step": 2800 }, { "epoch": 0.20830340468323516, "grad_norm": 3.3178770542144775, "learning_rate": 9.956901711959775e-06, "loss": 0.1868, "step": 2900 }, { "epoch": 0.215486280706795, "grad_norm": 22.328224182128906, "learning_rate": 9.916995889700309e-06, "loss": 0.2107, "step": 3000 }, { "epoch": 0.22266915673035484, "grad_norm": 28.90311050415039, "learning_rate": 9.877090067440841e-06, "loss": 0.2068, "step": 3100 }, { "epoch": 0.22985203275391466, "grad_norm": 4.699060916900635, "learning_rate": 9.837184245181373e-06, "loss": 0.1987, "step": 3200 }, { "epoch": 0.2370349087774745, "grad_norm": 16.968143463134766, "learning_rate": 9.797278422921906e-06, "loss": 0.1688, "step": 3300 }, { "epoch": 0.24421778480103434, "grad_norm": 8.749013900756836, "learning_rate": 9.757372600662436e-06, "loss": 0.1803, "step": 3400 }, { "epoch": 0.25140066082459417, "grad_norm": 16.449424743652344, "learning_rate": 9.71746677840297e-06, "loss": 0.2231, "step": 3500 }, { "epoch": 0.258583536848154, "grad_norm": 0.8674511313438416, "learning_rate": 9.677560956143502e-06, "loss": 0.1853, "step": 3600 }, { "epoch": 0.2657664128717138, "grad_norm": 6.190998554229736, "learning_rate": 9.637655133884035e-06, "loss": 0.1669, "step": 3700 }, { "epoch": 0.27294928889527365, "grad_norm": 10.102142333984375, "learning_rate": 9.597749311624567e-06, "loss": 0.2075, "step": 3800 }, { "epoch": 0.2801321649188335, "grad_norm": 22.571224212646484, "learning_rate": 9.557843489365099e-06, "loss": 0.2019, "step": 3900 }, { "epoch": 0.28731504094239335, "grad_norm": 29.87354278564453, "learning_rate": 9.517937667105631e-06, "loss": 0.2022, "step": 4000 }, { "epoch": 0.2944979169659532, "grad_norm": 4.910442352294922, "learning_rate": 9.478031844846164e-06, "loss": 0.1816, "step": 4100 }, { "epoch": 0.301680792989513, "grad_norm": 18.66014289855957, "learning_rate": 9.438126022586697e-06, "loss": 0.1998, "step": 4200 }, { "epoch": 0.30886366901307283, "grad_norm": 42.844818115234375, "learning_rate": 9.398220200327228e-06, "loss": 0.193, "step": 4300 }, { "epoch": 0.31604654503663265, "grad_norm": 8.83779525756836, "learning_rate": 9.35831437806776e-06, "loss": 0.1681, "step": 4400 }, { "epoch": 0.3232294210601925, "grad_norm": 3.474717140197754, "learning_rate": 9.318408555808294e-06, "loss": 0.1608, "step": 4500 }, { "epoch": 0.33041229708375236, "grad_norm": 4.746242046356201, "learning_rate": 9.278502733548825e-06, "loss": 0.1552, "step": 4600 }, { "epoch": 0.3375951731073122, "grad_norm": 4.5783915519714355, "learning_rate": 9.238596911289359e-06, "loss": 0.1646, "step": 4700 }, { "epoch": 0.344778049130872, "grad_norm": 9.975970268249512, "learning_rate": 9.198691089029889e-06, "loss": 0.1791, "step": 4800 }, { "epoch": 0.35196092515443184, "grad_norm": 1.2798261642456055, "learning_rate": 9.158785266770423e-06, "loss": 0.1842, "step": 4900 }, { "epoch": 0.35914380117799166, "grad_norm": 6.015448570251465, "learning_rate": 9.118879444510955e-06, "loss": 0.1751, "step": 5000 }, { "epoch": 0.3663266772015515, "grad_norm": 7.062718391418457, "learning_rate": 9.078973622251488e-06, "loss": 0.1751, "step": 5100 }, { "epoch": 0.3735095532251113, "grad_norm": 0.9326837062835693, "learning_rate": 9.03906779999202e-06, "loss": 0.1646, "step": 5200 }, { "epoch": 0.3806924292486712, "grad_norm": 1.695087194442749, "learning_rate": 8.999161977732552e-06, "loss": 0.1577, "step": 5300 }, { "epoch": 0.387875305272231, "grad_norm": 19.320249557495117, "learning_rate": 8.959256155473084e-06, "loss": 0.1729, "step": 5400 }, { "epoch": 0.39505818129579084, "grad_norm": 3.8522274494171143, "learning_rate": 8.919350333213616e-06, "loss": 0.1781, "step": 5500 }, { "epoch": 0.40224105731935067, "grad_norm": 2.3610100746154785, "learning_rate": 8.879444510954149e-06, "loss": 0.1975, "step": 5600 }, { "epoch": 0.4094239333429105, "grad_norm": 6.998334884643555, "learning_rate": 8.839538688694681e-06, "loss": 0.1629, "step": 5700 }, { "epoch": 0.4166068093664703, "grad_norm": 15.58069896697998, "learning_rate": 8.799632866435213e-06, "loss": 0.1764, "step": 5800 }, { "epoch": 0.42378968539003015, "grad_norm": 40.70027160644531, "learning_rate": 8.759727044175747e-06, "loss": 0.1923, "step": 5900 }, { "epoch": 0.43097256141359, "grad_norm": 8.034761428833008, "learning_rate": 8.719821221916278e-06, "loss": 0.1795, "step": 6000 }, { "epoch": 0.43815543743714985, "grad_norm": 1.1275875568389893, "learning_rate": 8.679915399656812e-06, "loss": 0.1477, "step": 6100 }, { "epoch": 0.4453383134607097, "grad_norm": 8.35661792755127, "learning_rate": 8.640009577397342e-06, "loss": 0.1687, "step": 6200 }, { "epoch": 0.4525211894842695, "grad_norm": 19.17778968811035, "learning_rate": 8.600103755137876e-06, "loss": 0.1409, "step": 6300 }, { "epoch": 0.45970406550782933, "grad_norm": 22.45587730407715, "learning_rate": 8.560197932878408e-06, "loss": 0.1676, "step": 6400 }, { "epoch": 0.46688694153138915, "grad_norm": 27.29585075378418, "learning_rate": 8.520292110618939e-06, "loss": 0.1656, "step": 6500 }, { "epoch": 0.474069817554949, "grad_norm": 2.3996095657348633, "learning_rate": 8.480386288359473e-06, "loss": 0.1622, "step": 6600 }, { "epoch": 0.48125269357850886, "grad_norm": 15.577359199523926, "learning_rate": 8.440480466100005e-06, "loss": 0.1748, "step": 6700 }, { "epoch": 0.4884355696020687, "grad_norm": 16.934419631958008, "learning_rate": 8.400574643840537e-06, "loss": 0.1689, "step": 6800 }, { "epoch": 0.4956184456256285, "grad_norm": 2.9546115398406982, "learning_rate": 8.36066882158107e-06, "loss": 0.1695, "step": 6900 }, { "epoch": 0.5028013216491883, "grad_norm": 2.370490550994873, "learning_rate": 8.320762999321602e-06, "loss": 0.1396, "step": 7000 } ], "logging_steps": 100, "max_steps": 27844, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }