{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015873015873015872, "grad_norm": 2.3559653063882835, "learning_rate": 0.0, "loss": 1.0469, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 2.29659253609106, "learning_rate": 3.125e-07, "loss": 0.9929, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 2.8899785663629123, "learning_rate": 6.25e-07, "loss": 1.0189, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 2.2527917765154153, "learning_rate": 9.375000000000001e-07, "loss": 0.9098, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 2.2029116064708907, "learning_rate": 1.25e-06, "loss": 1.0462, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 2.122312207060731, "learning_rate": 1.5625e-06, "loss": 0.9986, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 2.045608321522422, "learning_rate": 1.8750000000000003e-06, "loss": 0.9554, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 1.8619152723479657, "learning_rate": 2.1875000000000002e-06, "loss": 0.9522, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 1.8663709940802706, "learning_rate": 2.5e-06, "loss": 0.9994, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 1.756651050051264, "learning_rate": 2.8125e-06, "loss": 0.9373, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 1.6604489437599113, "learning_rate": 3.125e-06, "loss": 0.8839, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 1.292095856227553, "learning_rate": 3.4375e-06, "loss": 0.9907, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 1.2693344466908103, "learning_rate": 3.7500000000000005e-06, "loss": 0.9322, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 1.6344663181288221, "learning_rate": 4.0625000000000005e-06, "loss": 1.0934, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 1.1511973531225708, "learning_rate": 4.3750000000000005e-06, "loss": 0.9771, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 1.1464903643465947, "learning_rate": 4.6875000000000004e-06, "loss": 1.0442, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 1.1549964376534243, "learning_rate": 5e-06, "loss": 0.949, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 1.2909225755543452, "learning_rate": 5.3125e-06, "loss": 1.0098, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 1.0413894105842352, "learning_rate": 5.625e-06, "loss": 0.7468, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 1.2863204827157997, "learning_rate": 5.9375e-06, "loss": 0.9232, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 0.901486908411037, "learning_rate": 6.25e-06, "loss": 0.9885, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 1.0229681930848715, "learning_rate": 6.5625e-06, "loss": 1.0508, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 1.0772270927236638, "learning_rate": 6.875e-06, "loss": 0.8728, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 0.9393952588410857, "learning_rate": 7.1875e-06, "loss": 0.8349, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 1.0822345499912303, "learning_rate": 7.500000000000001e-06, "loss": 1.0283, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 0.8124841375138875, "learning_rate": 7.8125e-06, "loss": 0.8612, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7868328056966778, "learning_rate": 8.125000000000001e-06, "loss": 0.7232, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8283002452965974, "learning_rate": 8.4375e-06, "loss": 0.9752, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 0.8117454313345658, "learning_rate": 8.750000000000001e-06, "loss": 0.9053, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 0.8594519052279771, "learning_rate": 9.0625e-06, "loss": 0.9479, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 0.9958158956912483, "learning_rate": 9.375000000000001e-06, "loss": 0.9188, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 0.8918575253813723, "learning_rate": 9.6875e-06, "loss": 0.8847, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 0.8125329449215294, "learning_rate": 1e-05, "loss": 1.0379, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 0.7108806956906407, "learning_rate": 9.999691920767945e-06, "loss": 0.8376, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 0.641926463787557, "learning_rate": 9.998767721036901e-06, "loss": 0.8241, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7048636529194373, "learning_rate": 9.997227514697568e-06, "loss": 0.9693, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 0.6041864409794199, "learning_rate": 9.99507149155218e-06, "loss": 0.9839, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.6529381186048961, "learning_rate": 9.992299917291118e-06, "loss": 0.8479, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 0.7457758141141355, "learning_rate": 9.98891313346017e-06, "loss": 0.9095, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 0.6700791615416641, "learning_rate": 9.984911557418444e-06, "loss": 0.7685, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 0.6202447937301818, "learning_rate": 9.980295682286924e-06, "loss": 0.8387, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6888680420644837, "learning_rate": 9.97506607688772e-06, "loss": 0.9107, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 0.5229452850388104, "learning_rate": 9.969223385673958e-06, "loss": 0.8308, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 0.5679326043532053, "learning_rate": 9.962768328650367e-06, "loss": 0.7516, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5234412349262514, "learning_rate": 9.95570170128455e-06, "loss": 0.8443, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 0.5148736685750067, "learning_rate": 9.94802437440896e-06, "loss": 0.7959, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 0.6223703419413371, "learning_rate": 9.939737294113585e-06, "loss": 0.8964, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.4712938980573866, "learning_rate": 9.930841481629358e-06, "loss": 0.8884, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 0.6385581101993485, "learning_rate": 9.92133803320231e-06, "loss": 0.7817, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 0.47528095545287, "learning_rate": 9.91122811995848e-06, "loss": 0.819, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 0.5522186664203698, "learning_rate": 9.90051298775959e-06, "loss": 0.8691, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 0.3924890188917555, "learning_rate": 9.88919395704952e-06, "loss": 0.8259, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 0.5584015479821739, "learning_rate": 9.877272422691583e-06, "loss": 0.9318, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5472693893320031, "learning_rate": 9.864749853796642e-06, "loss": 0.7983, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 0.5011856989250408, "learning_rate": 9.85162779354206e-06, "loss": 0.7289, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.48176520075987733, "learning_rate": 9.837907858981536e-06, "loss": 0.8795, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 0.4693619944653085, "learning_rate": 9.823591740845831e-06, "loss": 0.8625, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 0.5158078748351012, "learning_rate": 9.808681203334416e-06, "loss": 0.7975, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 0.467299048377056, "learning_rate": 9.793178083898073e-06, "loss": 0.878, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 0.4360100853426926, "learning_rate": 9.777084293012448e-06, "loss": 0.842, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 0.4999196363033725, "learning_rate": 9.760401813942641e-06, "loss": 0.7661, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 0.49451715958225617, "learning_rate": 9.743132702498785e-06, "loss": 0.8685, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.51449429417728, "learning_rate": 9.725279086782719e-06, "loss": 0.7676, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.5392465569053122, "learning_rate": 9.706843166925733e-06, "loss": 0.7978, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 0.49426185655546884, "learning_rate": 9.687827214817433e-06, "loss": 0.8264, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.5050909892528982, "learning_rate": 9.668233573825794e-06, "loss": 0.8898, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 0.45134127922296613, "learning_rate": 9.64806465850836e-06, "loss": 0.7317, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.5413266326970981, "learning_rate": 9.62732295431471e-06, "loss": 0.7307, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.4781316290575908, "learning_rate": 9.606011017280166e-06, "loss": 0.8977, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5064140744904799, "learning_rate": 9.5841314737108e-06, "loss": 0.7648, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.5543523877170532, "learning_rate": 9.56168701985981e-06, "loss": 0.7995, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.4891764300467825, "learning_rate": 9.538680421595236e-06, "loss": 0.8072, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 0.48203192054287314, "learning_rate": 9.515114514059127e-06, "loss": 0.8128, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 0.499915788005329, "learning_rate": 9.490992201318165e-06, "loss": 0.7876, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.43129451868532453, "learning_rate": 9.466316456005783e-06, "loss": 0.7755, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.49436944947590167, "learning_rate": 9.441090318955843e-06, "loss": 0.7015, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 0.5018080177691097, "learning_rate": 9.415316898827923e-06, "loss": 0.7346, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 0.42117192002428844, "learning_rate": 9.388999371724212e-06, "loss": 0.8242, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 0.47397540901194374, "learning_rate": 9.362140980798127e-06, "loss": 0.8928, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.48823131897505534, "learning_rate": 9.334745035854646e-06, "loss": 0.7581, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5170460810325518, "learning_rate": 9.306814912942445e-06, "loss": 0.8361, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 0.41118521047488926, "learning_rate": 9.278354053937848e-06, "loss": 0.7794, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 0.4827654705693697, "learning_rate": 9.249365966120692e-06, "loss": 0.8542, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.45176873751511454, "learning_rate": 9.219854221742106e-06, "loss": 0.8101, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 0.44526540495239475, "learning_rate": 9.189822457584311e-06, "loss": 0.7419, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.41133066066087726, "learning_rate": 9.159274374512444e-06, "loss": 0.6576, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 0.4500027229237173, "learning_rate": 9.128213737018493e-06, "loss": 0.8058, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.40834920107678924, "learning_rate": 9.096644372757393e-06, "loss": 0.8849, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 0.5843795042717066, "learning_rate": 9.064570172075349e-06, "loss": 0.7969, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5139681695756663, "learning_rate": 9.031995087530403e-06, "loss": 0.7983, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.47799160571848326, "learning_rate": 8.99892313340537e-06, "loss": 0.6612, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 0.48090290795792257, "learning_rate": 8.96535838521314e-06, "loss": 0.8026, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 0.48955363216016506, "learning_rate": 8.931304979194452e-06, "loss": 0.8051, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 0.47949685756309185, "learning_rate": 8.896767111808177e-06, "loss": 0.7354, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.5732670061875946, "learning_rate": 8.861749039214177e-06, "loss": 0.9129, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.48050508555262206, "learning_rate": 8.826255076748823e-06, "loss": 0.8445, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.4329532952395629, "learning_rate": 8.790289598393186e-06, "loss": 0.7212, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.522751486773223, "learning_rate": 8.753857036234055e-06, "loss": 0.8149, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 0.4570961856172299, "learning_rate": 8.716961879917734e-06, "loss": 0.7365, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.4363179134183329, "learning_rate": 8.679608676096793e-06, "loss": 0.8131, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.4655541415571893, "learning_rate": 8.641802027869774e-06, "loss": 0.7946, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.5743139418639736, "learning_rate": 8.603546594213935e-06, "loss": 0.8574, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.5267570867681096, "learning_rate": 8.564847089411128e-06, "loss": 0.8286, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 0.40799736834923667, "learning_rate": 8.525708282466839e-06, "loss": 0.8412, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4236881481332967, "learning_rate": 8.486134996522502e-06, "loss": 0.8172, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.5593679767726464, "learning_rate": 8.446132108261136e-06, "loss": 0.8058, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 0.5031166228419733, "learning_rate": 8.405704547306379e-06, "loss": 0.8031, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.45322610730579044, "learning_rate": 8.364857295615006e-06, "loss": 0.8903, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.5335556769284883, "learning_rate": 8.323595386862985e-06, "loss": 0.7925, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.4699718024263939, "learning_rate": 8.281923905825188e-06, "loss": 0.7664, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 0.47207237316096745, "learning_rate": 8.23984798774876e-06, "loss": 0.7347, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.4532509556288616, "learning_rate": 8.197372817720314e-06, "loss": 0.7369, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.5443221798521994, "learning_rate": 8.154503630026955e-06, "loss": 0.7261, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.4456098920838456, "learning_rate": 8.111245707511253e-06, "loss": 0.7194, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.4159654938486175, "learning_rate": 8.067604380920228e-06, "loss": 0.7945, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.4706342532274064, "learning_rate": 8.023585028248435e-06, "loss": 0.8487, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 0.5701232470412769, "learning_rate": 7.979193074075216e-06, "loss": 0.8887, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.530430629054239, "learning_rate": 7.934433988896233e-06, "loss": 0.6534, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.48414840419963984, "learning_rate": 7.889313288449323e-06, "loss": 0.8214, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 0.4200926363513126, "learning_rate": 7.843836533034784e-06, "loss": 0.7614, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.4941849127950555, "learning_rate": 7.798009326830167e-06, "loss": 0.7996, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 0.41647477043231534, "learning_rate": 7.751837317199673e-06, "loss": 0.867, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.4462896414872465, "learning_rate": 7.705326193998207e-06, "loss": 0.7547, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.46366747032871125, "learning_rate": 7.658481688870218e-06, "loss": 0.7582, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.4714130206121814, "learning_rate": 7.611309574543373e-06, "loss": 0.7606, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.46690869317456135, "learning_rate": 7.563815664117173e-06, "loss": 0.9121, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.8060769356732992, "learning_rate": 7.5160058103465985e-06, "loss": 0.7122, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.5953210710991091, "learning_rate": 7.467885904920864e-06, "loss": 0.7567, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.7560871980312371, "learning_rate": 7.419461877737373e-06, "loss": 0.8318, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 2.1011598702400667, "learning_rate": 7.370739696170971e-06, "loss": 0.7428, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.940827314326734, "learning_rate": 7.321725364338566e-06, "loss": 0.6161, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.5381465457966281, "learning_rate": 7.272424922359246e-06, "loss": 0.6432, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.7212158226191104, "learning_rate": 7.222844445609931e-06, "loss": 0.7817, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.6031927565028607, "learning_rate": 7.172990043976703e-06, "loss": 0.7291, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.5554913039306149, "learning_rate": 7.122867861101868e-06, "loss": 0.7928, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.4440614086169425, "learning_rate": 7.072484073626872e-06, "loss": 0.6864, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 2.875286251781212, "learning_rate": 7.021844890431136e-06, "loss": 0.7627, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 0.7248206082063566, "learning_rate": 6.970956551866925e-06, "loss": 0.728, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.5344769870855947, "learning_rate": 6.9198253289903515e-06, "loss": 0.6621, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5334409779130068, "learning_rate": 6.868457522788561e-06, "loss": 0.7351, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.4791675678917909, "learning_rate": 6.816859463403271e-06, "loss": 0.6568, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.5667962259074942, "learning_rate": 6.765037509350685e-06, "loss": 0.758, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.523154654898243, "learning_rate": 6.7129980467379265e-06, "loss": 0.6657, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.45239550513741295, "learning_rate": 6.660747488476066e-06, "loss": 0.6615, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.6580127713752147, "learning_rate": 6.608292273489851e-06, "loss": 0.6112, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 0.6033248382665617, "learning_rate": 6.555638865924221e-06, "loss": 0.7033, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5578461067365529, "learning_rate": 6.502793754347721e-06, "loss": 0.7578, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.5451762654132818, "learning_rate": 6.449763450952912e-06, "loss": 0.6863, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 0.4667833185680937, "learning_rate": 6.396554490753848e-06, "loss": 0.6825, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.4209933154088852, "learning_rate": 6.343173430780769e-06, "loss": 0.836, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.46876037251704294, "learning_rate": 6.289626849272062e-06, "loss": 0.7981, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.47367833829704725, "learning_rate": 6.2359213448636104e-06, "loss": 0.751, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.43459439089398605, "learning_rate": 6.182063535775634e-06, "loss": 0.7654, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.39767398947957067, "learning_rate": 6.1280600589971225e-06, "loss": 0.7896, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.4231324131775063, "learning_rate": 6.073917569467934e-06, "loss": 0.8051, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.3983830637612639, "learning_rate": 6.0196427392587085e-06, "loss": 0.7038, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 0.4585701856768339, "learning_rate": 5.96524225674865e-06, "loss": 0.7422, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.4299692751487169, "learning_rate": 5.9107228258013085e-06, "loss": 0.7053, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.42827944956580943, "learning_rate": 5.856091164938451e-06, "loss": 0.6523, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.4098750307712162, "learning_rate": 5.801354006512127e-06, "loss": 0.6895, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.45624383692077836, "learning_rate": 5.746518095875033e-06, "loss": 0.6973, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.40961695420487504, "learning_rate": 5.6915901905492586e-06, "loss": 0.6285, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.5946218628280344, "learning_rate": 5.6365770593935665e-06, "loss": 0.5907, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.5401440035651196, "learning_rate": 5.581485481769231e-06, "loss": 0.7181, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.4378876946579892, "learning_rate": 5.526322246704628e-06, "loss": 0.7978, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.40853074862176036, "learning_rate": 5.471094152058592e-06, "loss": 0.681, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.46389651051528763, "learning_rate": 5.415808003682717e-06, "loss": 0.7308, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.42992484133372394, "learning_rate": 5.360470614582661e-06, "loss": 0.7136, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.557870344379466, "learning_rate": 5.305088804078559e-06, "loss": 0.7333, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.45453618737081114, "learning_rate": 5.249669396964665e-06, "loss": 0.6349, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.4511080452383348, "learning_rate": 5.1942192226683385e-06, "loss": 0.776, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.4900595891663431, "learning_rate": 5.138745114408427e-06, "loss": 0.5998, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 0.46419645481002475, "learning_rate": 5.083253908353193e-06, "loss": 0.6676, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.4905121964865482, "learning_rate": 5.0277524427778986e-06, "loss": 0.7831, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.4525848803424086, "learning_rate": 4.972247557222102e-06, "loss": 0.7164, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.5368330661361714, "learning_rate": 4.916746091646808e-06, "loss": 0.6805, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.4224136348005534, "learning_rate": 4.8612548855915755e-06, "loss": 0.724, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.4869146817578471, "learning_rate": 4.805780777331662e-06, "loss": 0.7446, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.4511553539717499, "learning_rate": 4.750330603035336e-06, "loss": 0.7124, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.45286461188582156, "learning_rate": 4.694911195921443e-06, "loss": 0.7252, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.4293856541441545, "learning_rate": 4.6395293854173395e-06, "loss": 0.6053, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.5836445711298119, "learning_rate": 4.584191996317285e-06, "loss": 0.6828, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.4710990644177235, "learning_rate": 4.528905847941411e-06, "loss": 0.8414, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.4770727006845428, "learning_rate": 4.473677753295375e-06, "loss": 0.6592, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 0.4087196329651188, "learning_rate": 4.418514518230769e-06, "loss": 0.7122, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.48530953017287554, "learning_rate": 4.363422940606435e-06, "loss": 0.7454, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.4900655757859956, "learning_rate": 4.308409809450742e-06, "loss": 0.7621, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.49992358594135816, "learning_rate": 4.253481904124968e-06, "loss": 0.7331, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.4385555753140365, "learning_rate": 4.198645993487872e-06, "loss": 0.604, "step": 189 } ], "logging_steps": 1, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 47138450767872.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }