{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015873015873015872, "grad_norm": 2.3559653063882835, "learning_rate": 0.0, "loss": 1.0469, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 2.29659253609106, "learning_rate": 3.125e-07, "loss": 0.9929, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 2.8899785663629123, "learning_rate": 6.25e-07, "loss": 1.0189, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 2.2527917765154153, "learning_rate": 9.375000000000001e-07, "loss": 0.9098, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 2.2029116064708907, "learning_rate": 1.25e-06, "loss": 1.0462, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 2.122312207060731, "learning_rate": 1.5625e-06, "loss": 0.9986, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 2.045608321522422, "learning_rate": 1.8750000000000003e-06, "loss": 0.9554, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 1.8619152723479657, "learning_rate": 2.1875000000000002e-06, "loss": 0.9522, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 1.8663709940802706, "learning_rate": 2.5e-06, "loss": 0.9994, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 1.756651050051264, "learning_rate": 2.8125e-06, "loss": 0.9373, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 1.6604489437599113, "learning_rate": 3.125e-06, "loss": 0.8839, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 1.292095856227553, "learning_rate": 3.4375e-06, "loss": 0.9907, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 1.2693344466908103, "learning_rate": 3.7500000000000005e-06, "loss": 0.9322, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 1.6344663181288221, "learning_rate": 4.0625000000000005e-06, "loss": 1.0934, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 1.1511973531225708, "learning_rate": 4.3750000000000005e-06, "loss": 0.9771, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 1.1464903643465947, "learning_rate": 4.6875000000000004e-06, "loss": 1.0442, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 1.1549964376534243, "learning_rate": 5e-06, "loss": 0.949, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 1.2909225755543452, "learning_rate": 5.3125e-06, "loss": 1.0098, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 1.0413894105842352, "learning_rate": 5.625e-06, "loss": 0.7468, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 1.2863204827157997, "learning_rate": 5.9375e-06, "loss": 0.9232, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 0.901486908411037, "learning_rate": 6.25e-06, "loss": 0.9885, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 1.0229681930848715, "learning_rate": 6.5625e-06, "loss": 1.0508, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 1.0772270927236638, "learning_rate": 6.875e-06, "loss": 0.8728, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 0.9393952588410857, "learning_rate": 7.1875e-06, "loss": 0.8349, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 1.0822345499912303, "learning_rate": 7.500000000000001e-06, "loss": 1.0283, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 0.8124841375138875, "learning_rate": 7.8125e-06, "loss": 0.8612, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7868328056966778, "learning_rate": 8.125000000000001e-06, "loss": 0.7232, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8283002452965974, "learning_rate": 8.4375e-06, "loss": 0.9752, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 0.8117454313345658, "learning_rate": 8.750000000000001e-06, "loss": 0.9053, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 0.8594519052279771, "learning_rate": 9.0625e-06, "loss": 0.9479, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 0.9958158956912483, "learning_rate": 9.375000000000001e-06, "loss": 0.9188, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 0.8918575253813723, "learning_rate": 9.6875e-06, "loss": 0.8847, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 0.8125329449215294, "learning_rate": 1e-05, "loss": 1.0379, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 0.7108806956906407, "learning_rate": 9.999691920767945e-06, "loss": 0.8376, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 0.641926463787557, "learning_rate": 9.998767721036901e-06, "loss": 0.8241, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7048636529194373, "learning_rate": 9.997227514697568e-06, "loss": 0.9693, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 0.6041864409794199, "learning_rate": 9.99507149155218e-06, "loss": 0.9839, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.6529381186048961, "learning_rate": 9.992299917291118e-06, "loss": 0.8479, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 0.7457758141141355, "learning_rate": 9.98891313346017e-06, "loss": 0.9095, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 0.6700791615416641, "learning_rate": 9.984911557418444e-06, "loss": 0.7685, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 0.6202447937301818, "learning_rate": 9.980295682286924e-06, "loss": 0.8387, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6888680420644837, "learning_rate": 9.97506607688772e-06, "loss": 0.9107, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 0.5229452850388104, "learning_rate": 9.969223385673958e-06, "loss": 0.8308, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 0.5679326043532053, "learning_rate": 9.962768328650367e-06, "loss": 0.7516, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5234412349262514, "learning_rate": 9.95570170128455e-06, "loss": 0.8443, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 0.5148736685750067, "learning_rate": 9.94802437440896e-06, "loss": 0.7959, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 0.6223703419413371, "learning_rate": 9.939737294113585e-06, "loss": 0.8964, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.4712938980573866, "learning_rate": 9.930841481629358e-06, "loss": 0.8884, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 0.6385581101993485, "learning_rate": 9.92133803320231e-06, "loss": 0.7817, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 0.47528095545287, "learning_rate": 9.91122811995848e-06, "loss": 0.819, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 0.5522186664203698, "learning_rate": 9.90051298775959e-06, "loss": 0.8691, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 0.3924890188917555, "learning_rate": 9.88919395704952e-06, "loss": 0.8259, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 0.5584015479821739, "learning_rate": 9.877272422691583e-06, "loss": 0.9318, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5472693893320031, "learning_rate": 9.864749853796642e-06, "loss": 0.7983, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 0.5011856989250408, "learning_rate": 9.85162779354206e-06, "loss": 0.7289, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.48176520075987733, "learning_rate": 9.837907858981536e-06, "loss": 0.8795, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 0.4693619944653085, "learning_rate": 9.823591740845831e-06, "loss": 0.8625, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 0.5158078748351012, "learning_rate": 9.808681203334416e-06, "loss": 0.7975, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 0.467299048377056, "learning_rate": 9.793178083898073e-06, "loss": 0.878, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 0.4360100853426926, "learning_rate": 9.777084293012448e-06, "loss": 0.842, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 0.4999196363033725, "learning_rate": 9.760401813942641e-06, "loss": 0.7661, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 0.49451715958225617, "learning_rate": 9.743132702498785e-06, "loss": 0.8685, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.51449429417728, "learning_rate": 9.725279086782719e-06, "loss": 0.7676, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.5392465569053122, "learning_rate": 9.706843166925733e-06, "loss": 0.7978, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 0.49426185655546884, "learning_rate": 9.687827214817433e-06, "loss": 0.8264, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.5050909892528982, "learning_rate": 9.668233573825794e-06, "loss": 0.8898, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 0.45134127922296613, "learning_rate": 9.64806465850836e-06, "loss": 0.7317, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.5413266326970981, "learning_rate": 9.62732295431471e-06, "loss": 0.7307, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.4781316290575908, "learning_rate": 9.606011017280166e-06, "loss": 0.8977, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5064140744904799, "learning_rate": 9.5841314737108e-06, "loss": 0.7648, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.5543523877170532, "learning_rate": 9.56168701985981e-06, "loss": 0.7995, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.4891764300467825, "learning_rate": 9.538680421595236e-06, "loss": 0.8072, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 0.48203192054287314, "learning_rate": 9.515114514059127e-06, "loss": 0.8128, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 0.499915788005329, "learning_rate": 9.490992201318165e-06, "loss": 0.7876, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.43129451868532453, "learning_rate": 9.466316456005783e-06, "loss": 0.7755, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.49436944947590167, "learning_rate": 9.441090318955843e-06, "loss": 0.7015, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 0.5018080177691097, "learning_rate": 9.415316898827923e-06, "loss": 0.7346, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 0.42117192002428844, "learning_rate": 9.388999371724212e-06, "loss": 0.8242, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 0.47397540901194374, "learning_rate": 9.362140980798127e-06, "loss": 0.8928, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.48823131897505534, "learning_rate": 9.334745035854646e-06, "loss": 0.7581, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5170460810325518, "learning_rate": 9.306814912942445e-06, "loss": 0.8361, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 0.41118521047488926, "learning_rate": 9.278354053937848e-06, "loss": 0.7794, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 0.4827654705693697, "learning_rate": 9.249365966120692e-06, "loss": 0.8542, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.45176873751511454, "learning_rate": 9.219854221742106e-06, "loss": 0.8101, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 0.44526540495239475, "learning_rate": 9.189822457584311e-06, "loss": 0.7419, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.41133066066087726, "learning_rate": 9.159274374512444e-06, "loss": 0.6576, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 0.4500027229237173, "learning_rate": 9.128213737018493e-06, "loss": 0.8058, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.40834920107678924, "learning_rate": 9.096644372757393e-06, "loss": 0.8849, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 0.5843795042717066, "learning_rate": 9.064570172075349e-06, "loss": 0.7969, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5139681695756663, "learning_rate": 9.031995087530403e-06, "loss": 0.7983, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.47799160571848326, "learning_rate": 8.99892313340537e-06, "loss": 0.6612, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 0.48090290795792257, "learning_rate": 8.96535838521314e-06, "loss": 0.8026, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 0.48955363216016506, "learning_rate": 8.931304979194452e-06, "loss": 0.8051, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 0.47949685756309185, "learning_rate": 8.896767111808177e-06, "loss": 0.7354, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.5732670061875946, "learning_rate": 8.861749039214177e-06, "loss": 0.9129, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.48050508555262206, "learning_rate": 8.826255076748823e-06, "loss": 0.8445, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.4329532952395629, "learning_rate": 8.790289598393186e-06, "loss": 0.7212, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.522751486773223, "learning_rate": 8.753857036234055e-06, "loss": 0.8149, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 0.4570961856172299, "learning_rate": 8.716961879917734e-06, "loss": 0.7365, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.4363179134183329, "learning_rate": 8.679608676096793e-06, "loss": 0.8131, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.4655541415571893, "learning_rate": 8.641802027869774e-06, "loss": 0.7946, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.5743139418639736, "learning_rate": 8.603546594213935e-06, "loss": 0.8574, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.5267570867681096, "learning_rate": 8.564847089411128e-06, "loss": 0.8286, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 0.40799736834923667, "learning_rate": 8.525708282466839e-06, "loss": 0.8412, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4236881481332967, "learning_rate": 8.486134996522502e-06, "loss": 0.8172, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.5593679767726464, "learning_rate": 8.446132108261136e-06, "loss": 0.8058, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 0.5031166228419733, "learning_rate": 8.405704547306379e-06, "loss": 0.8031, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.45322610730579044, "learning_rate": 8.364857295615006e-06, "loss": 0.8903, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.5335556769284883, "learning_rate": 8.323595386862985e-06, "loss": 0.7925, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.4699718024263939, "learning_rate": 8.281923905825188e-06, "loss": 0.7664, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 0.47207237316096745, "learning_rate": 8.23984798774876e-06, "loss": 0.7347, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.4532509556288616, "learning_rate": 8.197372817720314e-06, "loss": 0.7369, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.5443221798521994, "learning_rate": 8.154503630026955e-06, "loss": 0.7261, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.4456098920838456, "learning_rate": 8.111245707511253e-06, "loss": 0.7194, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.4159654938486175, "learning_rate": 8.067604380920228e-06, "loss": 0.7945, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.4706342532274064, "learning_rate": 8.023585028248435e-06, "loss": 0.8487, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 0.5701232470412769, "learning_rate": 7.979193074075216e-06, "loss": 0.8887, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.530430629054239, "learning_rate": 7.934433988896233e-06, "loss": 0.6534, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.48414840419963984, "learning_rate": 7.889313288449323e-06, "loss": 0.8214, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 0.4200926363513126, "learning_rate": 7.843836533034784e-06, "loss": 0.7614, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.4941849127950555, "learning_rate": 7.798009326830167e-06, "loss": 0.7996, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 0.41647477043231534, "learning_rate": 7.751837317199673e-06, "loss": 0.867, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.4462896414872465, "learning_rate": 7.705326193998207e-06, "loss": 0.7547, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.46366747032871125, "learning_rate": 7.658481688870218e-06, "loss": 0.7582, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.4714130206121814, "learning_rate": 7.611309574543373e-06, "loss": 0.7606, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.46690869317456135, "learning_rate": 7.563815664117173e-06, "loss": 0.9121, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.8060769356732992, "learning_rate": 7.5160058103465985e-06, "loss": 0.7122, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.5953210710991091, "learning_rate": 7.467885904920864e-06, "loss": 0.7567, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.7560871980312371, "learning_rate": 7.419461877737373e-06, "loss": 0.8318, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 2.1011598702400667, "learning_rate": 7.370739696170971e-06, "loss": 0.7428, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.940827314326734, "learning_rate": 7.321725364338566e-06, "loss": 0.6161, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.5381465457966281, "learning_rate": 7.272424922359246e-06, "loss": 0.6432, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.7212158226191104, "learning_rate": 7.222844445609931e-06, "loss": 0.7817, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.6031927565028607, "learning_rate": 7.172990043976703e-06, "loss": 0.7291, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.5554913039306149, "learning_rate": 7.122867861101868e-06, "loss": 0.7928, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.4440614086169425, "learning_rate": 7.072484073626872e-06, "loss": 0.6864, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 2.875286251781212, "learning_rate": 7.021844890431136e-06, "loss": 0.7627, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 0.7248206082063566, "learning_rate": 6.970956551866925e-06, "loss": 0.728, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.5344769870855947, "learning_rate": 6.9198253289903515e-06, "loss": 0.6621, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5334409779130068, "learning_rate": 6.868457522788561e-06, "loss": 0.7351, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.4791675678917909, "learning_rate": 6.816859463403271e-06, "loss": 0.6568, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.5667962259074942, "learning_rate": 6.765037509350685e-06, "loss": 0.758, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.523154654898243, "learning_rate": 6.7129980467379265e-06, "loss": 0.6657, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.45239550513741295, "learning_rate": 6.660747488476066e-06, "loss": 0.6615, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.6580127713752147, "learning_rate": 6.608292273489851e-06, "loss": 0.6112, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 0.6033248382665617, "learning_rate": 6.555638865924221e-06, "loss": 0.7033, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5578461067365529, "learning_rate": 6.502793754347721e-06, "loss": 0.7578, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.5451762654132818, "learning_rate": 6.449763450952912e-06, "loss": 0.6863, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 0.4667833185680937, "learning_rate": 6.396554490753848e-06, "loss": 0.6825, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.4209933154088852, "learning_rate": 6.343173430780769e-06, "loss": 0.836, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.46876037251704294, "learning_rate": 6.289626849272062e-06, "loss": 0.7981, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.47367833829704725, "learning_rate": 6.2359213448636104e-06, "loss": 0.751, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.43459439089398605, "learning_rate": 6.182063535775634e-06, "loss": 0.7654, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.39767398947957067, "learning_rate": 6.1280600589971225e-06, "loss": 0.7896, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.4231324131775063, "learning_rate": 6.073917569467934e-06, "loss": 0.8051, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.3983830637612639, "learning_rate": 6.0196427392587085e-06, "loss": 0.7038, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 0.4585701856768339, "learning_rate": 5.96524225674865e-06, "loss": 0.7422, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.4299692751487169, "learning_rate": 5.9107228258013085e-06, "loss": 0.7053, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.42827944956580943, "learning_rate": 5.856091164938451e-06, "loss": 0.6523, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.4098750307712162, "learning_rate": 5.801354006512127e-06, "loss": 0.6895, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.45624383692077836, "learning_rate": 5.746518095875033e-06, "loss": 0.6973, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.40961695420487504, "learning_rate": 5.6915901905492586e-06, "loss": 0.6285, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.5946218628280344, "learning_rate": 5.6365770593935665e-06, "loss": 0.5907, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.5401440035651196, "learning_rate": 5.581485481769231e-06, "loss": 0.7181, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.4378876946579892, "learning_rate": 5.526322246704628e-06, "loss": 0.7978, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.40853074862176036, "learning_rate": 5.471094152058592e-06, "loss": 0.681, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.46389651051528763, "learning_rate": 5.415808003682717e-06, "loss": 0.7308, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.42992484133372394, "learning_rate": 5.360470614582661e-06, "loss": 0.7136, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.557870344379466, "learning_rate": 5.305088804078559e-06, "loss": 0.7333, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.45453618737081114, "learning_rate": 5.249669396964665e-06, "loss": 0.6349, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.4511080452383348, "learning_rate": 5.1942192226683385e-06, "loss": 0.776, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.4900595891663431, "learning_rate": 5.138745114408427e-06, "loss": 0.5998, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 0.46419645481002475, "learning_rate": 5.083253908353193e-06, "loss": 0.6676, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.4905121964865482, "learning_rate": 5.0277524427778986e-06, "loss": 0.7831, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.4525848803424086, "learning_rate": 4.972247557222102e-06, "loss": 0.7164, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.5368330661361714, "learning_rate": 4.916746091646808e-06, "loss": 0.6805, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.4224136348005534, "learning_rate": 4.8612548855915755e-06, "loss": 0.724, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.4869146817578471, "learning_rate": 4.805780777331662e-06, "loss": 0.7446, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.4511553539717499, "learning_rate": 4.750330603035336e-06, "loss": 0.7124, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.45286461188582156, "learning_rate": 4.694911195921443e-06, "loss": 0.7252, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.4293856541441545, "learning_rate": 4.6395293854173395e-06, "loss": 0.6053, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.5836445711298119, "learning_rate": 4.584191996317285e-06, "loss": 0.6828, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.4710990644177235, "learning_rate": 4.528905847941411e-06, "loss": 0.8414, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.4770727006845428, "learning_rate": 4.473677753295375e-06, "loss": 0.6592, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 0.4087196329651188, "learning_rate": 4.418514518230769e-06, "loss": 0.7122, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.48530953017287554, "learning_rate": 4.363422940606435e-06, "loss": 0.7454, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.4900655757859956, "learning_rate": 4.308409809450742e-06, "loss": 0.7621, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.49992358594135816, "learning_rate": 4.253481904124968e-06, "loss": 0.7331, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.4385555753140365, "learning_rate": 4.198645993487872e-06, "loss": 0.604, "step": 189 }, { "epoch": 3.015873015873016, "grad_norm": 0.8387072939566776, "learning_rate": 4.143908835061551e-06, "loss": 0.6844, "step": 190 }, { "epoch": 3.0317460317460316, "grad_norm": 0.5067553850596358, "learning_rate": 4.089277174198694e-06, "loss": 0.7, "step": 191 }, { "epoch": 3.0476190476190474, "grad_norm": 0.8011190701762613, "learning_rate": 4.0347577432513515e-06, "loss": 0.743, "step": 192 }, { "epoch": 3.0634920634920633, "grad_norm": 0.5036174850503073, "learning_rate": 3.980357260741293e-06, "loss": 0.6811, "step": 193 }, { "epoch": 3.0793650793650795, "grad_norm": 0.5463958175244662, "learning_rate": 3.926082430532067e-06, "loss": 0.6461, "step": 194 }, { "epoch": 3.0952380952380953, "grad_norm": 0.6797072746043096, "learning_rate": 3.87193994100288e-06, "loss": 0.6076, "step": 195 }, { "epoch": 3.111111111111111, "grad_norm": 0.6595821505262829, "learning_rate": 3.817936464224367e-06, "loss": 0.6729, "step": 196 }, { "epoch": 3.126984126984127, "grad_norm": 0.6303031378119635, "learning_rate": 3.764078655136391e-06, "loss": 0.7462, "step": 197 }, { "epoch": 3.142857142857143, "grad_norm": 0.5867535234820467, "learning_rate": 3.7103731507279383e-06, "loss": 0.6994, "step": 198 }, { "epoch": 3.1587301587301586, "grad_norm": 0.744206344239702, "learning_rate": 3.656826569219233e-06, "loss": 0.6698, "step": 199 }, { "epoch": 3.1746031746031744, "grad_norm": 0.48008260540189834, "learning_rate": 3.603445509246154e-06, "loss": 0.641, "step": 200 }, { "epoch": 3.1904761904761907, "grad_norm": 0.4479357669097382, "learning_rate": 3.55023654904709e-06, "loss": 0.7151, "step": 201 }, { "epoch": 3.2063492063492065, "grad_norm": 0.4992070753022617, "learning_rate": 3.49720624565228e-06, "loss": 0.5512, "step": 202 }, { "epoch": 3.2222222222222223, "grad_norm": 0.48763610007436026, "learning_rate": 3.44436113407578e-06, "loss": 0.6851, "step": 203 }, { "epoch": 3.238095238095238, "grad_norm": 0.5828989750562972, "learning_rate": 3.3917077265101505e-06, "loss": 0.7484, "step": 204 }, { "epoch": 3.253968253968254, "grad_norm": 0.5181196242222144, "learning_rate": 3.3392525115239353e-06, "loss": 0.5734, "step": 205 }, { "epoch": 3.2698412698412698, "grad_norm": 0.47471736105285633, "learning_rate": 3.2870019532620744e-06, "loss": 0.7111, "step": 206 }, { "epoch": 3.2857142857142856, "grad_norm": 0.44233225614290655, "learning_rate": 3.2349624906493164e-06, "loss": 0.6547, "step": 207 }, { "epoch": 3.3015873015873014, "grad_norm": 0.549412981718262, "learning_rate": 3.1831405365967315e-06, "loss": 0.5268, "step": 208 }, { "epoch": 3.317460317460317, "grad_norm": 0.4610400286542981, "learning_rate": 3.1315424772114404e-06, "loss": 0.7138, "step": 209 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5912978101838896, "learning_rate": 3.0801746710096497e-06, "loss": 0.5523, "step": 210 }, { "epoch": 3.3492063492063493, "grad_norm": 0.5365898687480075, "learning_rate": 3.0290434481330746e-06, "loss": 0.586, "step": 211 }, { "epoch": 3.365079365079365, "grad_norm": 0.46259460742378206, "learning_rate": 2.978155109568864e-06, "loss": 0.6185, "step": 212 }, { "epoch": 3.380952380952381, "grad_norm": 0.5035201915907002, "learning_rate": 2.927515926373129e-06, "loss": 0.6636, "step": 213 }, { "epoch": 3.3968253968253967, "grad_norm": 0.45266495724596634, "learning_rate": 2.8771321388981334e-06, "loss": 0.6636, "step": 214 }, { "epoch": 3.4126984126984126, "grad_norm": 0.9391085088935888, "learning_rate": 2.8270099560232992e-06, "loss": 0.7087, "step": 215 }, { "epoch": 3.4285714285714284, "grad_norm": 0.4858344733348455, "learning_rate": 2.77715555439007e-06, "loss": 0.7505, "step": 216 }, { "epoch": 3.4444444444444446, "grad_norm": 0.43967486893271346, "learning_rate": 2.7275750776407568e-06, "loss": 0.6028, "step": 217 }, { "epoch": 3.4603174603174605, "grad_norm": 0.4717225483417324, "learning_rate": 2.6782746356614364e-06, "loss": 0.5843, "step": 218 }, { "epoch": 3.4761904761904763, "grad_norm": 0.5642208112077767, "learning_rate": 2.6292603038290306e-06, "loss": 0.7137, "step": 219 }, { "epoch": 3.492063492063492, "grad_norm": 0.49853539661605933, "learning_rate": 2.580538122262627e-06, "loss": 0.6527, "step": 220 }, { "epoch": 3.507936507936508, "grad_norm": 0.5784669157284802, "learning_rate": 2.532114095079137e-06, "loss": 0.5754, "step": 221 }, { "epoch": 3.5238095238095237, "grad_norm": 0.451433112279235, "learning_rate": 2.4839941896534027e-06, "loss": 0.6267, "step": 222 }, { "epoch": 3.5396825396825395, "grad_norm": 0.43076481074733924, "learning_rate": 2.4361843358828287e-06, "loss": 0.5873, "step": 223 }, { "epoch": 3.5555555555555554, "grad_norm": 0.5555243876307536, "learning_rate": 2.388690425456629e-06, "loss": 0.7168, "step": 224 }, { "epoch": 3.571428571428571, "grad_norm": 0.5140731956812183, "learning_rate": 2.341518311129781e-06, "loss": 0.5671, "step": 225 }, { "epoch": 3.5873015873015874, "grad_norm": 0.4829910245643812, "learning_rate": 2.2946738060017947e-06, "loss": 0.708, "step": 226 }, { "epoch": 3.6031746031746033, "grad_norm": 0.4822066553569954, "learning_rate": 2.24816268280033e-06, "loss": 0.653, "step": 227 }, { "epoch": 3.619047619047619, "grad_norm": 0.47281479364796025, "learning_rate": 2.2019906731698337e-06, "loss": 0.6494, "step": 228 }, { "epoch": 3.634920634920635, "grad_norm": 0.40568931340881786, "learning_rate": 2.156163466965218e-06, "loss": 0.6293, "step": 229 }, { "epoch": 3.6507936507936507, "grad_norm": 0.8300833772536724, "learning_rate": 2.110686711550678e-06, "loss": 0.7077, "step": 230 }, { "epoch": 3.6666666666666665, "grad_norm": 0.4061549756977283, "learning_rate": 2.0655660111037685e-06, "loss": 0.6339, "step": 231 }, { "epoch": 3.682539682539683, "grad_norm": 0.4255055033708699, "learning_rate": 2.0208069259247866e-06, "loss": 0.6577, "step": 232 }, { "epoch": 3.6984126984126986, "grad_norm": 0.43647553702799224, "learning_rate": 1.976414971751568e-06, "loss": 0.6029, "step": 233 }, { "epoch": 3.7142857142857144, "grad_norm": 0.4631401287763674, "learning_rate": 1.932395619079771e-06, "loss": 0.6777, "step": 234 }, { "epoch": 3.7301587301587302, "grad_norm": 0.4306497798982157, "learning_rate": 1.8887542924887486e-06, "loss": 0.6996, "step": 235 }, { "epoch": 3.746031746031746, "grad_norm": 0.42608290640248714, "learning_rate": 1.8454963699730471e-06, "loss": 0.6727, "step": 236 }, { "epoch": 3.761904761904762, "grad_norm": 0.43950627489492866, "learning_rate": 1.802627182279687e-06, "loss": 0.5927, "step": 237 }, { "epoch": 3.7777777777777777, "grad_norm": 0.8150329270991283, "learning_rate": 1.760152012251241e-06, "loss": 0.5039, "step": 238 }, { "epoch": 3.7936507936507935, "grad_norm": 0.5428442791120296, "learning_rate": 1.7180760941748132e-06, "loss": 0.6682, "step": 239 }, { "epoch": 3.8095238095238093, "grad_norm": 0.7967588807434276, "learning_rate": 1.6764046131370142e-06, "loss": 0.7302, "step": 240 }, { "epoch": 3.825396825396825, "grad_norm": 0.4533126401638171, "learning_rate": 1.6351427043849955e-06, "loss": 0.6953, "step": 241 }, { "epoch": 3.8412698412698414, "grad_norm": 0.5989253712710739, "learning_rate": 1.5942954526936217e-06, "loss": 0.654, "step": 242 }, { "epoch": 3.857142857142857, "grad_norm": 0.5693019253502395, "learning_rate": 1.5538678917388638e-06, "loss": 0.6618, "step": 243 }, { "epoch": 3.873015873015873, "grad_norm": 0.47402524421174175, "learning_rate": 1.5138650034775004e-06, "loss": 0.5709, "step": 244 }, { "epoch": 3.888888888888889, "grad_norm": 0.4074162350507339, "learning_rate": 1.4742917175331644e-06, "loss": 0.7154, "step": 245 }, { "epoch": 3.9047619047619047, "grad_norm": 0.4421839212167129, "learning_rate": 1.4351529105888735e-06, "loss": 0.7655, "step": 246 }, { "epoch": 3.9206349206349205, "grad_norm": 0.45643272842877347, "learning_rate": 1.3964534057860652e-06, "loss": 0.5952, "step": 247 }, { "epoch": 3.9365079365079367, "grad_norm": 0.4769259453601474, "learning_rate": 1.3581979721302286e-06, "loss": 0.6536, "step": 248 }, { "epoch": 3.9523809523809526, "grad_norm": 0.5748113648603245, "learning_rate": 1.3203913239032074e-06, "loss": 0.6675, "step": 249 }, { "epoch": 3.9682539682539684, "grad_norm": 0.5055584872741437, "learning_rate": 1.283038120082268e-06, "loss": 0.6196, "step": 250 }, { "epoch": 3.984126984126984, "grad_norm": 0.4492562647446275, "learning_rate": 1.2461429637659466e-06, "loss": 0.6194, "step": 251 }, { "epoch": 4.0, "grad_norm": 0.5069425974799285, "learning_rate": 1.2097104016068146e-06, "loss": 0.6338, "step": 252 } ], "logging_steps": 1, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 62903662804992.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }