{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015873015873015872, "grad_norm": 2.3323949793109238, "learning_rate": 0.0, "loss": 1.0469, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 2.317201200688066, "learning_rate": 3.125e-07, "loss": 0.9931, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 2.603689956679125, "learning_rate": 6.25e-07, "loss": 1.0188, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 2.2583787301898592, "learning_rate": 9.375000000000001e-07, "loss": 0.9097, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 2.197466891038096, "learning_rate": 1.25e-06, "loss": 1.0459, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 2.1259963361099747, "learning_rate": 1.5625e-06, "loss": 0.9986, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 2.0707820881041, "learning_rate": 1.8750000000000003e-06, "loss": 0.9555, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 1.870407527874291, "learning_rate": 2.1875000000000002e-06, "loss": 0.952, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 1.8578085390534953, "learning_rate": 2.5e-06, "loss": 0.9993, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 1.881148688458384, "learning_rate": 2.8125e-06, "loss": 0.9373, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 1.6917769845914787, "learning_rate": 3.125e-06, "loss": 0.8839, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 1.2541345576396532, "learning_rate": 3.4375e-06, "loss": 0.9909, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 1.4038335670152517, "learning_rate": 3.7500000000000005e-06, "loss": 0.9322, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 1.363468897891553, "learning_rate": 4.0625000000000005e-06, "loss": 1.0934, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 1.1331989679866032, "learning_rate": 4.3750000000000005e-06, "loss": 0.977, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 0.9899834287202586, "learning_rate": 4.6875000000000004e-06, "loss": 1.0443, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 1.155920523517074, "learning_rate": 5e-06, "loss": 0.9483, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 1.2715867938274161, "learning_rate": 5.3125e-06, "loss": 1.0096, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 0.9922231339593638, "learning_rate": 5.625e-06, "loss": 0.7463, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 1.2551959582539625, "learning_rate": 5.9375e-06, "loss": 0.9226, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 0.892951024999124, "learning_rate": 6.25e-06, "loss": 0.988, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 1.4360539096520086, "learning_rate": 6.5625e-06, "loss": 1.0509, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 1.1100051374669628, "learning_rate": 6.875e-06, "loss": 0.8728, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 0.9630208551024003, "learning_rate": 7.1875e-06, "loss": 0.8352, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 1.109963225007402, "learning_rate": 7.500000000000001e-06, "loss": 1.0289, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 0.842175710243708, "learning_rate": 7.8125e-06, "loss": 0.8616, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 0.8255762742603932, "learning_rate": 8.125000000000001e-06, "loss": 0.7234, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8274507712792363, "learning_rate": 8.4375e-06, "loss": 0.9758, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 0.7834224887700044, "learning_rate": 8.750000000000001e-06, "loss": 0.9056, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 1.187020605300137, "learning_rate": 9.0625e-06, "loss": 0.9481, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 1.0233176856791018, "learning_rate": 9.375000000000001e-06, "loss": 0.9194, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 0.848791394024066, "learning_rate": 9.6875e-06, "loss": 0.8852, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 0.8289281876622956, "learning_rate": 1e-05, "loss": 1.038, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 0.7738330911179299, "learning_rate": 9.999691920767945e-06, "loss": 0.8374, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 0.65004421093035, "learning_rate": 9.998767721036901e-06, "loss": 0.8242, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.718229691257778, "learning_rate": 9.997227514697568e-06, "loss": 0.9693, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 0.598178727036991, "learning_rate": 9.99507149155218e-06, "loss": 0.9843, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.6896420594948925, "learning_rate": 9.992299917291118e-06, "loss": 0.848, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 0.7218001479564617, "learning_rate": 9.98891313346017e-06, "loss": 0.9095, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 0.673383804041238, "learning_rate": 9.984911557418444e-06, "loss": 0.7682, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 0.9044903125501461, "learning_rate": 9.980295682286924e-06, "loss": 0.8388, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6528626470394925, "learning_rate": 9.97506607688772e-06, "loss": 0.9107, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 0.5248039585149111, "learning_rate": 9.969223385673958e-06, "loss": 0.8307, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 0.568338771820042, "learning_rate": 9.962768328650367e-06, "loss": 0.7523, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5429855696185105, "learning_rate": 9.95570170128455e-06, "loss": 0.8442, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 0.5098426033492849, "learning_rate": 9.94802437440896e-06, "loss": 0.7962, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 0.6078990273192543, "learning_rate": 9.939737294113585e-06, "loss": 0.8969, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.4709547244829324, "learning_rate": 9.930841481629358e-06, "loss": 0.8885, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 0.54039591858629, "learning_rate": 9.92133803320231e-06, "loss": 0.7818, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 0.4875170254753124, "learning_rate": 9.91122811995848e-06, "loss": 0.8193, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 0.5005396928536703, "learning_rate": 9.90051298775959e-06, "loss": 0.8692, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 0.40245216027036546, "learning_rate": 9.88919395704952e-06, "loss": 0.826, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 0.5389952051377087, "learning_rate": 9.877272422691583e-06, "loss": 0.9318, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5638980417584056, "learning_rate": 9.864749853796642e-06, "loss": 0.7985, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 0.5506830661309166, "learning_rate": 9.85162779354206e-06, "loss": 0.7291, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.48566023212019677, "learning_rate": 9.837907858981536e-06, "loss": 0.8802, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 0.4725406192484581, "learning_rate": 9.823591740845831e-06, "loss": 0.8627, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 0.5270784935436914, "learning_rate": 9.808681203334416e-06, "loss": 0.7976, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 0.4795159174595573, "learning_rate": 9.793178083898073e-06, "loss": 0.8783, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 0.42309628953003137, "learning_rate": 9.777084293012448e-06, "loss": 0.842, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 0.464555539059811, "learning_rate": 9.760401813942641e-06, "loss": 0.7662, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 0.5141212041737542, "learning_rate": 9.743132702498785e-06, "loss": 0.8688, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.5165788253828009, "learning_rate": 9.725279086782719e-06, "loss": 0.768, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.576629868282963, "learning_rate": 9.706843166925733e-06, "loss": 0.7989, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 0.4946943998511545, "learning_rate": 9.687827214817433e-06, "loss": 0.8261, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.4987216606535057, "learning_rate": 9.668233573825794e-06, "loss": 0.8905, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 0.45688977932466196, "learning_rate": 9.64806465850836e-06, "loss": 0.7327, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.5226340006885853, "learning_rate": 9.62732295431471e-06, "loss": 0.7311, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.6684025298786129, "learning_rate": 9.606011017280166e-06, "loss": 0.8971, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5147703758608321, "learning_rate": 9.5841314737108e-06, "loss": 0.7652, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.5417227409614662, "learning_rate": 9.56168701985981e-06, "loss": 0.7999, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5016561221704748, "learning_rate": 9.538680421595236e-06, "loss": 0.8074, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 0.4853528793957531, "learning_rate": 9.515114514059127e-06, "loss": 0.8135, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 0.47765415470199357, "learning_rate": 9.490992201318165e-06, "loss": 0.7879, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.46535342031003013, "learning_rate": 9.466316456005783e-06, "loss": 0.7762, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.5033568814253909, "learning_rate": 9.441090318955843e-06, "loss": 0.7022, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 0.4986643533291915, "learning_rate": 9.415316898827923e-06, "loss": 0.7349, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 0.43657193718859494, "learning_rate": 9.388999371724212e-06, "loss": 0.8264, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 0.47617277777848616, "learning_rate": 9.362140980798127e-06, "loss": 0.8944, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.4295219607791053, "learning_rate": 9.334745035854646e-06, "loss": 0.7588, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5225987407011279, "learning_rate": 9.306814912942445e-06, "loss": 0.8359, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 0.4173684559568506, "learning_rate": 9.278354053937848e-06, "loss": 0.7804, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 0.5238592049595157, "learning_rate": 9.249365966120692e-06, "loss": 0.8564, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.4526393208745273, "learning_rate": 9.219854221742106e-06, "loss": 0.8102, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 0.44471888761912887, "learning_rate": 9.189822457584311e-06, "loss": 0.7439, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.43731884433734214, "learning_rate": 9.159274374512444e-06, "loss": 0.6592, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 0.4377614076782124, "learning_rate": 9.128213737018493e-06, "loss": 0.806, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.4027105033083121, "learning_rate": 9.096644372757393e-06, "loss": 0.8855, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 0.571463019194369, "learning_rate": 9.064570172075349e-06, "loss": 0.7979, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.4801097800367482, "learning_rate": 9.031995087530403e-06, "loss": 0.7992, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.47255682704462587, "learning_rate": 8.99892313340537e-06, "loss": 0.6633, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 0.4862492507086913, "learning_rate": 8.96535838521314e-06, "loss": 0.8033, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 0.4794987734861929, "learning_rate": 8.931304979194452e-06, "loss": 0.8069, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 0.4658669415595415, "learning_rate": 8.896767111808177e-06, "loss": 0.7371, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.5683125861447418, "learning_rate": 8.861749039214177e-06, "loss": 0.9145, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.47857884026171116, "learning_rate": 8.826255076748823e-06, "loss": 0.8455, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.429389167302876, "learning_rate": 8.790289598393186e-06, "loss": 0.7216, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.522031534882144, "learning_rate": 8.753857036234055e-06, "loss": 0.8155, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 0.5375692580431519, "learning_rate": 8.716961879917734e-06, "loss": 0.7373, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.4277716225580266, "learning_rate": 8.679608676096793e-06, "loss": 0.8132, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.9709114563751018, "learning_rate": 8.641802027869774e-06, "loss": 0.7952, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.6722991060253756, "learning_rate": 8.603546594213935e-06, "loss": 0.8566, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.48227435877100366, "learning_rate": 8.564847089411128e-06, "loss": 0.8292, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 0.43738769808282163, "learning_rate": 8.525708282466839e-06, "loss": 0.8424, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.42758983764847835, "learning_rate": 8.486134996522502e-06, "loss": 0.8179, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.6465752665836958, "learning_rate": 8.446132108261136e-06, "loss": 0.806, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 0.5216064305348748, "learning_rate": 8.405704547306379e-06, "loss": 0.8041, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.46284349128240304, "learning_rate": 8.364857295615006e-06, "loss": 0.8924, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.48814352812138595, "learning_rate": 8.323595386862985e-06, "loss": 0.7929, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.48088506678769916, "learning_rate": 8.281923905825188e-06, "loss": 0.7671, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 0.4594586947272896, "learning_rate": 8.23984798774876e-06, "loss": 0.7366, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.4673793179812366, "learning_rate": 8.197372817720314e-06, "loss": 0.7397, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.6557346369623661, "learning_rate": 8.154503630026955e-06, "loss": 0.7262, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.45128446254113314, "learning_rate": 8.111245707511253e-06, "loss": 0.7213, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.41666335434637974, "learning_rate": 8.067604380920228e-06, "loss": 0.7952, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.4407610683896587, "learning_rate": 8.023585028248435e-06, "loss": 0.8486, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 0.5501977264080524, "learning_rate": 7.979193074075216e-06, "loss": 0.8911, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.459940871244406, "learning_rate": 7.934433988896233e-06, "loss": 0.6535, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.46949896874504654, "learning_rate": 7.889313288449323e-06, "loss": 0.8232, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 0.41110722374315695, "learning_rate": 7.843836533034784e-06, "loss": 0.7628, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.47755036946919965, "learning_rate": 7.798009326830167e-06, "loss": 0.8003, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 0.41342145123270885, "learning_rate": 7.751837317199673e-06, "loss": 0.8683, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.4479867168170251, "learning_rate": 7.705326193998207e-06, "loss": 0.7552, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.4549548876094008, "learning_rate": 7.658481688870218e-06, "loss": 0.7587, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.4684989926335189, "learning_rate": 7.611309574543373e-06, "loss": 0.7607, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.4367513791425883, "learning_rate": 7.563815664117173e-06, "loss": 0.9146, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.7927149278076437, "learning_rate": 7.5160058103465985e-06, "loss": 0.7131, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.5847918647965703, "learning_rate": 7.467885904920864e-06, "loss": 0.7578, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.7836046335272314, "learning_rate": 7.419461877737373e-06, "loss": 0.8327, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 2.1428241341527117, "learning_rate": 7.370739696170971e-06, "loss": 0.7441, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.9566247813485141, "learning_rate": 7.321725364338566e-06, "loss": 0.6185, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.5336099004301172, "learning_rate": 7.272424922359246e-06, "loss": 0.6455, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.7132260718912609, "learning_rate": 7.222844445609931e-06, "loss": 0.7834, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.5749113101610002, "learning_rate": 7.172990043976703e-06, "loss": 0.7296, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.5366676899164674, "learning_rate": 7.122867861101868e-06, "loss": 0.795, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.44931031781346276, "learning_rate": 7.072484073626872e-06, "loss": 0.6875, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 0.6709913679680917, "learning_rate": 7.021844890431136e-06, "loss": 0.7669, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 0.5782700607354144, "learning_rate": 6.970956551866925e-06, "loss": 0.7273, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.5008612890527109, "learning_rate": 6.9198253289903515e-06, "loss": 0.6634, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5733594756270326, "learning_rate": 6.868457522788561e-06, "loss": 0.7358, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.48532685396257946, "learning_rate": 6.816859463403271e-06, "loss": 0.659, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.5460096768726493, "learning_rate": 6.765037509350685e-06, "loss": 0.7585, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.4827715321224934, "learning_rate": 6.7129980467379265e-06, "loss": 0.6664, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5417449745700821, "learning_rate": 6.660747488476066e-06, "loss": 0.663, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.5672091588208017, "learning_rate": 6.608292273489851e-06, "loss": 0.6122, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 0.5264115445856029, "learning_rate": 6.555638865924221e-06, "loss": 0.7035, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5168486054014866, "learning_rate": 6.502793754347721e-06, "loss": 0.7598, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.6085627519823247, "learning_rate": 6.449763450952912e-06, "loss": 0.6875, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 0.504951049632705, "learning_rate": 6.396554490753848e-06, "loss": 0.6839, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.42239268629753335, "learning_rate": 6.343173430780769e-06, "loss": 0.8396, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.5170870251352963, "learning_rate": 6.289626849272062e-06, "loss": 0.8013, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.5408561718958109, "learning_rate": 6.2359213448636104e-06, "loss": 0.754, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.42606389993166277, "learning_rate": 6.182063535775634e-06, "loss": 0.7662, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.41021417431281776, "learning_rate": 6.1280600589971225e-06, "loss": 0.791, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.4068459581892925, "learning_rate": 6.073917569467934e-06, "loss": 0.8066, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.40243757072180364, "learning_rate": 6.0196427392587085e-06, "loss": 0.7061, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 0.5924677871750427, "learning_rate": 5.96524225674865e-06, "loss": 0.744, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.4344103520994765, "learning_rate": 5.9107228258013085e-06, "loss": 0.7076, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.4824828219676673, "learning_rate": 5.856091164938451e-06, "loss": 0.6534, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.4197375023372333, "learning_rate": 5.801354006512127e-06, "loss": 0.6902, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.4523354962317184, "learning_rate": 5.746518095875033e-06, "loss": 0.6996, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.41073692830700287, "learning_rate": 5.6915901905492586e-06, "loss": 0.629, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.5807356357914126, "learning_rate": 5.6365770593935665e-06, "loss": 0.5924, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.5296154741304107, "learning_rate": 5.581485481769231e-06, "loss": 0.7197, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.4462893254042338, "learning_rate": 5.526322246704628e-06, "loss": 0.8007, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.3974463949753287, "learning_rate": 5.471094152058592e-06, "loss": 0.6822, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.46244966479154553, "learning_rate": 5.415808003682717e-06, "loss": 0.7318, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.438557400530548, "learning_rate": 5.360470614582661e-06, "loss": 0.7147, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.5680373876053647, "learning_rate": 5.305088804078559e-06, "loss": 0.7357, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.4556205137087138, "learning_rate": 5.249669396964665e-06, "loss": 0.6361, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.44940699263796485, "learning_rate": 5.1942192226683385e-06, "loss": 0.7778, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.47535854965434626, "learning_rate": 5.138745114408427e-06, "loss": 0.6008, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 0.5020715004802897, "learning_rate": 5.083253908353193e-06, "loss": 0.6696, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.4715489187155987, "learning_rate": 5.0277524427778986e-06, "loss": 0.7846, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.44938039077917374, "learning_rate": 4.972247557222102e-06, "loss": 0.7187, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.536309868809644, "learning_rate": 4.916746091646808e-06, "loss": 0.6818, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.4238224566275176, "learning_rate": 4.8612548855915755e-06, "loss": 0.7252, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.5075369152051689, "learning_rate": 4.805780777331662e-06, "loss": 0.7461, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.463068134108742, "learning_rate": 4.750330603035336e-06, "loss": 0.7141, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.44910366292391646, "learning_rate": 4.694911195921443e-06, "loss": 0.7278, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.43362119780351166, "learning_rate": 4.6395293854173395e-06, "loss": 0.6069, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.7285135499415637, "learning_rate": 4.584191996317285e-06, "loss": 0.6846, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.49976201370002465, "learning_rate": 4.528905847941411e-06, "loss": 0.843, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.47745344638517, "learning_rate": 4.473677753295375e-06, "loss": 0.6609, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 0.4075892143069301, "learning_rate": 4.418514518230769e-06, "loss": 0.7133, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.490679894902017, "learning_rate": 4.363422940606435e-06, "loss": 0.7483, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.507751484260846, "learning_rate": 4.308409809450742e-06, "loss": 0.7635, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.5129728167302848, "learning_rate": 4.253481904124968e-06, "loss": 0.7353, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.44280290900369257, "learning_rate": 4.198645993487872e-06, "loss": 0.6059, "step": 189 }, { "epoch": 3.015873015873016, "grad_norm": 0.7949393554198322, "learning_rate": 4.143908835061551e-06, "loss": 0.6868, "step": 190 }, { "epoch": 3.0317460317460316, "grad_norm": 0.5012314119268376, "learning_rate": 4.089277174198694e-06, "loss": 0.7037, "step": 191 }, { "epoch": 3.0476190476190474, "grad_norm": 0.8765248539640519, "learning_rate": 4.0347577432513515e-06, "loss": 0.746, "step": 192 }, { "epoch": 3.0634920634920633, "grad_norm": 0.5276377235611475, "learning_rate": 3.980357260741293e-06, "loss": 0.6836, "step": 193 }, { "epoch": 3.0793650793650795, "grad_norm": 0.5739417223582697, "learning_rate": 3.926082430532067e-06, "loss": 0.6428, "step": 194 }, { "epoch": 3.0952380952380953, "grad_norm": 0.6325463534989497, "learning_rate": 3.87193994100288e-06, "loss": 0.6092, "step": 195 }, { "epoch": 3.111111111111111, "grad_norm": 0.6843617935822326, "learning_rate": 3.817936464224367e-06, "loss": 0.6763, "step": 196 }, { "epoch": 3.126984126984127, "grad_norm": 0.5698355849375702, "learning_rate": 3.764078655136391e-06, "loss": 0.7472, "step": 197 }, { "epoch": 3.142857142857143, "grad_norm": 0.5699592517012283, "learning_rate": 3.7103731507279383e-06, "loss": 0.7029, "step": 198 }, { "epoch": 3.1587301587301586, "grad_norm": 0.4423177821267063, "learning_rate": 3.656826569219233e-06, "loss": 0.6717, "step": 199 }, { "epoch": 3.1746031746031744, "grad_norm": 0.5057172241583261, "learning_rate": 3.603445509246154e-06, "loss": 0.6429, "step": 200 }, { "epoch": 3.1904761904761907, "grad_norm": 0.4627898485974749, "learning_rate": 3.55023654904709e-06, "loss": 0.7171, "step": 201 }, { "epoch": 3.2063492063492065, "grad_norm": 0.4765018395044146, "learning_rate": 3.49720624565228e-06, "loss": 0.5539, "step": 202 }, { "epoch": 3.2222222222222223, "grad_norm": 0.46472301884916256, "learning_rate": 3.44436113407578e-06, "loss": 0.6852, "step": 203 }, { "epoch": 3.238095238095238, "grad_norm": 0.7227019117707013, "learning_rate": 3.3917077265101505e-06, "loss": 0.751, "step": 204 }, { "epoch": 3.253968253968254, "grad_norm": 0.46124162458293566, "learning_rate": 3.3392525115239353e-06, "loss": 0.5753, "step": 205 }, { "epoch": 3.2698412698412698, "grad_norm": 1.150296667678599, "learning_rate": 3.2870019532620744e-06, "loss": 0.7116, "step": 206 }, { "epoch": 3.2857142857142856, "grad_norm": 0.45629133977245157, "learning_rate": 3.2349624906493164e-06, "loss": 0.6506, "step": 207 }, { "epoch": 3.3015873015873014, "grad_norm": 0.6330635820823547, "learning_rate": 3.1831405365967315e-06, "loss": 0.5314, "step": 208 }, { "epoch": 3.317460317460317, "grad_norm": 0.5103783322022635, "learning_rate": 3.1315424772114404e-06, "loss": 0.7163, "step": 209 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5407291227510194, "learning_rate": 3.0801746710096497e-06, "loss": 0.5543, "step": 210 }, { "epoch": 3.3492063492063493, "grad_norm": 0.5148220790962434, "learning_rate": 3.0290434481330746e-06, "loss": 0.5885, "step": 211 }, { "epoch": 3.365079365079365, "grad_norm": 0.4436633763550198, "learning_rate": 2.978155109568864e-06, "loss": 0.6205, "step": 212 }, { "epoch": 3.380952380952381, "grad_norm": 0.3967936149581789, "learning_rate": 2.927515926373129e-06, "loss": 0.6664, "step": 213 }, { "epoch": 3.3968253968253967, "grad_norm": 0.4218431236274798, "learning_rate": 2.8771321388981334e-06, "loss": 0.6664, "step": 214 }, { "epoch": 3.4126984126984126, "grad_norm": 0.6850450086103512, "learning_rate": 2.8270099560232992e-06, "loss": 0.71, "step": 215 }, { "epoch": 3.4285714285714284, "grad_norm": 0.49624036517991055, "learning_rate": 2.77715555439007e-06, "loss": 0.7545, "step": 216 }, { "epoch": 3.4444444444444446, "grad_norm": 0.4389117683902036, "learning_rate": 2.7275750776407568e-06, "loss": 0.6014, "step": 217 }, { "epoch": 3.4603174603174605, "grad_norm": 0.44946398322882497, "learning_rate": 2.6782746356614364e-06, "loss": 0.5866, "step": 218 }, { "epoch": 3.4761904761904763, "grad_norm": 0.5130278875069821, "learning_rate": 2.6292603038290306e-06, "loss": 0.7161, "step": 219 }, { "epoch": 3.492063492063492, "grad_norm": 0.6079510505594462, "learning_rate": 2.580538122262627e-06, "loss": 0.6545, "step": 220 }, { "epoch": 3.507936507936508, "grad_norm": 0.5185432227363381, "learning_rate": 2.532114095079137e-06, "loss": 0.5745, "step": 221 }, { "epoch": 3.5238095238095237, "grad_norm": 0.47475284651402894, "learning_rate": 2.4839941896534027e-06, "loss": 0.6287, "step": 222 }, { "epoch": 3.5396825396825395, "grad_norm": 0.4721541505351033, "learning_rate": 2.4361843358828287e-06, "loss": 0.5891, "step": 223 }, { "epoch": 3.5555555555555554, "grad_norm": 0.6637884613662758, "learning_rate": 2.388690425456629e-06, "loss": 0.7191, "step": 224 }, { "epoch": 3.571428571428571, "grad_norm": 0.49879760044528987, "learning_rate": 2.341518311129781e-06, "loss": 0.5703, "step": 225 }, { "epoch": 3.5873015873015874, "grad_norm": 0.42482396593298977, "learning_rate": 2.2946738060017947e-06, "loss": 0.706, "step": 226 }, { "epoch": 3.6031746031746033, "grad_norm": 0.5244046208280333, "learning_rate": 2.24816268280033e-06, "loss": 0.6567, "step": 227 }, { "epoch": 3.619047619047619, "grad_norm": 0.45713016270372664, "learning_rate": 2.2019906731698337e-06, "loss": 0.6519, "step": 228 }, { "epoch": 3.634920634920635, "grad_norm": 0.3877769815934568, "learning_rate": 2.156163466965218e-06, "loss": 0.63, "step": 229 }, { "epoch": 3.6507936507936507, "grad_norm": 0.5213212350040638, "learning_rate": 2.110686711550678e-06, "loss": 0.7059, "step": 230 }, { "epoch": 3.6666666666666665, "grad_norm": 0.39785815195503926, "learning_rate": 2.0655660111037685e-06, "loss": 0.6371, "step": 231 }, { "epoch": 3.682539682539683, "grad_norm": 0.42984736444835686, "learning_rate": 2.0208069259247866e-06, "loss": 0.659, "step": 232 }, { "epoch": 3.6984126984126986, "grad_norm": 0.4602245826690893, "learning_rate": 1.976414971751568e-06, "loss": 0.6043, "step": 233 }, { "epoch": 3.7142857142857144, "grad_norm": 0.4841459331133356, "learning_rate": 1.932395619079771e-06, "loss": 0.6762, "step": 234 }, { "epoch": 3.7301587301587302, "grad_norm": 0.38677471787487294, "learning_rate": 1.8887542924887486e-06, "loss": 0.7034, "step": 235 }, { "epoch": 3.746031746031746, "grad_norm": 0.669293977144537, "learning_rate": 1.8454963699730471e-06, "loss": 0.6753, "step": 236 }, { "epoch": 3.761904761904762, "grad_norm": 0.9653712675428361, "learning_rate": 1.802627182279687e-06, "loss": 0.5958, "step": 237 }, { "epoch": 3.7777777777777777, "grad_norm": 0.8023679085767069, "learning_rate": 1.760152012251241e-06, "loss": 0.5046, "step": 238 }, { "epoch": 3.7936507936507935, "grad_norm": 0.523400494967504, "learning_rate": 1.7180760941748132e-06, "loss": 0.6704, "step": 239 }, { "epoch": 3.8095238095238093, "grad_norm": 0.5650557974529034, "learning_rate": 1.6764046131370142e-06, "loss": 0.7334, "step": 240 }, { "epoch": 3.825396825396825, "grad_norm": 0.446176823325039, "learning_rate": 1.6351427043849955e-06, "loss": 0.6972, "step": 241 }, { "epoch": 3.8412698412698414, "grad_norm": 0.5687688471627884, "learning_rate": 1.5942954526936217e-06, "loss": 0.6563, "step": 242 }, { "epoch": 3.857142857142857, "grad_norm": 0.588174299336183, "learning_rate": 1.5538678917388638e-06, "loss": 0.6638, "step": 243 }, { "epoch": 3.873015873015873, "grad_norm": 0.48774660261391006, "learning_rate": 1.5138650034775004e-06, "loss": 0.5733, "step": 244 }, { "epoch": 3.888888888888889, "grad_norm": 0.4185611368252772, "learning_rate": 1.4742917175331644e-06, "loss": 0.7174, "step": 245 }, { "epoch": 3.9047619047619047, "grad_norm": 0.43603269341453055, "learning_rate": 1.4351529105888735e-06, "loss": 0.7672, "step": 246 }, { "epoch": 3.9206349206349205, "grad_norm": 0.4318904871120016, "learning_rate": 1.3964534057860652e-06, "loss": 0.5978, "step": 247 }, { "epoch": 3.9365079365079367, "grad_norm": 0.40904640871839104, "learning_rate": 1.3581979721302286e-06, "loss": 0.6579, "step": 248 }, { "epoch": 3.9523809523809526, "grad_norm": 0.6005145592007414, "learning_rate": 1.3203913239032074e-06, "loss": 0.6694, "step": 249 }, { "epoch": 3.9682539682539684, "grad_norm": 0.472367689533449, "learning_rate": 1.283038120082268e-06, "loss": 0.6197, "step": 250 }, { "epoch": 3.984126984126984, "grad_norm": 0.4356830095251736, "learning_rate": 1.2461429637659466e-06, "loss": 0.6213, "step": 251 }, { "epoch": 4.0, "grad_norm": 0.4857139342731584, "learning_rate": 1.2097104016068146e-06, "loss": 0.6352, "step": 252 }, { "epoch": 4.015873015873016, "grad_norm": 0.7237535323689852, "learning_rate": 1.1737449232511799e-06, "loss": 0.6382, "step": 253 }, { "epoch": 4.031746031746032, "grad_norm": 0.46436683787098876, "learning_rate": 1.1382509607858233e-06, "loss": 0.681, "step": 254 }, { "epoch": 4.0476190476190474, "grad_norm": 0.38871205852451385, "learning_rate": 1.1032328881918237e-06, "loss": 0.6655, "step": 255 }, { "epoch": 4.063492063492063, "grad_norm": 0.5245733396531106, "learning_rate": 1.0686950208055486e-06, "loss": 0.6977, "step": 256 }, { "epoch": 4.079365079365079, "grad_norm": 0.7180379448497728, "learning_rate": 1.034641614786862e-06, "loss": 0.6271, "step": 257 }, { "epoch": 4.095238095238095, "grad_norm": 0.47735389131691536, "learning_rate": 1.0010768665946309e-06, "loss": 0.6079, "step": 258 }, { "epoch": 4.111111111111111, "grad_norm": 0.5783859241207984, "learning_rate": 9.680049124695973e-07, "loss": 0.6364, "step": 259 }, { "epoch": 4.1269841269841265, "grad_norm": 0.47172528206140724, "learning_rate": 9.35429827924652e-07, "loss": 0.6471, "step": 260 }, { "epoch": 4.142857142857143, "grad_norm": 0.7730664217116625, "learning_rate": 9.033556272426075e-07, "loss": 0.5769, "step": 261 }, { "epoch": 4.158730158730159, "grad_norm": 0.6533050547442746, "learning_rate": 8.717862629815099e-07, "loss": 0.6638, "step": 262 }, { "epoch": 4.174603174603175, "grad_norm": 0.5126950213106886, "learning_rate": 8.407256254875573e-07, "loss": 0.5556, "step": 263 }, { "epoch": 4.190476190476191, "grad_norm": 0.4249288916316267, "learning_rate": 8.101775424156888e-07, "loss": 0.7416, "step": 264 }, { "epoch": 4.2063492063492065, "grad_norm": 0.4999911716251449, "learning_rate": 7.801457782578947e-07, "loss": 0.5759, "step": 265 }, { "epoch": 4.222222222222222, "grad_norm": 0.44804880194019553, "learning_rate": 7.506340338793111e-07, "loss": 0.7019, "step": 266 }, { "epoch": 4.238095238095238, "grad_norm": 0.502803217299879, "learning_rate": 7.216459460621528e-07, "loss": 0.569, "step": 267 }, { "epoch": 4.253968253968254, "grad_norm": 0.6816091836503904, "learning_rate": 6.931850870575563e-07, "loss": 0.607, "step": 268 }, { "epoch": 4.26984126984127, "grad_norm": 0.7355066410111105, "learning_rate": 6.652549641453543e-07, "loss": 0.6546, "step": 269 }, { "epoch": 4.285714285714286, "grad_norm": 0.6616759448500391, "learning_rate": 6.378590192018752e-07, "loss": 0.5275, "step": 270 }, { "epoch": 4.301587301587301, "grad_norm": 0.5266225510350064, "learning_rate": 6.110006282757897e-07, "loss": 0.6357, "step": 271 }, { "epoch": 4.317460317460317, "grad_norm": 0.5556516253636915, "learning_rate": 5.846831011720789e-07, "loss": 0.5667, "step": 272 }, { "epoch": 4.333333333333333, "grad_norm": 0.45705890325071213, "learning_rate": 5.589096810441574e-07, "loss": 0.5398, "step": 273 }, { "epoch": 4.349206349206349, "grad_norm": 0.4272112657516473, "learning_rate": 5.3368354399422e-07, "loss": 0.6608, "step": 274 }, { "epoch": 4.365079365079365, "grad_norm": 0.49431513901379337, "learning_rate": 5.090077986818365e-07, "loss": 0.5874, "step": 275 }, { "epoch": 4.380952380952381, "grad_norm": 0.5019371137242049, "learning_rate": 4.848854859408731e-07, "loss": 0.6658, "step": 276 }, { "epoch": 4.396825396825397, "grad_norm": 0.4921839831510061, "learning_rate": 4.613195784047653e-07, "loss": 0.5992, "step": 277 }, { "epoch": 4.412698412698413, "grad_norm": 0.4913329983646245, "learning_rate": 4.3831298014019144e-07, "loss": 0.6414, "step": 278 }, { "epoch": 4.428571428571429, "grad_norm": 0.4591709887480562, "learning_rate": 4.1586852628920095e-07, "loss": 0.5581, "step": 279 }, { "epoch": 4.444444444444445, "grad_norm": 0.49202921355042367, "learning_rate": 3.939889827198362e-07, "loss": 0.4977, "step": 280 }, { "epoch": 4.4603174603174605, "grad_norm": 0.4454959373647045, "learning_rate": 3.7267704568529015e-07, "loss": 0.5291, "step": 281 }, { "epoch": 4.476190476190476, "grad_norm": 0.5229427416726549, "learning_rate": 3.519353414916404e-07, "loss": 0.6844, "step": 282 }, { "epoch": 4.492063492063492, "grad_norm": 0.4117177933501312, "learning_rate": 3.3176642617420817e-07, "loss": 0.6441, "step": 283 }, { "epoch": 4.507936507936508, "grad_norm": 0.5207909494470314, "learning_rate": 3.1217278518256844e-07, "loss": 0.6815, "step": 284 }, { "epoch": 4.523809523809524, "grad_norm": 0.47298038451032376, "learning_rate": 2.93156833074269e-07, "loss": 0.7125, "step": 285 }, { "epoch": 4.5396825396825395, "grad_norm": 0.4559655191725887, "learning_rate": 2.7472091321728067e-07, "loss": 0.5207, "step": 286 }, { "epoch": 4.555555555555555, "grad_norm": 0.48538261262399124, "learning_rate": 2.568672975012154e-07, "loss": 0.5553, "step": 287 }, { "epoch": 4.571428571428571, "grad_norm": 0.42805849682825836, "learning_rate": 2.3959818605736095e-07, "loss": 0.5694, "step": 288 }, { "epoch": 4.587301587301587, "grad_norm": 0.46054414700724183, "learning_rate": 2.229157069875537e-07, "loss": 0.6352, "step": 289 }, { "epoch": 4.603174603174603, "grad_norm": 0.4170725725245928, "learning_rate": 2.068219161019297e-07, "loss": 0.493, "step": 290 }, { "epoch": 4.619047619047619, "grad_norm": 0.5643812716552056, "learning_rate": 1.9131879666558385e-07, "loss": 0.6324, "step": 291 }, { "epoch": 4.634920634920634, "grad_norm": 0.462594946066143, "learning_rate": 1.7640825915416994e-07, "loss": 0.5406, "step": 292 }, { "epoch": 4.650793650793651, "grad_norm": 0.38882576034876637, "learning_rate": 1.6209214101846394e-07, "loss": 0.5732, "step": 293 }, { "epoch": 4.666666666666667, "grad_norm": 0.4703648367216305, "learning_rate": 1.4837220645793905e-07, "loss": 0.6893, "step": 294 }, { "epoch": 4.682539682539683, "grad_norm": 0.4272728817767105, "learning_rate": 1.3525014620335786e-07, "loss": 0.6755, "step": 295 }, { "epoch": 4.698412698412699, "grad_norm": 0.612286386647566, "learning_rate": 1.2272757730841744e-07, "loss": 0.7234, "step": 296 }, { "epoch": 4.714285714285714, "grad_norm": 0.3683204109754377, "learning_rate": 1.1080604295048203e-07, "loss": 0.5681, "step": 297 }, { "epoch": 4.73015873015873, "grad_norm": 0.4198898648516418, "learning_rate": 9.948701224041124e-08, "loss": 0.682, "step": 298 }, { "epoch": 4.746031746031746, "grad_norm": 0.4161451987722102, "learning_rate": 8.877188004152104e-08, "loss": 0.6787, "step": 299 }, { "epoch": 4.761904761904762, "grad_norm": 0.45897732699104254, "learning_rate": 7.866196679768956e-08, "loss": 0.589, "step": 300 }, { "epoch": 4.777777777777778, "grad_norm": 0.40150992469071173, "learning_rate": 6.91585183706428e-08, "loss": 0.5974, "step": 301 }, { "epoch": 4.7936507936507935, "grad_norm": 0.43359782320810974, "learning_rate": 6.02627058864158e-08, "loss": 0.6319, "step": 302 }, { "epoch": 4.809523809523809, "grad_norm": 0.44506261105820183, "learning_rate": 5.19756255910403e-08, "loss": 0.6191, "step": 303 }, { "epoch": 4.825396825396825, "grad_norm": 0.5066033533058674, "learning_rate": 4.429829871545055e-08, "loss": 0.6192, "step": 304 }, { "epoch": 4.841269841269841, "grad_norm": 0.408870784528967, "learning_rate": 3.7231671349634015e-08, "loss": 0.5396, "step": 305 }, { "epoch": 4.857142857142857, "grad_norm": 0.41548539048694993, "learning_rate": 3.077661432604184e-08, "loss": 0.573, "step": 306 }, { "epoch": 4.8730158730158735, "grad_norm": 0.41079087097410333, "learning_rate": 2.4933923112279712e-08, "loss": 0.6776, "step": 307 }, { "epoch": 4.888888888888889, "grad_norm": 0.4175105640706622, "learning_rate": 1.9704317713076236e-08, "loss": 0.7029, "step": 308 }, { "epoch": 4.904761904761905, "grad_norm": 0.4780736273310837, "learning_rate": 1.508844258155728e-08, "loss": 0.6435, "step": 309 }, { "epoch": 4.920634920634921, "grad_norm": 0.4004152110528723, "learning_rate": 1.1086866539830044e-08, "loss": 0.6868, "step": 310 }, { "epoch": 4.936507936507937, "grad_norm": 0.4479777279190579, "learning_rate": 7.700082708883006e-09, "loss": 0.6421, "step": 311 }, { "epoch": 4.9523809523809526, "grad_norm": 0.44065610989172405, "learning_rate": 4.928508447821223e-09, "loss": 0.617, "step": 312 }, { "epoch": 4.968253968253968, "grad_norm": 0.4640148914737005, "learning_rate": 2.7724853024324594e-09, "loss": 0.5754, "step": 313 }, { "epoch": 4.984126984126984, "grad_norm": 0.5313400670046616, "learning_rate": 1.2322789630997422e-09, "loss": 0.6552, "step": 314 }, { "epoch": 5.0, "grad_norm": 0.4643036864898895, "learning_rate": 3.080792320564463e-10, "loss": 0.5647, "step": 315 }, { "epoch": 5.0, "step": 315, "total_flos": 78480301031424.0, "train_loss": 0.7375902401076423, "train_runtime": 10558.2121, "train_samples_per_second": 0.474, "train_steps_per_second": 0.03 } ], "logging_steps": 1, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 78480301031424.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }