{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015873015873015872, "grad_norm": 2.3323949793109238, "learning_rate": 0.0, "loss": 1.0469, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 2.317201200688066, "learning_rate": 3.125e-07, "loss": 0.9931, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 2.603689956679125, "learning_rate": 6.25e-07, "loss": 1.0188, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 2.2583787301898592, "learning_rate": 9.375000000000001e-07, "loss": 0.9097, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 2.197466891038096, "learning_rate": 1.25e-06, "loss": 1.0459, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 2.1259963361099747, "learning_rate": 1.5625e-06, "loss": 0.9986, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 2.0707820881041, "learning_rate": 1.8750000000000003e-06, "loss": 0.9555, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 1.870407527874291, "learning_rate": 2.1875000000000002e-06, "loss": 0.952, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 1.8578085390534953, "learning_rate": 2.5e-06, "loss": 0.9993, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 1.881148688458384, "learning_rate": 2.8125e-06, "loss": 0.9373, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 1.6917769845914787, "learning_rate": 3.125e-06, "loss": 0.8839, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 1.2541345576396532, "learning_rate": 3.4375e-06, "loss": 0.9909, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 1.4038335670152517, "learning_rate": 3.7500000000000005e-06, "loss": 0.9322, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 1.363468897891553, "learning_rate": 4.0625000000000005e-06, "loss": 1.0934, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 1.1331989679866032, "learning_rate": 4.3750000000000005e-06, "loss": 0.977, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 0.9899834287202586, "learning_rate": 4.6875000000000004e-06, "loss": 1.0443, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 1.155920523517074, "learning_rate": 5e-06, "loss": 0.9483, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 1.2715867938274161, "learning_rate": 5.3125e-06, "loss": 1.0096, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 0.9922231339593638, "learning_rate": 5.625e-06, "loss": 0.7463, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 1.2551959582539625, "learning_rate": 5.9375e-06, "loss": 0.9226, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 0.892951024999124, "learning_rate": 6.25e-06, "loss": 0.988, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 1.4360539096520086, "learning_rate": 6.5625e-06, "loss": 1.0509, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 1.1100051374669628, "learning_rate": 6.875e-06, "loss": 0.8728, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 0.9630208551024003, "learning_rate": 7.1875e-06, "loss": 0.8352, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 1.109963225007402, "learning_rate": 7.500000000000001e-06, "loss": 1.0289, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 0.842175710243708, "learning_rate": 7.8125e-06, "loss": 0.8616, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 0.8255762742603932, "learning_rate": 8.125000000000001e-06, "loss": 0.7234, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8274507712792363, "learning_rate": 8.4375e-06, "loss": 0.9758, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 0.7834224887700044, "learning_rate": 8.750000000000001e-06, "loss": 0.9056, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 1.187020605300137, "learning_rate": 9.0625e-06, "loss": 0.9481, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 1.0233176856791018, "learning_rate": 9.375000000000001e-06, "loss": 0.9194, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 0.848791394024066, "learning_rate": 9.6875e-06, "loss": 0.8852, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 0.8289281876622956, "learning_rate": 1e-05, "loss": 1.038, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 0.7738330911179299, "learning_rate": 9.999691920767945e-06, "loss": 0.8374, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 0.65004421093035, "learning_rate": 9.998767721036901e-06, "loss": 0.8242, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.718229691257778, "learning_rate": 9.997227514697568e-06, "loss": 0.9693, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 0.598178727036991, "learning_rate": 9.99507149155218e-06, "loss": 0.9843, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.6896420594948925, "learning_rate": 9.992299917291118e-06, "loss": 0.848, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 0.7218001479564617, "learning_rate": 9.98891313346017e-06, "loss": 0.9095, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 0.673383804041238, "learning_rate": 9.984911557418444e-06, "loss": 0.7682, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 0.9044903125501461, "learning_rate": 9.980295682286924e-06, "loss": 0.8388, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6528626470394925, "learning_rate": 9.97506607688772e-06, "loss": 0.9107, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 0.5248039585149111, "learning_rate": 9.969223385673958e-06, "loss": 0.8307, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 0.568338771820042, "learning_rate": 9.962768328650367e-06, "loss": 0.7523, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5429855696185105, "learning_rate": 9.95570170128455e-06, "loss": 0.8442, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 0.5098426033492849, "learning_rate": 9.94802437440896e-06, "loss": 0.7962, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 0.6078990273192543, "learning_rate": 9.939737294113585e-06, "loss": 0.8969, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.4709547244829324, "learning_rate": 9.930841481629358e-06, "loss": 0.8885, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 0.54039591858629, "learning_rate": 9.92133803320231e-06, "loss": 0.7818, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 0.4875170254753124, "learning_rate": 9.91122811995848e-06, "loss": 0.8193, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 0.5005396928536703, "learning_rate": 9.90051298775959e-06, "loss": 0.8692, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 0.40245216027036546, "learning_rate": 9.88919395704952e-06, "loss": 0.826, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 0.5389952051377087, "learning_rate": 9.877272422691583e-06, "loss": 0.9318, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5638980417584056, "learning_rate": 9.864749853796642e-06, "loss": 0.7985, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 0.5506830661309166, "learning_rate": 9.85162779354206e-06, "loss": 0.7291, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.48566023212019677, "learning_rate": 9.837907858981536e-06, "loss": 0.8802, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 0.4725406192484581, "learning_rate": 9.823591740845831e-06, "loss": 0.8627, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 0.5270784935436914, "learning_rate": 9.808681203334416e-06, "loss": 0.7976, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 0.4795159174595573, "learning_rate": 9.793178083898073e-06, "loss": 0.8783, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 0.42309628953003137, "learning_rate": 9.777084293012448e-06, "loss": 0.842, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 0.464555539059811, "learning_rate": 9.760401813942641e-06, "loss": 0.7662, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 0.5141212041737542, "learning_rate": 9.743132702498785e-06, "loss": 0.8688, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.5165788253828009, "learning_rate": 9.725279086782719e-06, "loss": 0.768, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.576629868282963, "learning_rate": 9.706843166925733e-06, "loss": 0.7989, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 0.4946943998511545, "learning_rate": 9.687827214817433e-06, "loss": 0.8261, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.4987216606535057, "learning_rate": 9.668233573825794e-06, "loss": 0.8905, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 0.45688977932466196, "learning_rate": 9.64806465850836e-06, "loss": 0.7327, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.5226340006885853, "learning_rate": 9.62732295431471e-06, "loss": 0.7311, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.6684025298786129, "learning_rate": 9.606011017280166e-06, "loss": 0.8971, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5147703758608321, "learning_rate": 9.5841314737108e-06, "loss": 0.7652, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.5417227409614662, "learning_rate": 9.56168701985981e-06, "loss": 0.7999, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5016561221704748, "learning_rate": 9.538680421595236e-06, "loss": 0.8074, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 0.4853528793957531, "learning_rate": 9.515114514059127e-06, "loss": 0.8135, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 0.47765415470199357, "learning_rate": 9.490992201318165e-06, "loss": 0.7879, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.46535342031003013, "learning_rate": 9.466316456005783e-06, "loss": 0.7762, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.5033568814253909, "learning_rate": 9.441090318955843e-06, "loss": 0.7022, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 0.4986643533291915, "learning_rate": 9.415316898827923e-06, "loss": 0.7349, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 0.43657193718859494, "learning_rate": 9.388999371724212e-06, "loss": 0.8264, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 0.47617277777848616, "learning_rate": 9.362140980798127e-06, "loss": 0.8944, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.4295219607791053, "learning_rate": 9.334745035854646e-06, "loss": 0.7588, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5225987407011279, "learning_rate": 9.306814912942445e-06, "loss": 0.8359, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 0.4173684559568506, "learning_rate": 9.278354053937848e-06, "loss": 0.7804, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 0.5238592049595157, "learning_rate": 9.249365966120692e-06, "loss": 0.8564, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.4526393208745273, "learning_rate": 9.219854221742106e-06, "loss": 0.8102, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 0.44471888761912887, "learning_rate": 9.189822457584311e-06, "loss": 0.7439, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.43731884433734214, "learning_rate": 9.159274374512444e-06, "loss": 0.6592, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 0.4377614076782124, "learning_rate": 9.128213737018493e-06, "loss": 0.806, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.4027105033083121, "learning_rate": 9.096644372757393e-06, "loss": 0.8855, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 0.571463019194369, "learning_rate": 9.064570172075349e-06, "loss": 0.7979, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.4801097800367482, "learning_rate": 9.031995087530403e-06, "loss": 0.7992, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.47255682704462587, "learning_rate": 8.99892313340537e-06, "loss": 0.6633, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 0.4862492507086913, "learning_rate": 8.96535838521314e-06, "loss": 0.8033, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 0.4794987734861929, "learning_rate": 8.931304979194452e-06, "loss": 0.8069, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 0.4658669415595415, "learning_rate": 8.896767111808177e-06, "loss": 0.7371, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.5683125861447418, "learning_rate": 8.861749039214177e-06, "loss": 0.9145, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.47857884026171116, "learning_rate": 8.826255076748823e-06, "loss": 0.8455, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.429389167302876, "learning_rate": 8.790289598393186e-06, "loss": 0.7216, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.522031534882144, "learning_rate": 8.753857036234055e-06, "loss": 0.8155, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 0.5375692580431519, "learning_rate": 8.716961879917734e-06, "loss": 0.7373, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.4277716225580266, "learning_rate": 8.679608676096793e-06, "loss": 0.8132, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.9709114563751018, "learning_rate": 8.641802027869774e-06, "loss": 0.7952, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.6722991060253756, "learning_rate": 8.603546594213935e-06, "loss": 0.8566, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.48227435877100366, "learning_rate": 8.564847089411128e-06, "loss": 0.8292, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 0.43738769808282163, "learning_rate": 8.525708282466839e-06, "loss": 0.8424, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.42758983764847835, "learning_rate": 8.486134996522502e-06, "loss": 0.8179, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.6465752665836958, "learning_rate": 8.446132108261136e-06, "loss": 0.806, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 0.5216064305348748, "learning_rate": 8.405704547306379e-06, "loss": 0.8041, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.46284349128240304, "learning_rate": 8.364857295615006e-06, "loss": 0.8924, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.48814352812138595, "learning_rate": 8.323595386862985e-06, "loss": 0.7929, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.48088506678769916, "learning_rate": 8.281923905825188e-06, "loss": 0.7671, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 0.4594586947272896, "learning_rate": 8.23984798774876e-06, "loss": 0.7366, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.4673793179812366, "learning_rate": 8.197372817720314e-06, "loss": 0.7397, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.6557346369623661, "learning_rate": 8.154503630026955e-06, "loss": 0.7262, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.45128446254113314, "learning_rate": 8.111245707511253e-06, "loss": 0.7213, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.41666335434637974, "learning_rate": 8.067604380920228e-06, "loss": 0.7952, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.4407610683896587, "learning_rate": 8.023585028248435e-06, "loss": 0.8486, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 0.5501977264080524, "learning_rate": 7.979193074075216e-06, "loss": 0.8911, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.459940871244406, "learning_rate": 7.934433988896233e-06, "loss": 0.6535, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.46949896874504654, "learning_rate": 7.889313288449323e-06, "loss": 0.8232, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 0.41110722374315695, "learning_rate": 7.843836533034784e-06, "loss": 0.7628, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.47755036946919965, "learning_rate": 7.798009326830167e-06, "loss": 0.8003, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 0.41342145123270885, "learning_rate": 7.751837317199673e-06, "loss": 0.8683, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.4479867168170251, "learning_rate": 7.705326193998207e-06, "loss": 0.7552, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.4549548876094008, "learning_rate": 7.658481688870218e-06, "loss": 0.7587, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.4684989926335189, "learning_rate": 7.611309574543373e-06, "loss": 0.7607, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.4367513791425883, "learning_rate": 7.563815664117173e-06, "loss": 0.9146, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.7927149278076437, "learning_rate": 7.5160058103465985e-06, "loss": 0.7131, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.5847918647965703, "learning_rate": 7.467885904920864e-06, "loss": 0.7578, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.7836046335272314, "learning_rate": 7.419461877737373e-06, "loss": 0.8327, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 2.1428241341527117, "learning_rate": 7.370739696170971e-06, "loss": 0.7441, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.9566247813485141, "learning_rate": 7.321725364338566e-06, "loss": 0.6185, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.5336099004301172, "learning_rate": 7.272424922359246e-06, "loss": 0.6455, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.7132260718912609, "learning_rate": 7.222844445609931e-06, "loss": 0.7834, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.5749113101610002, "learning_rate": 7.172990043976703e-06, "loss": 0.7296, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.5366676899164674, "learning_rate": 7.122867861101868e-06, "loss": 0.795, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.44931031781346276, "learning_rate": 7.072484073626872e-06, "loss": 0.6875, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 0.6709913679680917, "learning_rate": 7.021844890431136e-06, "loss": 0.7669, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 0.5782700607354144, "learning_rate": 6.970956551866925e-06, "loss": 0.7273, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.5008612890527109, "learning_rate": 6.9198253289903515e-06, "loss": 0.6634, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5733594756270326, "learning_rate": 6.868457522788561e-06, "loss": 0.7358, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.48532685396257946, "learning_rate": 6.816859463403271e-06, "loss": 0.659, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.5460096768726493, "learning_rate": 6.765037509350685e-06, "loss": 0.7585, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.4827715321224934, "learning_rate": 6.7129980467379265e-06, "loss": 0.6664, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5417449745700821, "learning_rate": 6.660747488476066e-06, "loss": 0.663, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.5672091588208017, "learning_rate": 6.608292273489851e-06, "loss": 0.6122, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 0.5264115445856029, "learning_rate": 6.555638865924221e-06, "loss": 0.7035, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5168486054014866, "learning_rate": 6.502793754347721e-06, "loss": 0.7598, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.6085627519823247, "learning_rate": 6.449763450952912e-06, "loss": 0.6875, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 0.504951049632705, "learning_rate": 6.396554490753848e-06, "loss": 0.6839, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.42239268629753335, "learning_rate": 6.343173430780769e-06, "loss": 0.8396, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.5170870251352963, "learning_rate": 6.289626849272062e-06, "loss": 0.8013, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.5408561718958109, "learning_rate": 6.2359213448636104e-06, "loss": 0.754, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.42606389993166277, "learning_rate": 6.182063535775634e-06, "loss": 0.7662, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.41021417431281776, "learning_rate": 6.1280600589971225e-06, "loss": 0.791, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.4068459581892925, "learning_rate": 6.073917569467934e-06, "loss": 0.8066, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.40243757072180364, "learning_rate": 6.0196427392587085e-06, "loss": 0.7061, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 0.5924677871750427, "learning_rate": 5.96524225674865e-06, "loss": 0.744, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.4344103520994765, "learning_rate": 5.9107228258013085e-06, "loss": 0.7076, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.4824828219676673, "learning_rate": 5.856091164938451e-06, "loss": 0.6534, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.4197375023372333, "learning_rate": 5.801354006512127e-06, "loss": 0.6902, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.4523354962317184, "learning_rate": 5.746518095875033e-06, "loss": 0.6996, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.41073692830700287, "learning_rate": 5.6915901905492586e-06, "loss": 0.629, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.5807356357914126, "learning_rate": 5.6365770593935665e-06, "loss": 0.5924, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.5296154741304107, "learning_rate": 5.581485481769231e-06, "loss": 0.7197, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.4462893254042338, "learning_rate": 5.526322246704628e-06, "loss": 0.8007, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.3974463949753287, "learning_rate": 5.471094152058592e-06, "loss": 0.6822, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.46244966479154553, "learning_rate": 5.415808003682717e-06, "loss": 0.7318, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.438557400530548, "learning_rate": 5.360470614582661e-06, "loss": 0.7147, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.5680373876053647, "learning_rate": 5.305088804078559e-06, "loss": 0.7357, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.4556205137087138, "learning_rate": 5.249669396964665e-06, "loss": 0.6361, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.44940699263796485, "learning_rate": 5.1942192226683385e-06, "loss": 0.7778, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.47535854965434626, "learning_rate": 5.138745114408427e-06, "loss": 0.6008, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 0.5020715004802897, "learning_rate": 5.083253908353193e-06, "loss": 0.6696, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.4715489187155987, "learning_rate": 5.0277524427778986e-06, "loss": 0.7846, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.44938039077917374, "learning_rate": 4.972247557222102e-06, "loss": 0.7187, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.536309868809644, "learning_rate": 4.916746091646808e-06, "loss": 0.6818, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.4238224566275176, "learning_rate": 4.8612548855915755e-06, "loss": 0.7252, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.5075369152051689, "learning_rate": 4.805780777331662e-06, "loss": 0.7461, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.463068134108742, "learning_rate": 4.750330603035336e-06, "loss": 0.7141, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.44910366292391646, "learning_rate": 4.694911195921443e-06, "loss": 0.7278, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.43362119780351166, "learning_rate": 4.6395293854173395e-06, "loss": 0.6069, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.7285135499415637, "learning_rate": 4.584191996317285e-06, "loss": 0.6846, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.49976201370002465, "learning_rate": 4.528905847941411e-06, "loss": 0.843, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.47745344638517, "learning_rate": 4.473677753295375e-06, "loss": 0.6609, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 0.4075892143069301, "learning_rate": 4.418514518230769e-06, "loss": 0.7133, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.490679894902017, "learning_rate": 4.363422940606435e-06, "loss": 0.7483, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.507751484260846, "learning_rate": 4.308409809450742e-06, "loss": 0.7635, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.5129728167302848, "learning_rate": 4.253481904124968e-06, "loss": 0.7353, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.44280290900369257, "learning_rate": 4.198645993487872e-06, "loss": 0.6059, "step": 189 } ], "logging_steps": 1, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 47138450767872.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }