{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005405405405405406, "grad_norm": 2.034959316253662, "learning_rate": 1e-05, "loss": 2.3274, "step": 1 }, { "epoch": 0.010810810810810811, "grad_norm": 1.8778449296951294, "learning_rate": 9.945945945945947e-06, "loss": 2.227, "step": 2 }, { "epoch": 0.016216216216216217, "grad_norm": 1.820204734802246, "learning_rate": 9.891891891891893e-06, "loss": 2.3029, "step": 3 }, { "epoch": 0.021621621621621623, "grad_norm": 1.6635195016860962, "learning_rate": 9.83783783783784e-06, "loss": 2.2474, "step": 4 }, { "epoch": 0.02702702702702703, "grad_norm": 1.362695336341858, "learning_rate": 9.783783783783785e-06, "loss": 2.0898, "step": 5 }, { "epoch": 0.032432432432432434, "grad_norm": 1.105480670928955, "learning_rate": 9.729729729729732e-06, "loss": 2.0848, "step": 6 }, { "epoch": 0.03783783783783784, "grad_norm": 0.9252412915229797, "learning_rate": 9.675675675675676e-06, "loss": 2.0229, "step": 7 }, { "epoch": 0.043243243243243246, "grad_norm": 0.7585290670394897, "learning_rate": 9.621621621621622e-06, "loss": 1.9767, "step": 8 }, { "epoch": 0.04864864864864865, "grad_norm": 0.7198324203491211, "learning_rate": 9.567567567567568e-06, "loss": 1.9532, "step": 9 }, { "epoch": 0.05405405405405406, "grad_norm": 0.7078152298927307, "learning_rate": 9.513513513513514e-06, "loss": 1.845, "step": 10 }, { "epoch": 0.05945945945945946, "grad_norm": 0.7482191920280457, "learning_rate": 9.45945945945946e-06, "loss": 1.9299, "step": 11 }, { "epoch": 0.06486486486486487, "grad_norm": 0.7480123043060303, "learning_rate": 9.405405405405407e-06, "loss": 1.8255, "step": 12 }, { "epoch": 0.07027027027027027, "grad_norm": 0.8021811842918396, "learning_rate": 9.351351351351353e-06, "loss": 1.8584, "step": 13 }, { "epoch": 0.07567567567567568, "grad_norm": 0.8102870583534241, "learning_rate": 9.297297297297299e-06, "loss": 1.8065, "step": 14 }, { "epoch": 0.08108108108108109, "grad_norm": 0.8265206813812256, "learning_rate": 9.243243243243243e-06, "loss": 1.8241, "step": 15 }, { "epoch": 0.08648648648648649, "grad_norm": 0.7550373673439026, "learning_rate": 9.189189189189191e-06, "loss": 1.7726, "step": 16 }, { "epoch": 0.0918918918918919, "grad_norm": 0.6808019876480103, "learning_rate": 9.135135135135136e-06, "loss": 1.7703, "step": 17 }, { "epoch": 0.0972972972972973, "grad_norm": 0.6290677189826965, "learning_rate": 9.081081081081082e-06, "loss": 1.7586, "step": 18 }, { "epoch": 0.10270270270270271, "grad_norm": 0.6062494516372681, "learning_rate": 9.027027027027028e-06, "loss": 1.7713, "step": 19 }, { "epoch": 0.10810810810810811, "grad_norm": 0.5270338654518127, "learning_rate": 8.972972972972974e-06, "loss": 1.652, "step": 20 }, { "epoch": 0.11351351351351352, "grad_norm": 0.5446780920028687, "learning_rate": 8.91891891891892e-06, "loss": 1.6587, "step": 21 }, { "epoch": 0.11891891891891893, "grad_norm": 0.500572681427002, "learning_rate": 8.864864864864866e-06, "loss": 1.6509, "step": 22 }, { "epoch": 0.12432432432432433, "grad_norm": 0.5181313157081604, "learning_rate": 8.810810810810811e-06, "loss": 1.5852, "step": 23 }, { "epoch": 0.12972972972972974, "grad_norm": 0.5200425386428833, "learning_rate": 8.756756756756759e-06, "loss": 1.5981, "step": 24 }, { "epoch": 0.13513513513513514, "grad_norm": 0.4856228232383728, "learning_rate": 8.702702702702703e-06, "loss": 1.562, "step": 25 }, { "epoch": 0.14054054054054055, "grad_norm": 0.47276848554611206, "learning_rate": 8.64864864864865e-06, "loss": 1.6072, "step": 26 }, { "epoch": 0.14594594594594595, "grad_norm": 0.4729558229446411, "learning_rate": 8.594594594594595e-06, "loss": 1.6082, "step": 27 }, { "epoch": 0.15135135135135136, "grad_norm": 0.4218498468399048, "learning_rate": 8.540540540540542e-06, "loss": 1.5092, "step": 28 }, { "epoch": 0.15675675675675677, "grad_norm": 0.4382428526878357, "learning_rate": 8.486486486486488e-06, "loss": 1.5741, "step": 29 }, { "epoch": 0.16216216216216217, "grad_norm": 0.4070360064506531, "learning_rate": 8.432432432432434e-06, "loss": 1.5187, "step": 30 }, { "epoch": 0.16756756756756758, "grad_norm": 0.41073259711265564, "learning_rate": 8.378378378378378e-06, "loss": 1.4882, "step": 31 }, { "epoch": 0.17297297297297298, "grad_norm": 0.4228561222553253, "learning_rate": 8.324324324324326e-06, "loss": 1.4688, "step": 32 }, { "epoch": 0.1783783783783784, "grad_norm": 0.39718520641326904, "learning_rate": 8.27027027027027e-06, "loss": 1.4849, "step": 33 }, { "epoch": 0.1837837837837838, "grad_norm": 0.4296083152294159, "learning_rate": 8.216216216216217e-06, "loss": 1.4872, "step": 34 }, { "epoch": 0.1891891891891892, "grad_norm": 0.4223209321498871, "learning_rate": 8.162162162162163e-06, "loss": 1.4476, "step": 35 }, { "epoch": 0.1945945945945946, "grad_norm": 0.40885603427886963, "learning_rate": 8.108108108108109e-06, "loss": 1.4787, "step": 36 }, { "epoch": 0.2, "grad_norm": 0.42571428418159485, "learning_rate": 8.054054054054055e-06, "loss": 1.4734, "step": 37 }, { "epoch": 0.20540540540540542, "grad_norm": 0.3889505863189697, "learning_rate": 8.000000000000001e-06, "loss": 1.4245, "step": 38 }, { "epoch": 0.21081081081081082, "grad_norm": 0.39451542496681213, "learning_rate": 7.945945945945946e-06, "loss": 1.4509, "step": 39 }, { "epoch": 0.21621621621621623, "grad_norm": 0.3710978329181671, "learning_rate": 7.891891891891894e-06, "loss": 1.4155, "step": 40 }, { "epoch": 0.22162162162162163, "grad_norm": 0.35685208439826965, "learning_rate": 7.837837837837838e-06, "loss": 1.4081, "step": 41 }, { "epoch": 0.22702702702702704, "grad_norm": 0.33981138467788696, "learning_rate": 7.783783783783784e-06, "loss": 1.3643, "step": 42 }, { "epoch": 0.23243243243243245, "grad_norm": 0.3623262941837311, "learning_rate": 7.72972972972973e-06, "loss": 1.4029, "step": 43 }, { "epoch": 0.23783783783783785, "grad_norm": 0.3363046646118164, "learning_rate": 7.675675675675676e-06, "loss": 1.3669, "step": 44 }, { "epoch": 0.24324324324324326, "grad_norm": 0.33891770243644714, "learning_rate": 7.621621621621622e-06, "loss": 1.3635, "step": 45 }, { "epoch": 0.24864864864864866, "grad_norm": 0.33656665682792664, "learning_rate": 7.567567567567569e-06, "loss": 1.352, "step": 46 }, { "epoch": 0.25405405405405407, "grad_norm": 0.3358979821205139, "learning_rate": 7.513513513513514e-06, "loss": 1.3606, "step": 47 }, { "epoch": 0.2594594594594595, "grad_norm": 0.32272347807884216, "learning_rate": 7.45945945945946e-06, "loss": 1.342, "step": 48 }, { "epoch": 0.2648648648648649, "grad_norm": 0.3023531436920166, "learning_rate": 7.4054054054054055e-06, "loss": 1.3267, "step": 49 }, { "epoch": 0.2702702702702703, "grad_norm": 0.3095267713069916, "learning_rate": 7.3513513513513525e-06, "loss": 1.342, "step": 50 }, { "epoch": 0.2756756756756757, "grad_norm": 0.3033177852630615, "learning_rate": 7.297297297297298e-06, "loss": 1.3611, "step": 51 }, { "epoch": 0.2810810810810811, "grad_norm": 0.29411327838897705, "learning_rate": 7.243243243243244e-06, "loss": 1.3114, "step": 52 }, { "epoch": 0.2864864864864865, "grad_norm": 0.28862670063972473, "learning_rate": 7.189189189189189e-06, "loss": 1.3045, "step": 53 }, { "epoch": 0.2918918918918919, "grad_norm": 0.33967703580856323, "learning_rate": 7.135135135135136e-06, "loss": 1.2879, "step": 54 }, { "epoch": 0.2972972972972973, "grad_norm": 0.3063770532608032, "learning_rate": 7.0810810810810815e-06, "loss": 1.3237, "step": 55 }, { "epoch": 0.3027027027027027, "grad_norm": 0.2903819978237152, "learning_rate": 7.027027027027028e-06, "loss": 1.3088, "step": 56 }, { "epoch": 0.3081081081081081, "grad_norm": 0.2815365791320801, "learning_rate": 6.972972972972973e-06, "loss": 1.3184, "step": 57 }, { "epoch": 0.31351351351351353, "grad_norm": 0.3042902946472168, "learning_rate": 6.91891891891892e-06, "loss": 1.3182, "step": 58 }, { "epoch": 0.31891891891891894, "grad_norm": 0.27602043747901917, "learning_rate": 6.864864864864865e-06, "loss": 1.3051, "step": 59 }, { "epoch": 0.32432432432432434, "grad_norm": 0.30109262466430664, "learning_rate": 6.810810810810811e-06, "loss": 1.3268, "step": 60 }, { "epoch": 0.32972972972972975, "grad_norm": 0.27311018109321594, "learning_rate": 6.7567567567567575e-06, "loss": 1.305, "step": 61 }, { "epoch": 0.33513513513513515, "grad_norm": 0.2715848684310913, "learning_rate": 6.702702702702704e-06, "loss": 1.3034, "step": 62 }, { "epoch": 0.34054054054054056, "grad_norm": 0.2581041157245636, "learning_rate": 6.648648648648649e-06, "loss": 1.2567, "step": 63 }, { "epoch": 0.34594594594594597, "grad_norm": 0.26454904675483704, "learning_rate": 6.594594594594595e-06, "loss": 1.2792, "step": 64 }, { "epoch": 0.35135135135135137, "grad_norm": 0.27279508113861084, "learning_rate": 6.540540540540541e-06, "loss": 1.2745, "step": 65 }, { "epoch": 0.3567567567567568, "grad_norm": 0.27239367365837097, "learning_rate": 6.486486486486487e-06, "loss": 1.2947, "step": 66 }, { "epoch": 0.3621621621621622, "grad_norm": 0.2796631455421448, "learning_rate": 6.432432432432433e-06, "loss": 1.2705, "step": 67 }, { "epoch": 0.3675675675675676, "grad_norm": 0.266652911901474, "learning_rate": 6.378378378378379e-06, "loss": 1.2441, "step": 68 }, { "epoch": 0.372972972972973, "grad_norm": 0.26300087571144104, "learning_rate": 6.324324324324325e-06, "loss": 1.259, "step": 69 }, { "epoch": 0.3783783783783784, "grad_norm": 0.3145560622215271, "learning_rate": 6.270270270270271e-06, "loss": 1.3417, "step": 70 }, { "epoch": 0.3837837837837838, "grad_norm": 0.2778513729572296, "learning_rate": 6.2162162162162164e-06, "loss": 1.2198, "step": 71 }, { "epoch": 0.3891891891891892, "grad_norm": 0.25896984338760376, "learning_rate": 6.162162162162163e-06, "loss": 1.231, "step": 72 }, { "epoch": 0.3945945945945946, "grad_norm": 0.2606508433818817, "learning_rate": 6.108108108108109e-06, "loss": 1.2799, "step": 73 }, { "epoch": 0.4, "grad_norm": 0.2623414695262909, "learning_rate": 6.054054054054055e-06, "loss": 1.2269, "step": 74 }, { "epoch": 0.40540540540540543, "grad_norm": 0.26297539472579956, "learning_rate": 6e-06, "loss": 1.2751, "step": 75 }, { "epoch": 0.41081081081081083, "grad_norm": 0.24481281638145447, "learning_rate": 5.945945945945947e-06, "loss": 1.2047, "step": 76 }, { "epoch": 0.41621621621621624, "grad_norm": 0.2644273340702057, "learning_rate": 5.8918918918918924e-06, "loss": 1.2507, "step": 77 }, { "epoch": 0.42162162162162165, "grad_norm": 0.2676522731781006, "learning_rate": 5.837837837837839e-06, "loss": 1.2374, "step": 78 }, { "epoch": 0.42702702702702705, "grad_norm": 0.2541520297527313, "learning_rate": 5.783783783783784e-06, "loss": 1.2225, "step": 79 }, { "epoch": 0.43243243243243246, "grad_norm": 0.252860426902771, "learning_rate": 5.729729729729731e-06, "loss": 1.225, "step": 80 }, { "epoch": 0.43783783783783786, "grad_norm": 0.24671323597431183, "learning_rate": 5.675675675675676e-06, "loss": 1.1988, "step": 81 }, { "epoch": 0.44324324324324327, "grad_norm": 0.25774669647216797, "learning_rate": 5.621621621621622e-06, "loss": 1.1917, "step": 82 }, { "epoch": 0.4486486486486487, "grad_norm": 0.2582065165042877, "learning_rate": 5.567567567567568e-06, "loss": 1.2049, "step": 83 }, { "epoch": 0.4540540540540541, "grad_norm": 0.2658644914627075, "learning_rate": 5.513513513513515e-06, "loss": 1.1864, "step": 84 }, { "epoch": 0.4594594594594595, "grad_norm": 0.2440362274646759, "learning_rate": 5.45945945945946e-06, "loss": 1.1772, "step": 85 }, { "epoch": 0.4648648648648649, "grad_norm": 0.2492218315601349, "learning_rate": 5.405405405405406e-06, "loss": 1.2137, "step": 86 }, { "epoch": 0.4702702702702703, "grad_norm": 0.2513432204723358, "learning_rate": 5.351351351351351e-06, "loss": 1.2077, "step": 87 }, { "epoch": 0.4756756756756757, "grad_norm": 0.252913236618042, "learning_rate": 5.297297297297298e-06, "loss": 1.215, "step": 88 }, { "epoch": 0.4810810810810811, "grad_norm": 0.24399417638778687, "learning_rate": 5.243243243243244e-06, "loss": 1.1977, "step": 89 }, { "epoch": 0.4864864864864865, "grad_norm": 0.2553783059120178, "learning_rate": 5.18918918918919e-06, "loss": 1.2004, "step": 90 }, { "epoch": 0.4918918918918919, "grad_norm": 0.24357211589813232, "learning_rate": 5.135135135135135e-06, "loss": 1.1741, "step": 91 }, { "epoch": 0.4972972972972973, "grad_norm": 0.2634775638580322, "learning_rate": 5.081081081081082e-06, "loss": 1.1964, "step": 92 }, { "epoch": 0.5027027027027027, "grad_norm": 0.24633990228176117, "learning_rate": 5.027027027027027e-06, "loss": 1.1541, "step": 93 }, { "epoch": 0.5081081081081081, "grad_norm": 0.2492210417985916, "learning_rate": 4.9729729729729735e-06, "loss": 1.1663, "step": 94 }, { "epoch": 0.5135135135135135, "grad_norm": 0.24858078360557556, "learning_rate": 4.91891891891892e-06, "loss": 1.1684, "step": 95 }, { "epoch": 0.518918918918919, "grad_norm": 0.24967968463897705, "learning_rate": 4.864864864864866e-06, "loss": 1.1695, "step": 96 }, { "epoch": 0.5243243243243243, "grad_norm": 0.24852266907691956, "learning_rate": 4.810810810810811e-06, "loss": 1.1529, "step": 97 }, { "epoch": 0.5297297297297298, "grad_norm": 0.24503152072429657, "learning_rate": 4.756756756756757e-06, "loss": 1.1689, "step": 98 }, { "epoch": 0.5351351351351351, "grad_norm": 0.25913184881210327, "learning_rate": 4.702702702702703e-06, "loss": 1.1888, "step": 99 }, { "epoch": 0.5405405405405406, "grad_norm": 0.23641978204250336, "learning_rate": 4.6486486486486495e-06, "loss": 1.1676, "step": 100 }, { "epoch": 0.5459459459459459, "grad_norm": 0.25850117206573486, "learning_rate": 4.594594594594596e-06, "loss": 1.1609, "step": 101 }, { "epoch": 0.5513513513513514, "grad_norm": 0.24710091948509216, "learning_rate": 4.540540540540541e-06, "loss": 1.1522, "step": 102 }, { "epoch": 0.5567567567567567, "grad_norm": 0.2579835057258606, "learning_rate": 4.486486486486487e-06, "loss": 1.1872, "step": 103 }, { "epoch": 0.5621621621621622, "grad_norm": 0.26577505469322205, "learning_rate": 4.432432432432433e-06, "loss": 1.1775, "step": 104 }, { "epoch": 0.5675675675675675, "grad_norm": 0.289943665266037, "learning_rate": 4.378378378378379e-06, "loss": 1.1849, "step": 105 }, { "epoch": 0.572972972972973, "grad_norm": 0.29339632391929626, "learning_rate": 4.324324324324325e-06, "loss": 1.1607, "step": 106 }, { "epoch": 0.5783783783783784, "grad_norm": 0.24826812744140625, "learning_rate": 4.270270270270271e-06, "loss": 1.1834, "step": 107 }, { "epoch": 0.5837837837837838, "grad_norm": 0.24059158563613892, "learning_rate": 4.216216216216217e-06, "loss": 1.1611, "step": 108 }, { "epoch": 0.5891891891891892, "grad_norm": 0.2687944769859314, "learning_rate": 4.162162162162163e-06, "loss": 1.1515, "step": 109 }, { "epoch": 0.5945945945945946, "grad_norm": 0.27312615513801575, "learning_rate": 4.108108108108108e-06, "loss": 1.1971, "step": 110 }, { "epoch": 0.6, "grad_norm": 0.27208036184310913, "learning_rate": 4.0540540540540545e-06, "loss": 1.1789, "step": 111 }, { "epoch": 0.6054054054054054, "grad_norm": 0.2820265591144562, "learning_rate": 4.000000000000001e-06, "loss": 1.1981, "step": 112 }, { "epoch": 0.6108108108108108, "grad_norm": 0.26071199774742126, "learning_rate": 3.945945945945947e-06, "loss": 1.1936, "step": 113 }, { "epoch": 0.6162162162162163, "grad_norm": 0.2583164870738983, "learning_rate": 3.891891891891892e-06, "loss": 1.1672, "step": 114 }, { "epoch": 0.6216216216216216, "grad_norm": 0.27570709586143494, "learning_rate": 3.837837837837838e-06, "loss": 1.2096, "step": 115 }, { "epoch": 0.6270270270270271, "grad_norm": 0.2592144012451172, "learning_rate": 3.7837837837837844e-06, "loss": 1.169, "step": 116 }, { "epoch": 0.6324324324324324, "grad_norm": 0.26106804609298706, "learning_rate": 3.72972972972973e-06, "loss": 1.1912, "step": 117 }, { "epoch": 0.6378378378378379, "grad_norm": 0.25909966230392456, "learning_rate": 3.6756756756756763e-06, "loss": 1.1462, "step": 118 }, { "epoch": 0.6432432432432432, "grad_norm": 0.28418397903442383, "learning_rate": 3.621621621621622e-06, "loss": 1.1584, "step": 119 }, { "epoch": 0.6486486486486487, "grad_norm": 0.26269835233688354, "learning_rate": 3.567567567567568e-06, "loss": 1.1556, "step": 120 }, { "epoch": 0.654054054054054, "grad_norm": 0.2741927206516266, "learning_rate": 3.513513513513514e-06, "loss": 1.1681, "step": 121 }, { "epoch": 0.6594594594594595, "grad_norm": 0.2586127519607544, "learning_rate": 3.45945945945946e-06, "loss": 1.1531, "step": 122 }, { "epoch": 0.6648648648648648, "grad_norm": 0.25531938672065735, "learning_rate": 3.4054054054054057e-06, "loss": 1.1636, "step": 123 }, { "epoch": 0.6702702702702703, "grad_norm": 0.2960667610168457, "learning_rate": 3.351351351351352e-06, "loss": 1.1818, "step": 124 }, { "epoch": 0.6756756756756757, "grad_norm": 0.31269460916519165, "learning_rate": 3.2972972972972976e-06, "loss": 1.2105, "step": 125 }, { "epoch": 0.6810810810810811, "grad_norm": 0.2489175945520401, "learning_rate": 3.2432432432432437e-06, "loss": 1.1327, "step": 126 }, { "epoch": 0.6864864864864865, "grad_norm": 0.27137380838394165, "learning_rate": 3.1891891891891894e-06, "loss": 1.1882, "step": 127 }, { "epoch": 0.6918918918918919, "grad_norm": 0.27693238854408264, "learning_rate": 3.1351351351351356e-06, "loss": 1.1574, "step": 128 }, { "epoch": 0.6972972972972973, "grad_norm": 0.26204511523246765, "learning_rate": 3.0810810810810817e-06, "loss": 1.1535, "step": 129 }, { "epoch": 0.7027027027027027, "grad_norm": 0.2716626822948456, "learning_rate": 3.0270270270270274e-06, "loss": 1.1572, "step": 130 }, { "epoch": 0.7081081081081081, "grad_norm": 0.2737910747528076, "learning_rate": 2.9729729729729736e-06, "loss": 1.1453, "step": 131 }, { "epoch": 0.7135135135135136, "grad_norm": 0.2846289873123169, "learning_rate": 2.9189189189189193e-06, "loss": 1.1371, "step": 132 }, { "epoch": 0.7189189189189189, "grad_norm": 0.2652347683906555, "learning_rate": 2.8648648648648654e-06, "loss": 1.1399, "step": 133 }, { "epoch": 0.7243243243243244, "grad_norm": 0.26175656914711, "learning_rate": 2.810810810810811e-06, "loss": 1.1691, "step": 134 }, { "epoch": 0.7297297297297297, "grad_norm": 0.274801641702652, "learning_rate": 2.7567567567567573e-06, "loss": 1.1563, "step": 135 }, { "epoch": 0.7351351351351352, "grad_norm": 0.2784542143344879, "learning_rate": 2.702702702702703e-06, "loss": 1.178, "step": 136 }, { "epoch": 0.7405405405405405, "grad_norm": 0.26470088958740234, "learning_rate": 2.648648648648649e-06, "loss": 1.1688, "step": 137 }, { "epoch": 0.745945945945946, "grad_norm": 0.27295202016830444, "learning_rate": 2.594594594594595e-06, "loss": 1.1403, "step": 138 }, { "epoch": 0.7513513513513513, "grad_norm": 0.2557908892631531, "learning_rate": 2.540540540540541e-06, "loss": 1.1573, "step": 139 }, { "epoch": 0.7567567567567568, "grad_norm": 0.27424272894859314, "learning_rate": 2.4864864864864867e-06, "loss": 1.1423, "step": 140 }, { "epoch": 0.7621621621621621, "grad_norm": 0.2811051607131958, "learning_rate": 2.432432432432433e-06, "loss": 1.1549, "step": 141 }, { "epoch": 0.7675675675675676, "grad_norm": 0.25805169343948364, "learning_rate": 2.3783783783783786e-06, "loss": 1.1486, "step": 142 }, { "epoch": 0.772972972972973, "grad_norm": 0.26995599269866943, "learning_rate": 2.3243243243243247e-06, "loss": 1.1357, "step": 143 }, { "epoch": 0.7783783783783784, "grad_norm": 0.2542300224304199, "learning_rate": 2.2702702702702705e-06, "loss": 1.1208, "step": 144 }, { "epoch": 0.7837837837837838, "grad_norm": 0.24706785380840302, "learning_rate": 2.2162162162162166e-06, "loss": 1.1324, "step": 145 }, { "epoch": 0.7891891891891892, "grad_norm": 0.2644682824611664, "learning_rate": 2.1621621621621623e-06, "loss": 1.121, "step": 146 }, { "epoch": 0.7945945945945946, "grad_norm": 0.2919802665710449, "learning_rate": 2.1081081081081085e-06, "loss": 1.1779, "step": 147 }, { "epoch": 0.8, "grad_norm": 0.2681489586830139, "learning_rate": 2.054054054054054e-06, "loss": 1.1651, "step": 148 }, { "epoch": 0.8054054054054054, "grad_norm": 0.24865786731243134, "learning_rate": 2.0000000000000003e-06, "loss": 1.1272, "step": 149 }, { "epoch": 0.8108108108108109, "grad_norm": 0.26049530506134033, "learning_rate": 1.945945945945946e-06, "loss": 1.1299, "step": 150 }, { "epoch": 0.8162162162162162, "grad_norm": 0.2903294265270233, "learning_rate": 1.8918918918918922e-06, "loss": 1.1751, "step": 151 }, { "epoch": 0.8216216216216217, "grad_norm": 0.274706095457077, "learning_rate": 1.8378378378378381e-06, "loss": 1.1493, "step": 152 }, { "epoch": 0.827027027027027, "grad_norm": 0.2549731433391571, "learning_rate": 1.783783783783784e-06, "loss": 1.1345, "step": 153 }, { "epoch": 0.8324324324324325, "grad_norm": 0.26216205954551697, "learning_rate": 1.72972972972973e-06, "loss": 1.1316, "step": 154 }, { "epoch": 0.8378378378378378, "grad_norm": 0.26119425892829895, "learning_rate": 1.675675675675676e-06, "loss": 1.132, "step": 155 }, { "epoch": 0.8432432432432433, "grad_norm": 0.26794594526290894, "learning_rate": 1.6216216216216219e-06, "loss": 1.1532, "step": 156 }, { "epoch": 0.8486486486486486, "grad_norm": 0.2842087745666504, "learning_rate": 1.5675675675675678e-06, "loss": 1.0995, "step": 157 }, { "epoch": 0.8540540540540541, "grad_norm": 0.26724207401275635, "learning_rate": 1.5135135135135137e-06, "loss": 1.102, "step": 158 }, { "epoch": 0.8594594594594595, "grad_norm": 0.2525201737880707, "learning_rate": 1.4594594594594596e-06, "loss": 1.095, "step": 159 }, { "epoch": 0.8648648648648649, "grad_norm": 0.2626565098762512, "learning_rate": 1.4054054054054056e-06, "loss": 1.1435, "step": 160 }, { "epoch": 0.8702702702702703, "grad_norm": 0.2600706219673157, "learning_rate": 1.3513513513513515e-06, "loss": 1.1316, "step": 161 }, { "epoch": 0.8756756756756757, "grad_norm": 0.2661210596561432, "learning_rate": 1.2972972972972974e-06, "loss": 1.1423, "step": 162 }, { "epoch": 0.8810810810810811, "grad_norm": 0.2888488173484802, "learning_rate": 1.2432432432432434e-06, "loss": 1.1384, "step": 163 }, { "epoch": 0.8864864864864865, "grad_norm": 0.25049468874931335, "learning_rate": 1.1891891891891893e-06, "loss": 1.1257, "step": 164 }, { "epoch": 0.8918918918918919, "grad_norm": 0.2596016526222229, "learning_rate": 1.1351351351351352e-06, "loss": 1.139, "step": 165 }, { "epoch": 0.8972972972972973, "grad_norm": 0.26817384362220764, "learning_rate": 1.0810810810810812e-06, "loss": 1.1399, "step": 166 }, { "epoch": 0.9027027027027027, "grad_norm": 0.27717313170433044, "learning_rate": 1.027027027027027e-06, "loss": 1.1356, "step": 167 }, { "epoch": 0.9081081081081082, "grad_norm": 0.27815675735473633, "learning_rate": 9.72972972972973e-07, "loss": 1.0882, "step": 168 }, { "epoch": 0.9135135135135135, "grad_norm": 0.26827895641326904, "learning_rate": 9.189189189189191e-07, "loss": 1.1382, "step": 169 }, { "epoch": 0.918918918918919, "grad_norm": 0.26916950941085815, "learning_rate": 8.64864864864865e-07, "loss": 1.1633, "step": 170 }, { "epoch": 0.9243243243243243, "grad_norm": 0.2528124451637268, "learning_rate": 8.108108108108109e-07, "loss": 1.13, "step": 171 }, { "epoch": 0.9297297297297298, "grad_norm": 0.2664795219898224, "learning_rate": 7.567567567567569e-07, "loss": 1.1121, "step": 172 }, { "epoch": 0.9351351351351351, "grad_norm": 0.257194846868515, "learning_rate": 7.027027027027028e-07, "loss": 1.1416, "step": 173 }, { "epoch": 0.9405405405405406, "grad_norm": 0.2620390057563782, "learning_rate": 6.486486486486487e-07, "loss": 1.1354, "step": 174 }, { "epoch": 0.9459459459459459, "grad_norm": 0.25638052821159363, "learning_rate": 5.945945945945947e-07, "loss": 1.15, "step": 175 }, { "epoch": 0.9513513513513514, "grad_norm": 0.25654318928718567, "learning_rate": 5.405405405405406e-07, "loss": 1.114, "step": 176 }, { "epoch": 0.9567567567567568, "grad_norm": 0.3269580006599426, "learning_rate": 4.864864864864865e-07, "loss": 1.1254, "step": 177 }, { "epoch": 0.9621621621621622, "grad_norm": 0.26131343841552734, "learning_rate": 4.324324324324325e-07, "loss": 1.0901, "step": 178 }, { "epoch": 0.9675675675675676, "grad_norm": 0.2583537697792053, "learning_rate": 3.7837837837837843e-07, "loss": 1.1467, "step": 179 }, { "epoch": 0.972972972972973, "grad_norm": 0.2696709930896759, "learning_rate": 3.2432432432432436e-07, "loss": 1.1198, "step": 180 }, { "epoch": 0.9783783783783784, "grad_norm": 0.2895946502685547, "learning_rate": 2.702702702702703e-07, "loss": 1.1572, "step": 181 }, { "epoch": 0.9837837837837838, "grad_norm": 0.25485262274742126, "learning_rate": 2.1621621621621625e-07, "loss": 1.152, "step": 182 }, { "epoch": 0.9891891891891892, "grad_norm": 0.262500524520874, "learning_rate": 1.6216216216216218e-07, "loss": 1.1578, "step": 183 }, { "epoch": 0.9945945945945946, "grad_norm": 0.26408693194389343, "learning_rate": 1.0810810810810812e-07, "loss": 1.1164, "step": 184 }, { "epoch": 1.0, "grad_norm": 0.2597890794277191, "learning_rate": 5.405405405405406e-08, "loss": 1.1132, "step": 185 } ], "logging_steps": 1.0, "max_steps": 185, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3279459925898035e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }