{ "best_metric": 0.6766157746315002, "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b_anyres/checkpoint-250", "epoch": 10.0, "eval_steps": 1.0, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 1.0039058937636163, "learning_rate": 0.0, "loss": 1.3969, "step": 1 }, { "epoch": 0.03125, "eval_loss": 1.4111441373825073, "eval_runtime": 50.4639, "eval_samples_per_second": 3.963, "eval_steps_per_second": 0.495, "step": 1 }, { "epoch": 0.0625, "grad_norm": 0.8420754522690636, "learning_rate": 2e-05, "loss": 1.3382, "step": 2 }, { "epoch": 0.0625, "eval_loss": 1.4111441373825073, "eval_runtime": 43.3333, "eval_samples_per_second": 4.615, "eval_steps_per_second": 0.577, "step": 2 }, { "epoch": 0.09375, "grad_norm": 0.8367925175081548, "learning_rate": 2e-05, "loss": 1.3867, "step": 3 }, { "epoch": 0.09375, "eval_loss": 1.3688743114471436, "eval_runtime": 43.5247, "eval_samples_per_second": 4.595, "eval_steps_per_second": 0.574, "step": 3 }, { "epoch": 0.125, "grad_norm": 0.7061648883003396, "learning_rate": 2e-05, "loss": 1.3331, "step": 4 }, { "epoch": 0.125, "eval_loss": 1.3259124755859375, "eval_runtime": 43.4317, "eval_samples_per_second": 4.605, "eval_steps_per_second": 0.576, "step": 4 }, { "epoch": 0.15625, "grad_norm": 0.8059747640123492, "learning_rate": 2e-05, "loss": 1.3031, "step": 5 }, { "epoch": 0.15625, "eval_loss": 1.2872124910354614, "eval_runtime": 43.4379, "eval_samples_per_second": 4.604, "eval_steps_per_second": 0.576, "step": 5 }, { "epoch": 0.1875, "grad_norm": 0.7045153329302901, "learning_rate": 2e-05, "loss": 1.2771, "step": 6 }, { "epoch": 0.1875, "eval_loss": 1.2505193948745728, "eval_runtime": 43.5902, "eval_samples_per_second": 4.588, "eval_steps_per_second": 0.574, "step": 6 }, { "epoch": 0.21875, "grad_norm": 0.6329971562106237, "learning_rate": 2e-05, "loss": 1.249, "step": 7 }, { "epoch": 0.21875, "eval_loss": 1.2199320793151855, "eval_runtime": 43.4066, "eval_samples_per_second": 4.608, "eval_steps_per_second": 0.576, "step": 7 }, { "epoch": 0.25, "grad_norm": 0.5550979385222247, "learning_rate": 2e-05, "loss": 1.2257, "step": 8 }, { "epoch": 0.25, "eval_loss": 1.1977466344833374, "eval_runtime": 43.5387, "eval_samples_per_second": 4.594, "eval_steps_per_second": 0.574, "step": 8 }, { "epoch": 0.28125, "grad_norm": 0.4406797963422461, "learning_rate": 2e-05, "loss": 1.2462, "step": 9 }, { "epoch": 0.28125, "eval_loss": 1.179214358329773, "eval_runtime": 43.4861, "eval_samples_per_second": 4.599, "eval_steps_per_second": 0.575, "step": 9 }, { "epoch": 0.3125, "grad_norm": 0.42022162096647486, "learning_rate": 2e-05, "loss": 1.1858, "step": 10 }, { "epoch": 0.3125, "eval_loss": 1.1616674661636353, "eval_runtime": 43.8611, "eval_samples_per_second": 4.56, "eval_steps_per_second": 0.57, "step": 10 }, { "epoch": 0.34375, "grad_norm": 0.39691998835013426, "learning_rate": 2e-05, "loss": 1.235, "step": 11 }, { "epoch": 0.34375, "eval_loss": 1.1443771123886108, "eval_runtime": 43.5109, "eval_samples_per_second": 4.597, "eval_steps_per_second": 0.575, "step": 11 }, { "epoch": 0.375, "grad_norm": 0.4500748148291364, "learning_rate": 2e-05, "loss": 1.1953, "step": 12 }, { "epoch": 0.375, "eval_loss": 1.1261780261993408, "eval_runtime": 44.8553, "eval_samples_per_second": 4.459, "eval_steps_per_second": 0.557, "step": 12 }, { "epoch": 0.40625, "grad_norm": 0.4777471950803986, "learning_rate": 2e-05, "loss": 1.2094, "step": 13 }, { "epoch": 0.40625, "eval_loss": 1.1074599027633667, "eval_runtime": 43.6762, "eval_samples_per_second": 4.579, "eval_steps_per_second": 0.572, "step": 13 }, { "epoch": 0.4375, "grad_norm": 0.45433160021015917, "learning_rate": 2e-05, "loss": 1.0426, "step": 14 }, { "epoch": 0.4375, "eval_loss": 1.089483380317688, "eval_runtime": 43.9528, "eval_samples_per_second": 4.55, "eval_steps_per_second": 0.569, "step": 14 }, { "epoch": 0.46875, "grad_norm": 0.39854476457233645, "learning_rate": 2e-05, "loss": 1.1595, "step": 15 }, { "epoch": 0.46875, "eval_loss": 1.0731947422027588, "eval_runtime": 43.3809, "eval_samples_per_second": 4.61, "eval_steps_per_second": 0.576, "step": 15 }, { "epoch": 0.5, "grad_norm": 0.41898459581564557, "learning_rate": 2e-05, "loss": 1.0923, "step": 16 }, { "epoch": 0.5, "eval_loss": 1.0587964057922363, "eval_runtime": 46.3861, "eval_samples_per_second": 4.312, "eval_steps_per_second": 0.539, "step": 16 }, { "epoch": 0.53125, "grad_norm": 0.3748700393546972, "learning_rate": 2e-05, "loss": 1.0973, "step": 17 }, { "epoch": 0.53125, "eval_loss": 1.0456310510635376, "eval_runtime": 44.8571, "eval_samples_per_second": 4.459, "eval_steps_per_second": 0.557, "step": 17 }, { "epoch": 0.5625, "grad_norm": 0.5226526211782249, "learning_rate": 2e-05, "loss": 1.0901, "step": 18 }, { "epoch": 0.5625, "eval_loss": 1.0317203998565674, "eval_runtime": 44.6579, "eval_samples_per_second": 4.478, "eval_steps_per_second": 0.56, "step": 18 }, { "epoch": 0.59375, "grad_norm": 0.3769885031745698, "learning_rate": 2e-05, "loss": 1.0033, "step": 19 }, { "epoch": 0.59375, "eval_loss": 1.0182812213897705, "eval_runtime": 44.6735, "eval_samples_per_second": 4.477, "eval_steps_per_second": 0.56, "step": 19 }, { "epoch": 0.625, "grad_norm": 0.34752776954348064, "learning_rate": 2e-05, "loss": 1.1256, "step": 20 }, { "epoch": 0.625, "eval_loss": 1.0062216520309448, "eval_runtime": 44.4317, "eval_samples_per_second": 4.501, "eval_steps_per_second": 0.563, "step": 20 }, { "epoch": 0.65625, "grad_norm": 0.275958956017114, "learning_rate": 2e-05, "loss": 1.0333, "step": 21 }, { "epoch": 0.65625, "eval_loss": 0.9957399964332581, "eval_runtime": 46.4719, "eval_samples_per_second": 4.304, "eval_steps_per_second": 0.538, "step": 21 }, { "epoch": 0.6875, "grad_norm": 0.31928085878737833, "learning_rate": 2e-05, "loss": 1.0847, "step": 22 }, { "epoch": 0.6875, "eval_loss": 0.9862645864486694, "eval_runtime": 46.7925, "eval_samples_per_second": 4.274, "eval_steps_per_second": 0.534, "step": 22 }, { "epoch": 0.71875, "grad_norm": 0.26966401299568643, "learning_rate": 2e-05, "loss": 1.0678, "step": 23 }, { "epoch": 0.71875, "eval_loss": 0.9774981141090393, "eval_runtime": 46.2095, "eval_samples_per_second": 4.328, "eval_steps_per_second": 0.541, "step": 23 }, { "epoch": 0.75, "grad_norm": 0.24088872786986867, "learning_rate": 2e-05, "loss": 1.064, "step": 24 }, { "epoch": 0.75, "eval_loss": 0.9695597887039185, "eval_runtime": 47.1059, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.531, "step": 24 }, { "epoch": 0.78125, "grad_norm": 0.27631902106476014, "learning_rate": 2e-05, "loss": 1.0141, "step": 25 }, { "epoch": 0.78125, "eval_loss": 0.9618983268737793, "eval_runtime": 46.1528, "eval_samples_per_second": 4.333, "eval_steps_per_second": 0.542, "step": 25 }, { "epoch": 0.8125, "grad_norm": 0.24434161495988888, "learning_rate": 2e-05, "loss": 1.0376, "step": 26 }, { "epoch": 0.8125, "eval_loss": 0.9548751711845398, "eval_runtime": 45.7844, "eval_samples_per_second": 4.368, "eval_steps_per_second": 0.546, "step": 26 }, { "epoch": 0.84375, "grad_norm": 0.25256672152337845, "learning_rate": 2e-05, "loss": 0.9632, "step": 27 }, { "epoch": 0.84375, "eval_loss": 0.9482427835464478, "eval_runtime": 47.8001, "eval_samples_per_second": 4.184, "eval_steps_per_second": 0.523, "step": 27 }, { "epoch": 0.875, "grad_norm": 0.26872334126279845, "learning_rate": 2e-05, "loss": 0.9819, "step": 28 }, { "epoch": 0.875, "eval_loss": 0.9416670203208923, "eval_runtime": 47.157, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.53, "step": 28 }, { "epoch": 0.90625, "grad_norm": 0.21711663558311656, "learning_rate": 2e-05, "loss": 0.9953, "step": 29 }, { "epoch": 0.90625, "eval_loss": 0.9355730414390564, "eval_runtime": 45.9328, "eval_samples_per_second": 4.354, "eval_steps_per_second": 0.544, "step": 29 }, { "epoch": 0.9375, "grad_norm": 0.21636473054277702, "learning_rate": 2e-05, "loss": 1.0328, "step": 30 }, { "epoch": 0.9375, "eval_loss": 0.9298823475837708, "eval_runtime": 46.0325, "eval_samples_per_second": 4.345, "eval_steps_per_second": 0.543, "step": 30 }, { "epoch": 0.96875, "grad_norm": 0.2530858798467821, "learning_rate": 2e-05, "loss": 0.8713, "step": 31 }, { "epoch": 0.96875, "eval_loss": 0.9241495728492737, "eval_runtime": 46.0309, "eval_samples_per_second": 4.345, "eval_steps_per_second": 0.543, "step": 31 }, { "epoch": 1.0, "grad_norm": 0.2500917296208238, "learning_rate": 2e-05, "loss": 0.9831, "step": 32 }, { "epoch": 1.0, "eval_loss": 0.9184038043022156, "eval_runtime": 46.1304, "eval_samples_per_second": 4.336, "eval_steps_per_second": 0.542, "step": 32 }, { "epoch": 1.03125, "grad_norm": 0.25563291180685294, "learning_rate": 2e-05, "loss": 1.0227, "step": 33 }, { "epoch": 1.03125, "eval_loss": 0.9126191735267639, "eval_runtime": 52.6388, "eval_samples_per_second": 3.799, "eval_steps_per_second": 0.475, "step": 33 }, { "epoch": 1.0625, "grad_norm": 0.2225226787999786, "learning_rate": 2e-05, "loss": 1.0241, "step": 34 }, { "epoch": 1.0625, "eval_loss": 0.9070788621902466, "eval_runtime": 43.6322, "eval_samples_per_second": 4.584, "eval_steps_per_second": 0.573, "step": 34 }, { "epoch": 1.09375, "grad_norm": 0.2052840697405099, "learning_rate": 2e-05, "loss": 1.0476, "step": 35 }, { "epoch": 1.09375, "eval_loss": 0.9018412828445435, "eval_runtime": 43.1975, "eval_samples_per_second": 4.63, "eval_steps_per_second": 0.579, "step": 35 }, { "epoch": 1.125, "grad_norm": 0.23676392278447683, "learning_rate": 2e-05, "loss": 1.01, "step": 36 }, { "epoch": 1.125, "eval_loss": 0.8966168761253357, "eval_runtime": 45.9216, "eval_samples_per_second": 4.355, "eval_steps_per_second": 0.544, "step": 36 }, { "epoch": 1.15625, "grad_norm": 0.22099733575664926, "learning_rate": 2e-05, "loss": 0.9525, "step": 37 }, { "epoch": 1.15625, "eval_loss": 0.891795814037323, "eval_runtime": 44.7872, "eval_samples_per_second": 4.466, "eval_steps_per_second": 0.558, "step": 37 }, { "epoch": 1.1875, "grad_norm": 0.2527359179725302, "learning_rate": 2e-05, "loss": 0.9627, "step": 38 }, { "epoch": 1.1875, "eval_loss": 0.8872839212417603, "eval_runtime": 44.6369, "eval_samples_per_second": 4.481, "eval_steps_per_second": 0.56, "step": 38 }, { "epoch": 1.21875, "grad_norm": 0.25432158026395235, "learning_rate": 2e-05, "loss": 0.9972, "step": 39 }, { "epoch": 1.21875, "eval_loss": 0.8827975988388062, "eval_runtime": 44.7753, "eval_samples_per_second": 4.467, "eval_steps_per_second": 0.558, "step": 39 }, { "epoch": 1.25, "grad_norm": 0.24171584871667898, "learning_rate": 2e-05, "loss": 0.9897, "step": 40 }, { "epoch": 1.25, "eval_loss": 0.8785097599029541, "eval_runtime": 45.0743, "eval_samples_per_second": 4.437, "eval_steps_per_second": 0.555, "step": 40 }, { "epoch": 1.28125, "grad_norm": 0.23629659647320733, "learning_rate": 2e-05, "loss": 0.9641, "step": 41 }, { "epoch": 1.28125, "eval_loss": 0.8742367625236511, "eval_runtime": 45.6624, "eval_samples_per_second": 4.38, "eval_steps_per_second": 0.547, "step": 41 }, { "epoch": 1.3125, "grad_norm": 0.23515869880744614, "learning_rate": 2e-05, "loss": 0.9445, "step": 42 }, { "epoch": 1.3125, "eval_loss": 0.8701191544532776, "eval_runtime": 46.6778, "eval_samples_per_second": 4.285, "eval_steps_per_second": 0.536, "step": 42 }, { "epoch": 1.34375, "grad_norm": 0.2328447853974619, "learning_rate": 2e-05, "loss": 0.9098, "step": 43 }, { "epoch": 1.34375, "eval_loss": 0.8661414980888367, "eval_runtime": 45.7682, "eval_samples_per_second": 4.37, "eval_steps_per_second": 0.546, "step": 43 }, { "epoch": 1.375, "grad_norm": 0.2208565035546648, "learning_rate": 2e-05, "loss": 0.9269, "step": 44 }, { "epoch": 1.375, "eval_loss": 0.8625122904777527, "eval_runtime": 47.7405, "eval_samples_per_second": 4.189, "eval_steps_per_second": 0.524, "step": 44 }, { "epoch": 1.40625, "grad_norm": 0.24194310531833832, "learning_rate": 2e-05, "loss": 0.9126, "step": 45 }, { "epoch": 1.40625, "eval_loss": 0.859275221824646, "eval_runtime": 46.14, "eval_samples_per_second": 4.335, "eval_steps_per_second": 0.542, "step": 45 }, { "epoch": 1.4375, "grad_norm": 0.23294071980639222, "learning_rate": 2e-05, "loss": 0.9525, "step": 46 }, { "epoch": 1.4375, "eval_loss": 0.8560716509819031, "eval_runtime": 47.2955, "eval_samples_per_second": 4.229, "eval_steps_per_second": 0.529, "step": 46 }, { "epoch": 1.46875, "grad_norm": 0.22565596183142483, "learning_rate": 2e-05, "loss": 0.9635, "step": 47 }, { "epoch": 1.46875, "eval_loss": 0.8531911373138428, "eval_runtime": 46.3183, "eval_samples_per_second": 4.318, "eval_steps_per_second": 0.54, "step": 47 }, { "epoch": 1.5, "grad_norm": 0.23251096636792043, "learning_rate": 2e-05, "loss": 0.8684, "step": 48 }, { "epoch": 1.5, "eval_loss": 0.8504599928855896, "eval_runtime": 45.7129, "eval_samples_per_second": 4.375, "eval_steps_per_second": 0.547, "step": 48 }, { "epoch": 1.53125, "grad_norm": 0.253882583102031, "learning_rate": 2e-05, "loss": 0.881, "step": 49 }, { "epoch": 1.53125, "eval_loss": 0.8476203680038452, "eval_runtime": 45.8764, "eval_samples_per_second": 4.36, "eval_steps_per_second": 0.545, "step": 49 }, { "epoch": 1.5625, "grad_norm": 0.2572282615843019, "learning_rate": 2e-05, "loss": 0.8634, "step": 50 }, { "epoch": 1.5625, "eval_loss": 0.8446447849273682, "eval_runtime": 46.1254, "eval_samples_per_second": 4.336, "eval_steps_per_second": 0.542, "step": 50 }, { "epoch": 1.59375, "grad_norm": 0.24021257130991572, "learning_rate": 2e-05, "loss": 0.8915, "step": 51 }, { "epoch": 1.59375, "eval_loss": 0.8415327668190002, "eval_runtime": 45.7173, "eval_samples_per_second": 4.375, "eval_steps_per_second": 0.547, "step": 51 }, { "epoch": 1.625, "grad_norm": 0.22076828593901424, "learning_rate": 2e-05, "loss": 0.7849, "step": 52 }, { "epoch": 1.625, "eval_loss": 0.8386600017547607, "eval_runtime": 45.7889, "eval_samples_per_second": 4.368, "eval_steps_per_second": 0.546, "step": 52 }, { "epoch": 1.65625, "grad_norm": 0.2255866641078328, "learning_rate": 2e-05, "loss": 0.9282, "step": 53 }, { "epoch": 1.65625, "eval_loss": 0.8356924653053284, "eval_runtime": 45.6221, "eval_samples_per_second": 4.384, "eval_steps_per_second": 0.548, "step": 53 }, { "epoch": 1.6875, "grad_norm": 0.22783298909181773, "learning_rate": 2e-05, "loss": 0.9012, "step": 54 }, { "epoch": 1.6875, "eval_loss": 0.8328012228012085, "eval_runtime": 47.1607, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.53, "step": 54 }, { "epoch": 1.71875, "grad_norm": 0.22832233862063558, "learning_rate": 2e-05, "loss": 0.9055, "step": 55 }, { "epoch": 1.71875, "eval_loss": 0.830295741558075, "eval_runtime": 46.0231, "eval_samples_per_second": 4.346, "eval_steps_per_second": 0.543, "step": 55 }, { "epoch": 1.75, "grad_norm": 0.2160389858258543, "learning_rate": 2e-05, "loss": 0.9646, "step": 56 }, { "epoch": 1.75, "eval_loss": 0.8281158208847046, "eval_runtime": 50.2412, "eval_samples_per_second": 3.981, "eval_steps_per_second": 0.498, "step": 56 }, { "epoch": 1.78125, "grad_norm": 0.2577519779258931, "learning_rate": 2e-05, "loss": 0.8908, "step": 57 }, { "epoch": 1.78125, "eval_loss": 0.8254660964012146, "eval_runtime": 43.4999, "eval_samples_per_second": 4.598, "eval_steps_per_second": 0.575, "step": 57 }, { "epoch": 1.8125, "grad_norm": 0.2425252190238059, "learning_rate": 2e-05, "loss": 0.9392, "step": 58 }, { "epoch": 1.8125, "eval_loss": 0.8230564594268799, "eval_runtime": 43.1396, "eval_samples_per_second": 4.636, "eval_steps_per_second": 0.58, "step": 58 }, { "epoch": 1.84375, "grad_norm": 0.2403612422125405, "learning_rate": 2e-05, "loss": 0.8458, "step": 59 }, { "epoch": 1.84375, "eval_loss": 0.8206232190132141, "eval_runtime": 43.4097, "eval_samples_per_second": 4.607, "eval_steps_per_second": 0.576, "step": 59 }, { "epoch": 1.875, "grad_norm": 0.24599794763439686, "learning_rate": 2e-05, "loss": 0.8533, "step": 60 }, { "epoch": 1.875, "eval_loss": 0.8178582787513733, "eval_runtime": 43.3225, "eval_samples_per_second": 4.617, "eval_steps_per_second": 0.577, "step": 60 }, { "epoch": 1.90625, "grad_norm": 0.24455796239061778, "learning_rate": 2e-05, "loss": 0.9019, "step": 61 }, { "epoch": 1.90625, "eval_loss": 0.81532883644104, "eval_runtime": 43.3919, "eval_samples_per_second": 4.609, "eval_steps_per_second": 0.576, "step": 61 }, { "epoch": 1.9375, "grad_norm": 0.25994876629591135, "learning_rate": 2e-05, "loss": 0.9294, "step": 62 }, { "epoch": 1.9375, "eval_loss": 0.813098669052124, "eval_runtime": 43.5546, "eval_samples_per_second": 4.592, "eval_steps_per_second": 0.574, "step": 62 }, { "epoch": 1.96875, "grad_norm": 0.2671215171096013, "learning_rate": 2e-05, "loss": 0.7728, "step": 63 }, { "epoch": 1.96875, "eval_loss": 0.8106216192245483, "eval_runtime": 43.3363, "eval_samples_per_second": 4.615, "eval_steps_per_second": 0.577, "step": 63 }, { "epoch": 2.0, "grad_norm": 0.26274475710090606, "learning_rate": 2e-05, "loss": 0.8746, "step": 64 }, { "epoch": 2.0, "eval_loss": 0.8080699443817139, "eval_runtime": 44.6331, "eval_samples_per_second": 4.481, "eval_steps_per_second": 0.56, "step": 64 }, { "epoch": 2.03125, "grad_norm": 0.2775753424365695, "learning_rate": 2e-05, "loss": 0.8665, "step": 65 }, { "epoch": 2.03125, "eval_loss": 0.8051960468292236, "eval_runtime": 43.2561, "eval_samples_per_second": 4.624, "eval_steps_per_second": 0.578, "step": 65 }, { "epoch": 2.0625, "grad_norm": 0.27249086550617724, "learning_rate": 2e-05, "loss": 0.8868, "step": 66 }, { "epoch": 2.0625, "eval_loss": 0.8029299378395081, "eval_runtime": 43.1171, "eval_samples_per_second": 4.639, "eval_steps_per_second": 0.58, "step": 66 }, { "epoch": 2.09375, "grad_norm": 0.2719871749974866, "learning_rate": 2e-05, "loss": 0.8651, "step": 67 }, { "epoch": 2.09375, "eval_loss": 0.8006068468093872, "eval_runtime": 43.0661, "eval_samples_per_second": 4.644, "eval_steps_per_second": 0.581, "step": 67 }, { "epoch": 2.125, "grad_norm": 0.24961006779343242, "learning_rate": 2e-05, "loss": 0.9303, "step": 68 }, { "epoch": 2.125, "eval_loss": 0.7983291745185852, "eval_runtime": 44.5821, "eval_samples_per_second": 4.486, "eval_steps_per_second": 0.561, "step": 68 }, { "epoch": 2.15625, "grad_norm": 0.26632839922388696, "learning_rate": 2e-05, "loss": 0.8625, "step": 69 }, { "epoch": 2.15625, "eval_loss": 0.7961746454238892, "eval_runtime": 44.7163, "eval_samples_per_second": 4.473, "eval_steps_per_second": 0.559, "step": 69 }, { "epoch": 2.1875, "grad_norm": 0.28665202557154024, "learning_rate": 2e-05, "loss": 0.8084, "step": 70 }, { "epoch": 2.1875, "eval_loss": 0.7937586307525635, "eval_runtime": 43.1349, "eval_samples_per_second": 4.637, "eval_steps_per_second": 0.58, "step": 70 }, { "epoch": 2.21875, "grad_norm": 0.25474181970896226, "learning_rate": 2e-05, "loss": 0.8943, "step": 71 }, { "epoch": 2.21875, "eval_loss": 0.7917373776435852, "eval_runtime": 43.1701, "eval_samples_per_second": 4.633, "eval_steps_per_second": 0.579, "step": 71 }, { "epoch": 2.25, "grad_norm": 0.28289708669257335, "learning_rate": 2e-05, "loss": 0.8183, "step": 72 }, { "epoch": 2.25, "eval_loss": 0.7898543477058411, "eval_runtime": 43.3669, "eval_samples_per_second": 4.612, "eval_steps_per_second": 0.576, "step": 72 }, { "epoch": 2.28125, "grad_norm": 0.3081846543495751, "learning_rate": 2e-05, "loss": 0.866, "step": 73 }, { "epoch": 2.28125, "eval_loss": 0.7878245711326599, "eval_runtime": 43.2404, "eval_samples_per_second": 4.625, "eval_steps_per_second": 0.578, "step": 73 }, { "epoch": 2.3125, "grad_norm": 0.25291911217221025, "learning_rate": 2e-05, "loss": 0.8643, "step": 74 }, { "epoch": 2.3125, "eval_loss": 0.7859254479408264, "eval_runtime": 43.158, "eval_samples_per_second": 4.634, "eval_steps_per_second": 0.579, "step": 74 }, { "epoch": 2.34375, "grad_norm": 0.2671411105926486, "learning_rate": 2e-05, "loss": 0.9148, "step": 75 }, { "epoch": 2.34375, "eval_loss": 0.7841793894767761, "eval_runtime": 43.5393, "eval_samples_per_second": 4.594, "eval_steps_per_second": 0.574, "step": 75 }, { "epoch": 2.375, "grad_norm": 0.2649328385798148, "learning_rate": 2e-05, "loss": 0.8322, "step": 76 }, { "epoch": 2.375, "eval_loss": 0.7824788093566895, "eval_runtime": 44.6161, "eval_samples_per_second": 4.483, "eval_steps_per_second": 0.56, "step": 76 }, { "epoch": 2.40625, "grad_norm": 0.2770584815336495, "learning_rate": 2e-05, "loss": 0.8845, "step": 77 }, { "epoch": 2.40625, "eval_loss": 0.7810197472572327, "eval_runtime": 44.3474, "eval_samples_per_second": 4.51, "eval_steps_per_second": 0.564, "step": 77 }, { "epoch": 2.4375, "grad_norm": 0.3134056914363824, "learning_rate": 2e-05, "loss": 0.8764, "step": 78 }, { "epoch": 2.4375, "eval_loss": 0.7796530723571777, "eval_runtime": 44.6727, "eval_samples_per_second": 4.477, "eval_steps_per_second": 0.56, "step": 78 }, { "epoch": 2.46875, "grad_norm": 0.31159260857820364, "learning_rate": 2e-05, "loss": 0.8842, "step": 79 }, { "epoch": 2.46875, "eval_loss": 0.7792640924453735, "eval_runtime": 44.9476, "eval_samples_per_second": 4.45, "eval_steps_per_second": 0.556, "step": 79 }, { "epoch": 2.5, "grad_norm": 0.30072325605647415, "learning_rate": 2e-05, "loss": 0.9214, "step": 80 }, { "epoch": 2.5, "eval_loss": 0.7791906595230103, "eval_runtime": 44.5732, "eval_samples_per_second": 4.487, "eval_steps_per_second": 0.561, "step": 80 }, { "epoch": 2.53125, "grad_norm": 0.3021628861526586, "learning_rate": 2e-05, "loss": 0.854, "step": 81 }, { "epoch": 2.53125, "eval_loss": 0.7786081433296204, "eval_runtime": 46.7962, "eval_samples_per_second": 4.274, "eval_steps_per_second": 0.534, "step": 81 }, { "epoch": 2.5625, "grad_norm": 0.28647643667873335, "learning_rate": 2e-05, "loss": 0.915, "step": 82 }, { "epoch": 2.5625, "eval_loss": 0.777721643447876, "eval_runtime": 46.0168, "eval_samples_per_second": 4.346, "eval_steps_per_second": 0.543, "step": 82 }, { "epoch": 2.59375, "grad_norm": 0.3053967339779788, "learning_rate": 2e-05, "loss": 0.8616, "step": 83 }, { "epoch": 2.59375, "eval_loss": 0.7763125896453857, "eval_runtime": 46.9482, "eval_samples_per_second": 4.26, "eval_steps_per_second": 0.533, "step": 83 }, { "epoch": 2.625, "grad_norm": 0.3285655628944688, "learning_rate": 2e-05, "loss": 0.8242, "step": 84 }, { "epoch": 2.625, "eval_loss": 0.7744290232658386, "eval_runtime": 45.8201, "eval_samples_per_second": 4.365, "eval_steps_per_second": 0.546, "step": 84 }, { "epoch": 2.65625, "grad_norm": 0.29338609850548214, "learning_rate": 2e-05, "loss": 0.7927, "step": 85 }, { "epoch": 2.65625, "eval_loss": 0.7727124094963074, "eval_runtime": 47.0822, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.531, "step": 85 }, { "epoch": 2.6875, "grad_norm": 0.3360259804530201, "learning_rate": 2e-05, "loss": 0.8225, "step": 86 }, { "epoch": 2.6875, "eval_loss": 0.7707045078277588, "eval_runtime": 45.904, "eval_samples_per_second": 4.357, "eval_steps_per_second": 0.545, "step": 86 }, { "epoch": 2.71875, "grad_norm": 0.3086865804573199, "learning_rate": 2e-05, "loss": 0.8428, "step": 87 }, { "epoch": 2.71875, "eval_loss": 0.7689979672431946, "eval_runtime": 46.5498, "eval_samples_per_second": 4.296, "eval_steps_per_second": 0.537, "step": 87 }, { "epoch": 2.75, "grad_norm": 0.3441174342366127, "learning_rate": 2e-05, "loss": 0.9349, "step": 88 }, { "epoch": 2.75, "eval_loss": 0.7670918107032776, "eval_runtime": 45.9533, "eval_samples_per_second": 4.352, "eval_steps_per_second": 0.544, "step": 88 }, { "epoch": 2.78125, "grad_norm": 0.3192564489143439, "learning_rate": 2e-05, "loss": 0.8281, "step": 89 }, { "epoch": 2.78125, "eval_loss": 0.7653720378875732, "eval_runtime": 46.4157, "eval_samples_per_second": 4.309, "eval_steps_per_second": 0.539, "step": 89 }, { "epoch": 2.8125, "grad_norm": 0.318307521318246, "learning_rate": 2e-05, "loss": 0.8826, "step": 90 }, { "epoch": 2.8125, "eval_loss": 0.7641046047210693, "eval_runtime": 43.6527, "eval_samples_per_second": 4.582, "eval_steps_per_second": 0.573, "step": 90 }, { "epoch": 2.84375, "grad_norm": 0.3088619418824691, "learning_rate": 2e-05, "loss": 0.7792, "step": 91 }, { "epoch": 2.84375, "eval_loss": 0.7630372643470764, "eval_runtime": 43.3688, "eval_samples_per_second": 4.612, "eval_steps_per_second": 0.576, "step": 91 }, { "epoch": 2.875, "grad_norm": 0.31484830204628667, "learning_rate": 2e-05, "loss": 0.8771, "step": 92 }, { "epoch": 2.875, "eval_loss": 0.7621588110923767, "eval_runtime": 43.4895, "eval_samples_per_second": 4.599, "eval_steps_per_second": 0.575, "step": 92 }, { "epoch": 2.90625, "grad_norm": 0.3210986538440627, "learning_rate": 2e-05, "loss": 0.8125, "step": 93 }, { "epoch": 2.90625, "eval_loss": 0.7610002160072327, "eval_runtime": 44.5951, "eval_samples_per_second": 4.485, "eval_steps_per_second": 0.561, "step": 93 }, { "epoch": 2.9375, "grad_norm": 0.3584955691897743, "learning_rate": 2e-05, "loss": 0.8869, "step": 94 }, { "epoch": 2.9375, "eval_loss": 0.7591326832771301, "eval_runtime": 44.778, "eval_samples_per_second": 4.466, "eval_steps_per_second": 0.558, "step": 94 }, { "epoch": 2.96875, "grad_norm": 0.3231987362149406, "learning_rate": 2e-05, "loss": 0.828, "step": 95 }, { "epoch": 2.96875, "eval_loss": 0.7578966021537781, "eval_runtime": 44.832, "eval_samples_per_second": 4.461, "eval_steps_per_second": 0.558, "step": 95 }, { "epoch": 3.0, "grad_norm": 0.3195106075306484, "learning_rate": 2e-05, "loss": 0.8, "step": 96 }, { "epoch": 3.0, "eval_loss": 0.7563678026199341, "eval_runtime": 43.2334, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.578, "step": 96 }, { "epoch": 3.03125, "grad_norm": 0.3319055768203625, "learning_rate": 2e-05, "loss": 0.7632, "step": 97 }, { "epoch": 3.03125, "eval_loss": 0.7547956705093384, "eval_runtime": 50.7388, "eval_samples_per_second": 3.942, "eval_steps_per_second": 0.493, "step": 97 }, { "epoch": 3.0625, "grad_norm": 0.2995834652715153, "learning_rate": 2e-05, "loss": 0.8407, "step": 98 }, { "epoch": 3.0625, "eval_loss": 0.7533387541770935, "eval_runtime": 45.0847, "eval_samples_per_second": 4.436, "eval_steps_per_second": 0.555, "step": 98 }, { "epoch": 3.09375, "grad_norm": 0.30711749226961915, "learning_rate": 2e-05, "loss": 0.8117, "step": 99 }, { "epoch": 3.09375, "eval_loss": 0.7517553567886353, "eval_runtime": 43.2975, "eval_samples_per_second": 4.619, "eval_steps_per_second": 0.577, "step": 99 }, { "epoch": 3.125, "grad_norm": 0.3443284045264722, "learning_rate": 2e-05, "loss": 0.8347, "step": 100 }, { "epoch": 3.125, "eval_loss": 0.749790370464325, "eval_runtime": 43.3922, "eval_samples_per_second": 4.609, "eval_steps_per_second": 0.576, "step": 100 }, { "epoch": 3.15625, "grad_norm": 0.3080766546496095, "learning_rate": 2e-05, "loss": 0.7748, "step": 101 }, { "epoch": 3.15625, "eval_loss": 0.7480612397193909, "eval_runtime": 45.0132, "eval_samples_per_second": 4.443, "eval_steps_per_second": 0.555, "step": 101 }, { "epoch": 3.1875, "grad_norm": 0.34717566244235637, "learning_rate": 2e-05, "loss": 0.8407, "step": 102 }, { "epoch": 3.1875, "eval_loss": 0.7468411326408386, "eval_runtime": 43.1171, "eval_samples_per_second": 4.639, "eval_steps_per_second": 0.58, "step": 102 }, { "epoch": 3.21875, "grad_norm": 0.3374839165175488, "learning_rate": 2e-05, "loss": 0.8498, "step": 103 }, { "epoch": 3.21875, "eval_loss": 0.7462002038955688, "eval_runtime": 44.7301, "eval_samples_per_second": 4.471, "eval_steps_per_second": 0.559, "step": 103 }, { "epoch": 3.25, "grad_norm": 0.35610377004267274, "learning_rate": 2e-05, "loss": 0.7608, "step": 104 }, { "epoch": 3.25, "eval_loss": 0.7451856732368469, "eval_runtime": 43.1396, "eval_samples_per_second": 4.636, "eval_steps_per_second": 0.58, "step": 104 }, { "epoch": 3.28125, "grad_norm": 0.3147450389365033, "learning_rate": 2e-05, "loss": 0.8077, "step": 105 }, { "epoch": 3.28125, "eval_loss": 0.7444003224372864, "eval_runtime": 45.0088, "eval_samples_per_second": 4.444, "eval_steps_per_second": 0.555, "step": 105 }, { "epoch": 3.3125, "grad_norm": 0.3706462973318254, "learning_rate": 2e-05, "loss": 0.8401, "step": 106 }, { "epoch": 3.3125, "eval_loss": 0.7432863116264343, "eval_runtime": 43.5403, "eval_samples_per_second": 4.593, "eval_steps_per_second": 0.574, "step": 106 }, { "epoch": 3.34375, "grad_norm": 0.40870394852693054, "learning_rate": 2e-05, "loss": 0.7369, "step": 107 }, { "epoch": 3.34375, "eval_loss": 0.7409774661064148, "eval_runtime": 43.3731, "eval_samples_per_second": 4.611, "eval_steps_per_second": 0.576, "step": 107 }, { "epoch": 3.375, "grad_norm": 0.36546514227995835, "learning_rate": 2e-05, "loss": 0.7822, "step": 108 }, { "epoch": 3.375, "eval_loss": 0.7388054132461548, "eval_runtime": 43.2852, "eval_samples_per_second": 4.621, "eval_steps_per_second": 0.578, "step": 108 }, { "epoch": 3.40625, "grad_norm": 0.3623356150462002, "learning_rate": 2e-05, "loss": 0.7693, "step": 109 }, { "epoch": 3.40625, "eval_loss": 0.7370558977127075, "eval_runtime": 43.2105, "eval_samples_per_second": 4.629, "eval_steps_per_second": 0.579, "step": 109 }, { "epoch": 3.4375, "grad_norm": 0.36956774509216733, "learning_rate": 2e-05, "loss": 0.7631, "step": 110 }, { "epoch": 3.4375, "eval_loss": 0.7354567050933838, "eval_runtime": 45.0512, "eval_samples_per_second": 4.439, "eval_steps_per_second": 0.555, "step": 110 }, { "epoch": 3.46875, "grad_norm": 0.37499211223571893, "learning_rate": 2e-05, "loss": 0.8397, "step": 111 }, { "epoch": 3.46875, "eval_loss": 0.7342872619628906, "eval_runtime": 44.1989, "eval_samples_per_second": 4.525, "eval_steps_per_second": 0.566, "step": 111 }, { "epoch": 3.5, "grad_norm": 0.3656781606255811, "learning_rate": 2e-05, "loss": 0.8156, "step": 112 }, { "epoch": 3.5, "eval_loss": 0.7334136962890625, "eval_runtime": 43.3314, "eval_samples_per_second": 4.616, "eval_steps_per_second": 0.577, "step": 112 }, { "epoch": 3.53125, "grad_norm": 0.360531666311953, "learning_rate": 2e-05, "loss": 0.9039, "step": 113 }, { "epoch": 3.53125, "eval_loss": 0.732928454875946, "eval_runtime": 43.6452, "eval_samples_per_second": 4.582, "eval_steps_per_second": 0.573, "step": 113 }, { "epoch": 3.5625, "grad_norm": 0.4106498291544766, "learning_rate": 2e-05, "loss": 0.7632, "step": 114 }, { "epoch": 3.5625, "eval_loss": 0.7328732013702393, "eval_runtime": 43.2922, "eval_samples_per_second": 4.62, "eval_steps_per_second": 0.577, "step": 114 }, { "epoch": 3.59375, "grad_norm": 0.35030054786635473, "learning_rate": 2e-05, "loss": 0.8328, "step": 115 }, { "epoch": 3.59375, "eval_loss": 0.7332839369773865, "eval_runtime": 43.1392, "eval_samples_per_second": 4.636, "eval_steps_per_second": 0.58, "step": 115 }, { "epoch": 3.625, "grad_norm": 0.37866907463824806, "learning_rate": 2e-05, "loss": 0.7992, "step": 116 }, { "epoch": 3.625, "eval_loss": 0.7333321571350098, "eval_runtime": 44.5672, "eval_samples_per_second": 4.488, "eval_steps_per_second": 0.561, "step": 116 }, { "epoch": 3.65625, "grad_norm": 0.3868782215569731, "learning_rate": 2e-05, "loss": 0.7929, "step": 117 }, { "epoch": 3.65625, "eval_loss": 0.7327985167503357, "eval_runtime": 45.9132, "eval_samples_per_second": 4.356, "eval_steps_per_second": 0.545, "step": 117 }, { "epoch": 3.6875, "grad_norm": 0.3823386198135366, "learning_rate": 2e-05, "loss": 0.8064, "step": 118 }, { "epoch": 3.6875, "eval_loss": 0.7325207591056824, "eval_runtime": 45.1557, "eval_samples_per_second": 4.429, "eval_steps_per_second": 0.554, "step": 118 }, { "epoch": 3.71875, "grad_norm": 0.3586002374199349, "learning_rate": 2e-05, "loss": 0.8677, "step": 119 }, { "epoch": 3.71875, "eval_loss": 0.732402503490448, "eval_runtime": 44.5906, "eval_samples_per_second": 4.485, "eval_steps_per_second": 0.561, "step": 119 }, { "epoch": 3.75, "grad_norm": 0.34075042751380596, "learning_rate": 2e-05, "loss": 0.8119, "step": 120 }, { "epoch": 3.75, "eval_loss": 0.7322152853012085, "eval_runtime": 44.3386, "eval_samples_per_second": 4.511, "eval_steps_per_second": 0.564, "step": 120 }, { "epoch": 3.78125, "grad_norm": 0.38915259379047296, "learning_rate": 2e-05, "loss": 0.7866, "step": 121 }, { "epoch": 3.78125, "eval_loss": 0.7307778000831604, "eval_runtime": 45.0342, "eval_samples_per_second": 4.441, "eval_steps_per_second": 0.555, "step": 121 }, { "epoch": 3.8125, "grad_norm": 0.39774471715347587, "learning_rate": 2e-05, "loss": 0.8635, "step": 122 }, { "epoch": 3.8125, "eval_loss": 0.7294437885284424, "eval_runtime": 47.2205, "eval_samples_per_second": 4.235, "eval_steps_per_second": 0.529, "step": 122 }, { "epoch": 3.84375, "grad_norm": 0.3880340672056078, "learning_rate": 2e-05, "loss": 0.7834, "step": 123 }, { "epoch": 3.84375, "eval_loss": 0.7277958393096924, "eval_runtime": 45.5116, "eval_samples_per_second": 4.394, "eval_steps_per_second": 0.549, "step": 123 }, { "epoch": 3.875, "grad_norm": 0.34955832039339413, "learning_rate": 2e-05, "loss": 0.8048, "step": 124 }, { "epoch": 3.875, "eval_loss": 0.7262464761734009, "eval_runtime": 45.3196, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.552, "step": 124 }, { "epoch": 3.90625, "grad_norm": 0.4502351954206266, "learning_rate": 2e-05, "loss": 0.8494, "step": 125 }, { "epoch": 3.90625, "eval_loss": 0.724558413028717, "eval_runtime": 45.2241, "eval_samples_per_second": 4.422, "eval_steps_per_second": 0.553, "step": 125 }, { "epoch": 3.9375, "grad_norm": 0.40148506382728893, "learning_rate": 2e-05, "loss": 0.8163, "step": 126 }, { "epoch": 3.9375, "eval_loss": 0.7235116362571716, "eval_runtime": 46.1839, "eval_samples_per_second": 4.331, "eval_steps_per_second": 0.541, "step": 126 }, { "epoch": 3.96875, "grad_norm": 0.41595103877364653, "learning_rate": 2e-05, "loss": 0.7756, "step": 127 }, { "epoch": 3.96875, "eval_loss": 0.7227371335029602, "eval_runtime": 43.5883, "eval_samples_per_second": 4.588, "eval_steps_per_second": 0.574, "step": 127 }, { "epoch": 4.0, "grad_norm": 0.3959213167419436, "learning_rate": 2e-05, "loss": 0.7107, "step": 128 }, { "epoch": 4.0, "eval_loss": 0.721717357635498, "eval_runtime": 44.8751, "eval_samples_per_second": 4.457, "eval_steps_per_second": 0.557, "step": 128 }, { "epoch": 4.03125, "grad_norm": 0.34668934768327436, "learning_rate": 2e-05, "loss": 0.8028, "step": 129 }, { "epoch": 4.03125, "eval_loss": 0.7208954095840454, "eval_runtime": 43.2092, "eval_samples_per_second": 4.629, "eval_steps_per_second": 0.579, "step": 129 }, { "epoch": 4.0625, "grad_norm": 0.3776564287872586, "learning_rate": 2e-05, "loss": 0.8162, "step": 130 }, { "epoch": 4.0625, "eval_loss": 0.7200332880020142, "eval_runtime": 43.1981, "eval_samples_per_second": 4.63, "eval_steps_per_second": 0.579, "step": 130 }, { "epoch": 4.09375, "grad_norm": 0.35166731437552645, "learning_rate": 2e-05, "loss": 0.814, "step": 131 }, { "epoch": 4.09375, "eval_loss": 0.7193570137023926, "eval_runtime": 43.3306, "eval_samples_per_second": 4.616, "eval_steps_per_second": 0.577, "step": 131 }, { "epoch": 4.125, "grad_norm": 0.39783214883157875, "learning_rate": 2e-05, "loss": 0.7743, "step": 132 }, { "epoch": 4.125, "eval_loss": 0.7187802791595459, "eval_runtime": 44.0701, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.567, "step": 132 }, { "epoch": 4.15625, "grad_norm": 0.3828880469066703, "learning_rate": 2e-05, "loss": 0.8766, "step": 133 }, { "epoch": 4.15625, "eval_loss": 0.7184324860572815, "eval_runtime": 43.3218, "eval_samples_per_second": 4.617, "eval_steps_per_second": 0.577, "step": 133 }, { "epoch": 4.1875, "grad_norm": 0.46175115507112535, "learning_rate": 2e-05, "loss": 0.7827, "step": 134 }, { "epoch": 4.1875, "eval_loss": 0.717852771282196, "eval_runtime": 43.3706, "eval_samples_per_second": 4.611, "eval_steps_per_second": 0.576, "step": 134 }, { "epoch": 4.21875, "grad_norm": 0.39552167703322383, "learning_rate": 2e-05, "loss": 0.7846, "step": 135 }, { "epoch": 4.21875, "eval_loss": 0.7171714901924133, "eval_runtime": 43.3199, "eval_samples_per_second": 4.617, "eval_steps_per_second": 0.577, "step": 135 }, { "epoch": 4.25, "grad_norm": 0.40883049825529505, "learning_rate": 2e-05, "loss": 0.7711, "step": 136 }, { "epoch": 4.25, "eval_loss": 0.7167998552322388, "eval_runtime": 43.4601, "eval_samples_per_second": 4.602, "eval_steps_per_second": 0.575, "step": 136 }, { "epoch": 4.28125, "grad_norm": 0.4411120151436577, "learning_rate": 2e-05, "loss": 0.755, "step": 137 }, { "epoch": 4.28125, "eval_loss": 0.7161502838134766, "eval_runtime": 45.0586, "eval_samples_per_second": 4.439, "eval_steps_per_second": 0.555, "step": 137 }, { "epoch": 4.3125, "grad_norm": 0.4307733167956254, "learning_rate": 2e-05, "loss": 0.7708, "step": 138 }, { "epoch": 4.3125, "eval_loss": 0.7155695557594299, "eval_runtime": 44.7913, "eval_samples_per_second": 4.465, "eval_steps_per_second": 0.558, "step": 138 }, { "epoch": 4.34375, "grad_norm": 0.4303129845521591, "learning_rate": 2e-05, "loss": 0.7384, "step": 139 }, { "epoch": 4.34375, "eval_loss": 0.7146069407463074, "eval_runtime": 43.3745, "eval_samples_per_second": 4.611, "eval_steps_per_second": 0.576, "step": 139 }, { "epoch": 4.375, "grad_norm": 0.4160861103360693, "learning_rate": 2e-05, "loss": 0.7693, "step": 140 }, { "epoch": 4.375, "eval_loss": 0.7138718962669373, "eval_runtime": 43.2941, "eval_samples_per_second": 4.62, "eval_steps_per_second": 0.577, "step": 140 }, { "epoch": 4.40625, "grad_norm": 0.3974304749908327, "learning_rate": 2e-05, "loss": 0.7855, "step": 141 }, { "epoch": 4.40625, "eval_loss": 0.7131789922714233, "eval_runtime": 43.6908, "eval_samples_per_second": 4.578, "eval_steps_per_second": 0.572, "step": 141 }, { "epoch": 4.4375, "grad_norm": 0.42212623603465876, "learning_rate": 2e-05, "loss": 0.733, "step": 142 }, { "epoch": 4.4375, "eval_loss": 0.7126344442367554, "eval_runtime": 43.5706, "eval_samples_per_second": 4.59, "eval_steps_per_second": 0.574, "step": 142 }, { "epoch": 4.46875, "grad_norm": 0.4290602874698813, "learning_rate": 2e-05, "loss": 0.7372, "step": 143 }, { "epoch": 4.46875, "eval_loss": 0.7121153473854065, "eval_runtime": 44.0917, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.567, "step": 143 }, { "epoch": 4.5, "grad_norm": 0.38778639331277664, "learning_rate": 2e-05, "loss": 0.715, "step": 144 }, { "epoch": 4.5, "eval_loss": 0.7114359140396118, "eval_runtime": 90.4172, "eval_samples_per_second": 2.212, "eval_steps_per_second": 0.276, "step": 144 }, { "epoch": 4.53125, "grad_norm": 0.44014343297224434, "learning_rate": 2e-05, "loss": 0.802, "step": 145 }, { "epoch": 4.53125, "eval_loss": 0.7106121778488159, "eval_runtime": 43.5235, "eval_samples_per_second": 4.595, "eval_steps_per_second": 0.574, "step": 145 }, { "epoch": 4.5625, "grad_norm": 0.45549843169611287, "learning_rate": 2e-05, "loss": 0.6899, "step": 146 }, { "epoch": 4.5625, "eval_loss": 0.7094995975494385, "eval_runtime": 43.5264, "eval_samples_per_second": 4.595, "eval_steps_per_second": 0.574, "step": 146 }, { "epoch": 4.59375, "grad_norm": 0.46209967918252776, "learning_rate": 2e-05, "loss": 0.7503, "step": 147 }, { "epoch": 4.59375, "eval_loss": 0.7082768082618713, "eval_runtime": 44.8411, "eval_samples_per_second": 4.46, "eval_steps_per_second": 0.558, "step": 147 }, { "epoch": 4.625, "grad_norm": 0.43001381014670376, "learning_rate": 2e-05, "loss": 0.7041, "step": 148 }, { "epoch": 4.625, "eval_loss": 0.7072634696960449, "eval_runtime": 43.1988, "eval_samples_per_second": 4.63, "eval_steps_per_second": 0.579, "step": 148 }, { "epoch": 4.65625, "grad_norm": 0.4151229594087744, "learning_rate": 2e-05, "loss": 0.8181, "step": 149 }, { "epoch": 4.65625, "eval_loss": 0.7068669199943542, "eval_runtime": 43.3996, "eval_samples_per_second": 4.608, "eval_steps_per_second": 0.576, "step": 149 }, { "epoch": 4.6875, "grad_norm": 0.4534048991771139, "learning_rate": 2e-05, "loss": 0.7411, "step": 150 }, { "epoch": 4.6875, "eval_loss": 0.7062075734138489, "eval_runtime": 43.3013, "eval_samples_per_second": 4.619, "eval_steps_per_second": 0.577, "step": 150 }, { "epoch": 4.71875, "grad_norm": 0.4739932075357852, "learning_rate": 2e-05, "loss": 0.7621, "step": 151 }, { "epoch": 4.71875, "eval_loss": 0.7047030925750732, "eval_runtime": 43.4211, "eval_samples_per_second": 4.606, "eval_steps_per_second": 0.576, "step": 151 }, { "epoch": 4.75, "grad_norm": 0.46573796534078227, "learning_rate": 2e-05, "loss": 0.7852, "step": 152 }, { "epoch": 4.75, "eval_loss": 0.7033020257949829, "eval_runtime": 43.4066, "eval_samples_per_second": 4.608, "eval_steps_per_second": 0.576, "step": 152 }, { "epoch": 4.78125, "grad_norm": 0.463007545995704, "learning_rate": 2e-05, "loss": 0.7331, "step": 153 }, { "epoch": 4.78125, "eval_loss": 0.7021228671073914, "eval_runtime": 43.4184, "eval_samples_per_second": 4.606, "eval_steps_per_second": 0.576, "step": 153 }, { "epoch": 4.8125, "grad_norm": 0.46580692487948094, "learning_rate": 2e-05, "loss": 0.76, "step": 154 }, { "epoch": 4.8125, "eval_loss": 0.701519250869751, "eval_runtime": 44.9732, "eval_samples_per_second": 4.447, "eval_steps_per_second": 0.556, "step": 154 }, { "epoch": 4.84375, "grad_norm": 0.47378674394843967, "learning_rate": 2e-05, "loss": 0.6912, "step": 155 }, { "epoch": 4.84375, "eval_loss": 0.7011644244194031, "eval_runtime": 44.898, "eval_samples_per_second": 4.455, "eval_steps_per_second": 0.557, "step": 155 }, { "epoch": 4.875, "grad_norm": 0.44883703516788587, "learning_rate": 2e-05, "loss": 0.812, "step": 156 }, { "epoch": 4.875, "eval_loss": 0.7009950876235962, "eval_runtime": 44.4765, "eval_samples_per_second": 4.497, "eval_steps_per_second": 0.562, "step": 156 }, { "epoch": 4.90625, "grad_norm": 0.43366130955490684, "learning_rate": 2e-05, "loss": 0.7902, "step": 157 }, { "epoch": 4.90625, "eval_loss": 0.7011439800262451, "eval_runtime": 44.3528, "eval_samples_per_second": 4.509, "eval_steps_per_second": 0.564, "step": 157 }, { "epoch": 4.9375, "grad_norm": 0.4501399670257468, "learning_rate": 2e-05, "loss": 0.7927, "step": 158 }, { "epoch": 4.9375, "eval_loss": 0.7011370062828064, "eval_runtime": 46.6518, "eval_samples_per_second": 4.287, "eval_steps_per_second": 0.536, "step": 158 }, { "epoch": 4.96875, "grad_norm": 0.44946550972510596, "learning_rate": 2e-05, "loss": 0.7437, "step": 159 }, { "epoch": 4.96875, "eval_loss": 0.7008097767829895, "eval_runtime": 45.6401, "eval_samples_per_second": 4.382, "eval_steps_per_second": 0.548, "step": 159 }, { "epoch": 5.0, "grad_norm": 0.455086081766797, "learning_rate": 2e-05, "loss": 0.7274, "step": 160 }, { "epoch": 5.0, "eval_loss": 0.7002915143966675, "eval_runtime": 44.5003, "eval_samples_per_second": 4.494, "eval_steps_per_second": 0.562, "step": 160 }, { "epoch": 5.03125, "grad_norm": 0.42610507864697433, "learning_rate": 2e-05, "loss": 0.7084, "step": 161 }, { "epoch": 5.03125, "eval_loss": 0.6996615529060364, "eval_runtime": 50.423, "eval_samples_per_second": 3.966, "eval_steps_per_second": 0.496, "step": 161 }, { "epoch": 5.0625, "grad_norm": 0.41530618486274595, "learning_rate": 2e-05, "loss": 0.8549, "step": 162 }, { "epoch": 5.0625, "eval_loss": 0.6996638774871826, "eval_runtime": 43.3726, "eval_samples_per_second": 4.611, "eval_steps_per_second": 0.576, "step": 162 }, { "epoch": 5.09375, "grad_norm": 0.46020582285044187, "learning_rate": 2e-05, "loss": 0.6554, "step": 163 }, { "epoch": 5.09375, "eval_loss": 0.6997809410095215, "eval_runtime": 43.1108, "eval_samples_per_second": 4.639, "eval_steps_per_second": 0.58, "step": 163 }, { "epoch": 5.125, "grad_norm": 0.45217206658399783, "learning_rate": 2e-05, "loss": 0.7908, "step": 164 }, { "epoch": 5.125, "eval_loss": 0.7001843452453613, "eval_runtime": 43.3575, "eval_samples_per_second": 4.613, "eval_steps_per_second": 0.577, "step": 164 }, { "epoch": 5.15625, "grad_norm": 0.5297838342887452, "learning_rate": 2e-05, "loss": 0.6311, "step": 165 }, { "epoch": 5.15625, "eval_loss": 0.6998342871665955, "eval_runtime": 44.2692, "eval_samples_per_second": 4.518, "eval_steps_per_second": 0.565, "step": 165 }, { "epoch": 5.1875, "grad_norm": 0.5041508044224997, "learning_rate": 2e-05, "loss": 0.7407, "step": 166 }, { "epoch": 5.1875, "eval_loss": 0.6997390985488892, "eval_runtime": 44.9429, "eval_samples_per_second": 4.45, "eval_steps_per_second": 0.556, "step": 166 }, { "epoch": 5.21875, "grad_norm": 0.4379864270565459, "learning_rate": 2e-05, "loss": 0.7601, "step": 167 }, { "epoch": 5.21875, "eval_loss": 0.6998906135559082, "eval_runtime": 44.7922, "eval_samples_per_second": 4.465, "eval_steps_per_second": 0.558, "step": 167 }, { "epoch": 5.25, "grad_norm": 0.4908573554102339, "learning_rate": 2e-05, "loss": 0.802, "step": 168 }, { "epoch": 5.25, "eval_loss": 0.6996601819992065, "eval_runtime": 44.8792, "eval_samples_per_second": 4.456, "eval_steps_per_second": 0.557, "step": 168 }, { "epoch": 5.28125, "grad_norm": 0.4708754671143599, "learning_rate": 2e-05, "loss": 0.7212, "step": 169 }, { "epoch": 5.28125, "eval_loss": 0.699320375919342, "eval_runtime": 42.8958, "eval_samples_per_second": 4.662, "eval_steps_per_second": 0.583, "step": 169 }, { "epoch": 5.3125, "grad_norm": 0.5157421152452428, "learning_rate": 2e-05, "loss": 0.6919, "step": 170 }, { "epoch": 5.3125, "eval_loss": 0.6992219686508179, "eval_runtime": 43.1543, "eval_samples_per_second": 4.635, "eval_steps_per_second": 0.579, "step": 170 }, { "epoch": 5.34375, "grad_norm": 0.5604495452491726, "learning_rate": 2e-05, "loss": 0.708, "step": 171 }, { "epoch": 5.34375, "eval_loss": 0.6983294486999512, "eval_runtime": 43.0431, "eval_samples_per_second": 4.647, "eval_steps_per_second": 0.581, "step": 171 }, { "epoch": 5.375, "grad_norm": 0.5538353889452822, "learning_rate": 2e-05, "loss": 0.7922, "step": 172 }, { "epoch": 5.375, "eval_loss": 0.6967844367027283, "eval_runtime": 43.3554, "eval_samples_per_second": 4.613, "eval_steps_per_second": 0.577, "step": 172 }, { "epoch": 5.40625, "grad_norm": 0.4750896425737706, "learning_rate": 2e-05, "loss": 0.7552, "step": 173 }, { "epoch": 5.40625, "eval_loss": 0.6954870820045471, "eval_runtime": 43.2105, "eval_samples_per_second": 4.629, "eval_steps_per_second": 0.579, "step": 173 }, { "epoch": 5.4375, "grad_norm": 0.4939578777629157, "learning_rate": 2e-05, "loss": 0.793, "step": 174 }, { "epoch": 5.4375, "eval_loss": 0.6942651271820068, "eval_runtime": 43.2018, "eval_samples_per_second": 4.629, "eval_steps_per_second": 0.579, "step": 174 }, { "epoch": 5.46875, "grad_norm": 0.5275775814858564, "learning_rate": 2e-05, "loss": 0.7812, "step": 175 }, { "epoch": 5.46875, "eval_loss": 0.6938748359680176, "eval_runtime": 43.0238, "eval_samples_per_second": 4.649, "eval_steps_per_second": 0.581, "step": 175 }, { "epoch": 5.5, "grad_norm": 0.516931179872771, "learning_rate": 2e-05, "loss": 0.7157, "step": 176 }, { "epoch": 5.5, "eval_loss": 0.6937347650527954, "eval_runtime": 44.7687, "eval_samples_per_second": 4.467, "eval_steps_per_second": 0.558, "step": 176 }, { "epoch": 5.53125, "grad_norm": 0.527427864430588, "learning_rate": 2e-05, "loss": 0.7505, "step": 177 }, { "epoch": 5.53125, "eval_loss": 0.6932395696640015, "eval_runtime": 44.5644, "eval_samples_per_second": 4.488, "eval_steps_per_second": 0.561, "step": 177 }, { "epoch": 5.5625, "grad_norm": 0.5073638107520839, "learning_rate": 2e-05, "loss": 0.7893, "step": 178 }, { "epoch": 5.5625, "eval_loss": 0.692828357219696, "eval_runtime": 46.0526, "eval_samples_per_second": 4.343, "eval_steps_per_second": 0.543, "step": 178 }, { "epoch": 5.59375, "grad_norm": 0.5234480045460208, "learning_rate": 2e-05, "loss": 0.6786, "step": 179 }, { "epoch": 5.59375, "eval_loss": 0.6927328705787659, "eval_runtime": 44.4221, "eval_samples_per_second": 4.502, "eval_steps_per_second": 0.563, "step": 179 }, { "epoch": 5.625, "grad_norm": 0.509921375319416, "learning_rate": 2e-05, "loss": 0.6839, "step": 180 }, { "epoch": 5.625, "eval_loss": 0.6922880411148071, "eval_runtime": 44.5254, "eval_samples_per_second": 4.492, "eval_steps_per_second": 0.561, "step": 180 }, { "epoch": 5.65625, "grad_norm": 0.5307701692724383, "learning_rate": 2e-05, "loss": 0.6949, "step": 181 }, { "epoch": 5.65625, "eval_loss": 0.6916860938072205, "eval_runtime": 46.1897, "eval_samples_per_second": 4.33, "eval_steps_per_second": 0.541, "step": 181 }, { "epoch": 5.6875, "grad_norm": 0.5405944672270007, "learning_rate": 2e-05, "loss": 0.6644, "step": 182 }, { "epoch": 5.6875, "eval_loss": 0.6913076639175415, "eval_runtime": 45.6494, "eval_samples_per_second": 4.381, "eval_steps_per_second": 0.548, "step": 182 }, { "epoch": 5.71875, "grad_norm": 0.5911050914106935, "learning_rate": 2e-05, "loss": 0.6993, "step": 183 }, { "epoch": 5.71875, "eval_loss": 0.6910421848297119, "eval_runtime": 45.6849, "eval_samples_per_second": 4.378, "eval_steps_per_second": 0.547, "step": 183 }, { "epoch": 5.75, "grad_norm": 0.5738317262291136, "learning_rate": 2e-05, "loss": 0.6909, "step": 184 }, { "epoch": 5.75, "eval_loss": 0.6906780004501343, "eval_runtime": 45.8103, "eval_samples_per_second": 4.366, "eval_steps_per_second": 0.546, "step": 184 }, { "epoch": 5.78125, "grad_norm": 0.6176885912626084, "learning_rate": 2e-05, "loss": 0.7418, "step": 185 }, { "epoch": 5.78125, "eval_loss": 0.6897534132003784, "eval_runtime": 46.2895, "eval_samples_per_second": 4.321, "eval_steps_per_second": 0.54, "step": 185 }, { "epoch": 5.8125, "grad_norm": 0.5804047612157957, "learning_rate": 2e-05, "loss": 0.7046, "step": 186 }, { "epoch": 5.8125, "eval_loss": 0.6883871555328369, "eval_runtime": 46.9282, "eval_samples_per_second": 4.262, "eval_steps_per_second": 0.533, "step": 186 }, { "epoch": 5.84375, "grad_norm": 0.5408722725454089, "learning_rate": 2e-05, "loss": 0.7561, "step": 187 }, { "epoch": 5.84375, "eval_loss": 0.6878187656402588, "eval_runtime": 47.6969, "eval_samples_per_second": 4.193, "eval_steps_per_second": 0.524, "step": 187 }, { "epoch": 5.875, "grad_norm": 0.5492560188161619, "learning_rate": 2e-05, "loss": 0.6903, "step": 188 }, { "epoch": 5.875, "eval_loss": 0.6882662773132324, "eval_runtime": 47.2072, "eval_samples_per_second": 4.237, "eval_steps_per_second": 0.53, "step": 188 }, { "epoch": 5.90625, "grad_norm": 0.5286439760924038, "learning_rate": 2e-05, "loss": 0.7036, "step": 189 }, { "epoch": 5.90625, "eval_loss": 0.6890198588371277, "eval_runtime": 47.4378, "eval_samples_per_second": 4.216, "eval_steps_per_second": 0.527, "step": 189 }, { "epoch": 5.9375, "grad_norm": 0.5540465829524065, "learning_rate": 2e-05, "loss": 0.715, "step": 190 }, { "epoch": 5.9375, "eval_loss": 0.6893854737281799, "eval_runtime": 47.5957, "eval_samples_per_second": 4.202, "eval_steps_per_second": 0.525, "step": 190 }, { "epoch": 5.96875, "grad_norm": 0.543055712644853, "learning_rate": 2e-05, "loss": 0.7122, "step": 191 }, { "epoch": 5.96875, "eval_loss": 0.688640296459198, "eval_runtime": 47.2791, "eval_samples_per_second": 4.23, "eval_steps_per_second": 0.529, "step": 191 }, { "epoch": 6.0, "grad_norm": 0.5243011011968818, "learning_rate": 2e-05, "loss": 0.6989, "step": 192 }, { "epoch": 6.0, "eval_loss": 0.6877474784851074, "eval_runtime": 49.6808, "eval_samples_per_second": 4.026, "eval_steps_per_second": 0.503, "step": 192 }, { "epoch": 6.03125, "grad_norm": 0.5427998890836598, "learning_rate": 2e-05, "loss": 0.7643, "step": 193 }, { "epoch": 6.03125, "eval_loss": 0.6871516704559326, "eval_runtime": 43.2416, "eval_samples_per_second": 4.625, "eval_steps_per_second": 0.578, "step": 193 }, { "epoch": 6.0625, "grad_norm": 0.4848261239833822, "learning_rate": 2e-05, "loss": 0.7333, "step": 194 }, { "epoch": 6.0625, "eval_loss": 0.6872122287750244, "eval_runtime": 43.027, "eval_samples_per_second": 4.648, "eval_steps_per_second": 0.581, "step": 194 }, { "epoch": 6.09375, "grad_norm": 0.5476878256408845, "learning_rate": 2e-05, "loss": 0.6621, "step": 195 }, { "epoch": 6.09375, "eval_loss": 0.6873424053192139, "eval_runtime": 43.0047, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.581, "step": 195 }, { "epoch": 6.125, "grad_norm": 0.5198863257357437, "learning_rate": 2e-05, "loss": 0.6936, "step": 196 }, { "epoch": 6.125, "eval_loss": 0.6874563097953796, "eval_runtime": 43.2855, "eval_samples_per_second": 4.62, "eval_steps_per_second": 0.578, "step": 196 }, { "epoch": 6.15625, "grad_norm": 0.5705568756769012, "learning_rate": 2e-05, "loss": 0.7237, "step": 197 }, { "epoch": 6.15625, "eval_loss": 0.6877203583717346, "eval_runtime": 44.8778, "eval_samples_per_second": 4.457, "eval_steps_per_second": 0.557, "step": 197 }, { "epoch": 6.1875, "grad_norm": 0.5546703873264635, "learning_rate": 2e-05, "loss": 0.8033, "step": 198 }, { "epoch": 6.1875, "eval_loss": 0.6876934170722961, "eval_runtime": 43.3351, "eval_samples_per_second": 4.615, "eval_steps_per_second": 0.577, "step": 198 }, { "epoch": 6.21875, "grad_norm": 0.5846944975931198, "learning_rate": 2e-05, "loss": 0.6687, "step": 199 }, { "epoch": 6.21875, "eval_loss": 0.6877866983413696, "eval_runtime": 43.1456, "eval_samples_per_second": 4.635, "eval_steps_per_second": 0.579, "step": 199 }, { "epoch": 6.25, "grad_norm": 0.5882658410555619, "learning_rate": 2e-05, "loss": 0.7169, "step": 200 }, { "epoch": 6.25, "eval_loss": 0.6881275773048401, "eval_runtime": 44.9645, "eval_samples_per_second": 4.448, "eval_steps_per_second": 0.556, "step": 200 }, { "epoch": 6.28125, "grad_norm": 0.5831610447904351, "learning_rate": 2e-05, "loss": 0.7394, "step": 201 }, { "epoch": 6.28125, "eval_loss": 0.6888833045959473, "eval_runtime": 45.09, "eval_samples_per_second": 4.436, "eval_steps_per_second": 0.554, "step": 201 }, { "epoch": 6.3125, "grad_norm": 0.6592966385691889, "learning_rate": 2e-05, "loss": 0.6537, "step": 202 }, { "epoch": 6.3125, "eval_loss": 0.6880140900611877, "eval_runtime": 43.2447, "eval_samples_per_second": 4.625, "eval_steps_per_second": 0.578, "step": 202 }, { "epoch": 6.34375, "grad_norm": 0.558654488415818, "learning_rate": 2e-05, "loss": 0.7991, "step": 203 }, { "epoch": 6.34375, "eval_loss": 0.6874076724052429, "eval_runtime": 42.9406, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.582, "step": 203 }, { "epoch": 6.375, "grad_norm": 0.6342316949523702, "learning_rate": 2e-05, "loss": 0.6403, "step": 204 }, { "epoch": 6.375, "eval_loss": 0.6866291761398315, "eval_runtime": 43.1217, "eval_samples_per_second": 4.638, "eval_steps_per_second": 0.58, "step": 204 }, { "epoch": 6.40625, "grad_norm": 0.544206621558966, "learning_rate": 2e-05, "loss": 0.6314, "step": 205 }, { "epoch": 6.40625, "eval_loss": 0.6863086223602295, "eval_runtime": 43.2951, "eval_samples_per_second": 4.619, "eval_steps_per_second": 0.577, "step": 205 }, { "epoch": 6.4375, "grad_norm": 0.6380097809956626, "learning_rate": 2e-05, "loss": 0.6851, "step": 206 }, { "epoch": 6.4375, "eval_loss": 0.6859965324401855, "eval_runtime": 44.9257, "eval_samples_per_second": 4.452, "eval_steps_per_second": 0.556, "step": 206 }, { "epoch": 6.46875, "grad_norm": 0.5870799307885896, "learning_rate": 2e-05, "loss": 0.7367, "step": 207 }, { "epoch": 6.46875, "eval_loss": 0.6856269836425781, "eval_runtime": 44.8384, "eval_samples_per_second": 4.46, "eval_steps_per_second": 0.558, "step": 207 }, { "epoch": 6.5, "grad_norm": 0.6115022356518031, "learning_rate": 2e-05, "loss": 0.6814, "step": 208 }, { "epoch": 6.5, "eval_loss": 0.6856591701507568, "eval_runtime": 42.9528, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.582, "step": 208 }, { "epoch": 6.53125, "grad_norm": 0.6655918462314045, "learning_rate": 2e-05, "loss": 0.657, "step": 209 }, { "epoch": 6.53125, "eval_loss": 0.6854197978973389, "eval_runtime": 43.2366, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.578, "step": 209 }, { "epoch": 6.5625, "grad_norm": 0.6102352184035382, "learning_rate": 2e-05, "loss": 0.6343, "step": 210 }, { "epoch": 6.5625, "eval_loss": 0.6852834820747375, "eval_runtime": 43.1789, "eval_samples_per_second": 4.632, "eval_steps_per_second": 0.579, "step": 210 }, { "epoch": 6.59375, "grad_norm": 0.6354143085331753, "learning_rate": 2e-05, "loss": 0.6736, "step": 211 }, { "epoch": 6.59375, "eval_loss": 0.6851873993873596, "eval_runtime": 44.5173, "eval_samples_per_second": 4.493, "eval_steps_per_second": 0.562, "step": 211 }, { "epoch": 6.625, "grad_norm": 0.6069083787831553, "learning_rate": 2e-05, "loss": 0.6466, "step": 212 }, { "epoch": 6.625, "eval_loss": 0.6846270561218262, "eval_runtime": 44.7412, "eval_samples_per_second": 4.47, "eval_steps_per_second": 0.559, "step": 212 }, { "epoch": 6.65625, "grad_norm": 0.5918704953369675, "learning_rate": 2e-05, "loss": 0.7174, "step": 213 }, { "epoch": 6.65625, "eval_loss": 0.6842523217201233, "eval_runtime": 46.0503, "eval_samples_per_second": 4.343, "eval_steps_per_second": 0.543, "step": 213 }, { "epoch": 6.6875, "grad_norm": 0.5824866849171524, "learning_rate": 2e-05, "loss": 0.6955, "step": 214 }, { "epoch": 6.6875, "eval_loss": 0.6838890314102173, "eval_runtime": 44.5781, "eval_samples_per_second": 4.487, "eval_steps_per_second": 0.561, "step": 214 }, { "epoch": 6.71875, "grad_norm": 0.6278777152900226, "learning_rate": 2e-05, "loss": 0.6926, "step": 215 }, { "epoch": 6.71875, "eval_loss": 0.6827735900878906, "eval_runtime": 44.483, "eval_samples_per_second": 4.496, "eval_steps_per_second": 0.562, "step": 215 }, { "epoch": 6.75, "grad_norm": 0.6627082254561003, "learning_rate": 2e-05, "loss": 0.6931, "step": 216 }, { "epoch": 6.75, "eval_loss": 0.6818405389785767, "eval_runtime": 46.0477, "eval_samples_per_second": 4.343, "eval_steps_per_second": 0.543, "step": 216 }, { "epoch": 6.78125, "grad_norm": 0.6551951149808454, "learning_rate": 2e-05, "loss": 0.6386, "step": 217 }, { "epoch": 6.78125, "eval_loss": 0.6824897527694702, "eval_runtime": 47.3712, "eval_samples_per_second": 4.222, "eval_steps_per_second": 0.528, "step": 217 }, { "epoch": 6.8125, "grad_norm": 0.6821330786477059, "learning_rate": 2e-05, "loss": 0.635, "step": 218 }, { "epoch": 6.8125, "eval_loss": 0.6829469203948975, "eval_runtime": 46.2003, "eval_samples_per_second": 4.329, "eval_steps_per_second": 0.541, "step": 218 }, { "epoch": 6.84375, "grad_norm": 0.7440273168609611, "learning_rate": 2e-05, "loss": 0.7286, "step": 219 }, { "epoch": 6.84375, "eval_loss": 0.6824621558189392, "eval_runtime": 45.8201, "eval_samples_per_second": 4.365, "eval_steps_per_second": 0.546, "step": 219 }, { "epoch": 6.875, "grad_norm": 0.7007032012854347, "learning_rate": 2e-05, "loss": 0.7376, "step": 220 }, { "epoch": 6.875, "eval_loss": 0.6805981397628784, "eval_runtime": 45.7474, "eval_samples_per_second": 4.372, "eval_steps_per_second": 0.546, "step": 220 }, { "epoch": 6.90625, "grad_norm": 0.6422764032088494, "learning_rate": 2e-05, "loss": 0.6959, "step": 221 }, { "epoch": 6.90625, "eval_loss": 0.679237961769104, "eval_runtime": 48.4646, "eval_samples_per_second": 4.127, "eval_steps_per_second": 0.516, "step": 221 }, { "epoch": 6.9375, "grad_norm": 0.7159695125034813, "learning_rate": 2e-05, "loss": 0.6894, "step": 222 }, { "epoch": 6.9375, "eval_loss": 0.6775233745574951, "eval_runtime": 47.4563, "eval_samples_per_second": 4.214, "eval_steps_per_second": 0.527, "step": 222 }, { "epoch": 6.96875, "grad_norm": 0.6358380926544867, "learning_rate": 2e-05, "loss": 0.7073, "step": 223 }, { "epoch": 6.96875, "eval_loss": 0.6766613721847534, "eval_runtime": 47.4483, "eval_samples_per_second": 4.215, "eval_steps_per_second": 0.527, "step": 223 }, { "epoch": 7.0, "grad_norm": 0.6716901613635139, "learning_rate": 2e-05, "loss": 0.76, "step": 224 }, { "epoch": 7.0, "eval_loss": 0.6770586371421814, "eval_runtime": 47.0209, "eval_samples_per_second": 4.253, "eval_steps_per_second": 0.532, "step": 224 }, { "epoch": 7.03125, "grad_norm": 0.5953096184448028, "learning_rate": 2e-05, "loss": 0.6798, "step": 225 }, { "epoch": 7.03125, "eval_loss": 0.6774635314941406, "eval_runtime": 51.4624, "eval_samples_per_second": 3.886, "eval_steps_per_second": 0.486, "step": 225 }, { "epoch": 7.0625, "grad_norm": 0.6549589081607252, "learning_rate": 2e-05, "loss": 0.6122, "step": 226 }, { "epoch": 7.0625, "eval_loss": 0.6784033179283142, "eval_runtime": 45.8732, "eval_samples_per_second": 4.36, "eval_steps_per_second": 0.545, "step": 226 }, { "epoch": 7.09375, "grad_norm": 0.6573259751745981, "learning_rate": 2e-05, "loss": 0.6829, "step": 227 }, { "epoch": 7.09375, "eval_loss": 0.6796069145202637, "eval_runtime": 44.2994, "eval_samples_per_second": 4.515, "eval_steps_per_second": 0.564, "step": 227 }, { "epoch": 7.125, "grad_norm": 0.725599779122791, "learning_rate": 2e-05, "loss": 0.6336, "step": 228 }, { "epoch": 7.125, "eval_loss": 0.681220531463623, "eval_runtime": 45.7641, "eval_samples_per_second": 4.37, "eval_steps_per_second": 0.546, "step": 228 }, { "epoch": 7.15625, "grad_norm": 0.7811517272176121, "learning_rate": 2e-05, "loss": 0.6387, "step": 229 }, { "epoch": 7.15625, "eval_loss": 0.6828885674476624, "eval_runtime": 44.7953, "eval_samples_per_second": 4.465, "eval_steps_per_second": 0.558, "step": 229 }, { "epoch": 7.1875, "grad_norm": 0.6760384395465522, "learning_rate": 2e-05, "loss": 0.6245, "step": 230 }, { "epoch": 7.1875, "eval_loss": 0.6845852732658386, "eval_runtime": 44.3812, "eval_samples_per_second": 4.506, "eval_steps_per_second": 0.563, "step": 230 }, { "epoch": 7.21875, "grad_norm": 0.7361186814868562, "learning_rate": 2e-05, "loss": 0.7128, "step": 231 }, { "epoch": 7.21875, "eval_loss": 0.685402512550354, "eval_runtime": 44.3763, "eval_samples_per_second": 4.507, "eval_steps_per_second": 0.563, "step": 231 }, { "epoch": 7.25, "grad_norm": 0.7299978196751681, "learning_rate": 2e-05, "loss": 0.7176, "step": 232 }, { "epoch": 7.25, "eval_loss": 0.685026228427887, "eval_runtime": 44.3181, "eval_samples_per_second": 4.513, "eval_steps_per_second": 0.564, "step": 232 }, { "epoch": 7.28125, "grad_norm": 0.8584091654553072, "learning_rate": 2e-05, "loss": 0.6653, "step": 233 }, { "epoch": 7.28125, "eval_loss": 0.6831257343292236, "eval_runtime": 44.3805, "eval_samples_per_second": 4.506, "eval_steps_per_second": 0.563, "step": 233 }, { "epoch": 7.3125, "grad_norm": 0.6919046534495772, "learning_rate": 2e-05, "loss": 0.6968, "step": 234 }, { "epoch": 7.3125, "eval_loss": 0.6820144653320312, "eval_runtime": 44.3397, "eval_samples_per_second": 4.511, "eval_steps_per_second": 0.564, "step": 234 }, { "epoch": 7.34375, "grad_norm": 0.6716381808914595, "learning_rate": 2e-05, "loss": 0.6626, "step": 235 }, { "epoch": 7.34375, "eval_loss": 0.6815916299819946, "eval_runtime": 44.2997, "eval_samples_per_second": 4.515, "eval_steps_per_second": 0.564, "step": 235 }, { "epoch": 7.375, "grad_norm": 0.7098466238055623, "learning_rate": 2e-05, "loss": 0.629, "step": 236 }, { "epoch": 7.375, "eval_loss": 0.681601881980896, "eval_runtime": 44.2722, "eval_samples_per_second": 4.518, "eval_steps_per_second": 0.565, "step": 236 }, { "epoch": 7.40625, "grad_norm": 0.7700763843474521, "learning_rate": 2e-05, "loss": 0.6796, "step": 237 }, { "epoch": 7.40625, "eval_loss": 0.6809589862823486, "eval_runtime": 44.4518, "eval_samples_per_second": 4.499, "eval_steps_per_second": 0.562, "step": 237 }, { "epoch": 7.4375, "grad_norm": 0.7925088234539602, "learning_rate": 2e-05, "loss": 0.6722, "step": 238 }, { "epoch": 7.4375, "eval_loss": 0.6801493763923645, "eval_runtime": 44.4078, "eval_samples_per_second": 4.504, "eval_steps_per_second": 0.563, "step": 238 }, { "epoch": 7.46875, "grad_norm": 0.6778717561377235, "learning_rate": 2e-05, "loss": 0.6889, "step": 239 }, { "epoch": 7.46875, "eval_loss": 0.6798510551452637, "eval_runtime": 44.3303, "eval_samples_per_second": 4.512, "eval_steps_per_second": 0.564, "step": 239 }, { "epoch": 7.5, "grad_norm": 0.6683599876699755, "learning_rate": 2e-05, "loss": 0.6383, "step": 240 }, { "epoch": 7.5, "eval_loss": 0.6800721883773804, "eval_runtime": 44.5868, "eval_samples_per_second": 4.486, "eval_steps_per_second": 0.561, "step": 240 }, { "epoch": 7.53125, "grad_norm": 0.6242371910913779, "learning_rate": 2e-05, "loss": 0.6809, "step": 241 }, { "epoch": 7.53125, "eval_loss": 0.6809727549552917, "eval_runtime": 44.7112, "eval_samples_per_second": 4.473, "eval_steps_per_second": 0.559, "step": 241 }, { "epoch": 7.5625, "grad_norm": 0.6966989602775038, "learning_rate": 2e-05, "loss": 0.6777, "step": 242 }, { "epoch": 7.5625, "eval_loss": 0.6819994449615479, "eval_runtime": 44.3272, "eval_samples_per_second": 4.512, "eval_steps_per_second": 0.564, "step": 242 }, { "epoch": 7.59375, "grad_norm": 0.7373050917062219, "learning_rate": 2e-05, "loss": 0.6622, "step": 243 }, { "epoch": 7.59375, "eval_loss": 0.6821829080581665, "eval_runtime": 46.0527, "eval_samples_per_second": 4.343, "eval_steps_per_second": 0.543, "step": 243 }, { "epoch": 7.625, "grad_norm": 0.8266617785650243, "learning_rate": 2e-05, "loss": 0.7248, "step": 244 }, { "epoch": 7.625, "eval_loss": 0.6813778877258301, "eval_runtime": 45.7663, "eval_samples_per_second": 4.37, "eval_steps_per_second": 0.546, "step": 244 }, { "epoch": 7.65625, "grad_norm": 0.7459146574284048, "learning_rate": 2e-05, "loss": 0.6301, "step": 245 }, { "epoch": 7.65625, "eval_loss": 0.6811490654945374, "eval_runtime": 45.9361, "eval_samples_per_second": 4.354, "eval_steps_per_second": 0.544, "step": 245 }, { "epoch": 7.6875, "grad_norm": 0.7612602223178182, "learning_rate": 2e-05, "loss": 0.6713, "step": 246 }, { "epoch": 7.6875, "eval_loss": 0.6800392866134644, "eval_runtime": 46.3359, "eval_samples_per_second": 4.316, "eval_steps_per_second": 0.54, "step": 246 }, { "epoch": 7.71875, "grad_norm": 0.7391445622441601, "learning_rate": 2e-05, "loss": 0.6721, "step": 247 }, { "epoch": 7.71875, "eval_loss": 0.6794085502624512, "eval_runtime": 46.9877, "eval_samples_per_second": 4.256, "eval_steps_per_second": 0.532, "step": 247 }, { "epoch": 7.75, "grad_norm": 0.7019243161207622, "learning_rate": 2e-05, "loss": 0.6578, "step": 248 }, { "epoch": 7.75, "eval_loss": 0.6786046624183655, "eval_runtime": 46.0364, "eval_samples_per_second": 4.344, "eval_steps_per_second": 0.543, "step": 248 }, { "epoch": 7.78125, "grad_norm": 0.7933438921741315, "learning_rate": 2e-05, "loss": 0.7023, "step": 249 }, { "epoch": 7.78125, "eval_loss": 0.6770951747894287, "eval_runtime": 45.6655, "eval_samples_per_second": 4.38, "eval_steps_per_second": 0.547, "step": 249 }, { "epoch": 7.8125, "grad_norm": 0.7313927502966258, "learning_rate": 2e-05, "loss": 0.7114, "step": 250 }, { "epoch": 7.8125, "eval_loss": 0.6766157746315002, "eval_runtime": 45.7602, "eval_samples_per_second": 4.371, "eval_steps_per_second": 0.546, "step": 250 }, { "epoch": 7.84375, "grad_norm": 0.7235467321597684, "learning_rate": 2e-05, "loss": 0.6259, "step": 251 }, { "epoch": 7.84375, "eval_loss": 0.6770395040512085, "eval_runtime": 46.9839, "eval_samples_per_second": 4.257, "eval_steps_per_second": 0.532, "step": 251 }, { "epoch": 7.875, "grad_norm": 0.773244621810685, "learning_rate": 2e-05, "loss": 0.6262, "step": 252 }, { "epoch": 7.875, "eval_loss": 0.6780049800872803, "eval_runtime": 46.9808, "eval_samples_per_second": 4.257, "eval_steps_per_second": 0.532, "step": 252 }, { "epoch": 7.90625, "grad_norm": 0.7620627775664955, "learning_rate": 2e-05, "loss": 0.7219, "step": 253 }, { "epoch": 7.90625, "eval_loss": 0.6781153678894043, "eval_runtime": 49.7208, "eval_samples_per_second": 4.022, "eval_steps_per_second": 0.503, "step": 253 }, { "epoch": 7.9375, "grad_norm": 0.7332381519045823, "learning_rate": 2e-05, "loss": 0.6777, "step": 254 }, { "epoch": 7.9375, "eval_loss": 0.6787923574447632, "eval_runtime": 43.1001, "eval_samples_per_second": 4.64, "eval_steps_per_second": 0.58, "step": 254 }, { "epoch": 7.96875, "grad_norm": 0.7847956878083815, "learning_rate": 2e-05, "loss": 0.5983, "step": 255 }, { "epoch": 7.96875, "eval_loss": 0.6779956817626953, "eval_runtime": 43.1273, "eval_samples_per_second": 4.637, "eval_steps_per_second": 0.58, "step": 255 }, { "epoch": 8.0, "grad_norm": 0.7095399891563587, "learning_rate": 2e-05, "loss": 0.6609, "step": 256 }, { "epoch": 8.0, "eval_loss": 0.677204430103302, "eval_runtime": 43.0632, "eval_samples_per_second": 4.644, "eval_steps_per_second": 0.581, "step": 256 }, { "epoch": 8.03125, "grad_norm": 0.7654004838243704, "learning_rate": 2e-05, "loss": 0.6297, "step": 257 }, { "epoch": 8.03125, "eval_loss": 0.6774580478668213, "eval_runtime": 50.3948, "eval_samples_per_second": 3.969, "eval_steps_per_second": 0.496, "step": 257 }, { "epoch": 8.0625, "grad_norm": 0.7337064337590912, "learning_rate": 2e-05, "loss": 0.6446, "step": 258 }, { "epoch": 8.0625, "eval_loss": 0.6788855195045471, "eval_runtime": 43.163, "eval_samples_per_second": 4.634, "eval_steps_per_second": 0.579, "step": 258 }, { "epoch": 8.09375, "grad_norm": 0.7426969285671609, "learning_rate": 2e-05, "loss": 0.6732, "step": 259 }, { "epoch": 8.09375, "eval_loss": 0.6811656355857849, "eval_runtime": 43.113, "eval_samples_per_second": 4.639, "eval_steps_per_second": 0.58, "step": 259 }, { "epoch": 8.125, "grad_norm": 0.8495552484217858, "learning_rate": 2e-05, "loss": 0.6857, "step": 260 }, { "epoch": 8.125, "eval_loss": 0.6831929683685303, "eval_runtime": 43.0506, "eval_samples_per_second": 4.646, "eval_steps_per_second": 0.581, "step": 260 }, { "epoch": 8.15625, "grad_norm": 0.8137654207236353, "learning_rate": 2e-05, "loss": 0.6076, "step": 261 }, { "epoch": 8.15625, "eval_loss": 0.685956597328186, "eval_runtime": 43.6958, "eval_samples_per_second": 4.577, "eval_steps_per_second": 0.572, "step": 261 }, { "epoch": 8.1875, "grad_norm": 0.7920289131050305, "learning_rate": 2e-05, "loss": 0.723, "step": 262 }, { "epoch": 8.1875, "eval_loss": 0.6895143389701843, "eval_runtime": 44.5485, "eval_samples_per_second": 4.489, "eval_steps_per_second": 0.561, "step": 262 }, { "epoch": 8.21875, "grad_norm": 0.9058951636873679, "learning_rate": 2e-05, "loss": 0.5836, "step": 263 }, { "epoch": 8.21875, "eval_loss": 0.6920652985572815, "eval_runtime": 43.2986, "eval_samples_per_second": 4.619, "eval_steps_per_second": 0.577, "step": 263 }, { "epoch": 8.25, "grad_norm": 0.8945234539908303, "learning_rate": 2e-05, "loss": 0.6484, "step": 264 }, { "epoch": 8.25, "eval_loss": 0.69307541847229, "eval_runtime": 43.0302, "eval_samples_per_second": 4.648, "eval_steps_per_second": 0.581, "step": 264 }, { "epoch": 8.28125, "grad_norm": 0.9973855113532047, "learning_rate": 2e-05, "loss": 0.6735, "step": 265 }, { "epoch": 8.28125, "eval_loss": 0.6918882727622986, "eval_runtime": 43.035, "eval_samples_per_second": 4.647, "eval_steps_per_second": 0.581, "step": 265 }, { "epoch": 8.3125, "grad_norm": 0.8604633375599925, "learning_rate": 2e-05, "loss": 0.6618, "step": 266 }, { "epoch": 8.3125, "eval_loss": 0.6895372867584229, "eval_runtime": 43.31, "eval_samples_per_second": 4.618, "eval_steps_per_second": 0.577, "step": 266 }, { "epoch": 8.34375, "grad_norm": 0.8414418828391491, "learning_rate": 2e-05, "loss": 0.5879, "step": 267 }, { "epoch": 8.34375, "eval_loss": 0.687466561794281, "eval_runtime": 43.1943, "eval_samples_per_second": 4.63, "eval_steps_per_second": 0.579, "step": 267 }, { "epoch": 8.375, "grad_norm": 0.9186307751895403, "learning_rate": 2e-05, "loss": 0.6488, "step": 268 }, { "epoch": 8.375, "eval_loss": 0.6843683123588562, "eval_runtime": 43.0073, "eval_samples_per_second": 4.65, "eval_steps_per_second": 0.581, "step": 268 }, { "epoch": 8.40625, "grad_norm": 0.8308076771594943, "learning_rate": 2e-05, "loss": 0.6357, "step": 269 }, { "epoch": 8.40625, "eval_loss": 0.6821109056472778, "eval_runtime": 43.2217, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.578, "step": 269 }, { "epoch": 8.4375, "grad_norm": 0.7743250830620387, "learning_rate": 2e-05, "loss": 0.6408, "step": 270 }, { "epoch": 8.4375, "eval_loss": 0.6811809539794922, "eval_runtime": 44.8789, "eval_samples_per_second": 4.456, "eval_steps_per_second": 0.557, "step": 270 }, { "epoch": 8.46875, "grad_norm": 0.8351441656367814, "learning_rate": 2e-05, "loss": 0.5387, "step": 271 }, { "epoch": 8.46875, "eval_loss": 0.6824797987937927, "eval_runtime": 43.2723, "eval_samples_per_second": 4.622, "eval_steps_per_second": 0.578, "step": 271 }, { "epoch": 8.5, "grad_norm": 0.7808346216305826, "learning_rate": 2e-05, "loss": 0.637, "step": 272 }, { "epoch": 8.5, "eval_loss": 0.6853922009468079, "eval_runtime": 43.1091, "eval_samples_per_second": 4.639, "eval_steps_per_second": 0.58, "step": 272 }, { "epoch": 8.53125, "grad_norm": 0.8566382439854656, "learning_rate": 2e-05, "loss": 0.6524, "step": 273 }, { "epoch": 8.53125, "eval_loss": 0.6853267550468445, "eval_runtime": 43.1515, "eval_samples_per_second": 4.635, "eval_steps_per_second": 0.579, "step": 273 }, { "epoch": 8.5625, "grad_norm": 0.872427052560813, "learning_rate": 2e-05, "loss": 0.6393, "step": 274 }, { "epoch": 8.5625, "eval_loss": 0.6836146712303162, "eval_runtime": 44.9084, "eval_samples_per_second": 4.454, "eval_steps_per_second": 0.557, "step": 274 }, { "epoch": 8.59375, "grad_norm": 0.8437899827314175, "learning_rate": 2e-05, "loss": 0.6506, "step": 275 }, { "epoch": 8.59375, "eval_loss": 0.6817864179611206, "eval_runtime": 44.8879, "eval_samples_per_second": 4.456, "eval_steps_per_second": 0.557, "step": 275 }, { "epoch": 8.625, "grad_norm": 0.8790612317241222, "learning_rate": 2e-05, "loss": 0.6442, "step": 276 }, { "epoch": 8.625, "eval_loss": 0.6796035766601562, "eval_runtime": 43.3127, "eval_samples_per_second": 4.618, "eval_steps_per_second": 0.577, "step": 276 }, { "epoch": 8.65625, "grad_norm": 0.8158092597576191, "learning_rate": 2e-05, "loss": 0.5893, "step": 277 }, { "epoch": 8.65625, "eval_loss": 0.6795459985733032, "eval_runtime": 44.6925, "eval_samples_per_second": 4.475, "eval_steps_per_second": 0.559, "step": 277 }, { "epoch": 8.6875, "grad_norm": 0.878065597316925, "learning_rate": 2e-05, "loss": 0.6418, "step": 278 }, { "epoch": 8.6875, "eval_loss": 0.6804844737052917, "eval_runtime": 43.1496, "eval_samples_per_second": 4.635, "eval_steps_per_second": 0.579, "step": 278 }, { "epoch": 8.71875, "grad_norm": 0.8184085366861941, "learning_rate": 2e-05, "loss": 0.6007, "step": 279 }, { "epoch": 8.71875, "eval_loss": 0.6821385025978088, "eval_runtime": 44.9156, "eval_samples_per_second": 4.453, "eval_steps_per_second": 0.557, "step": 279 }, { "epoch": 8.75, "grad_norm": 0.9005368790411379, "learning_rate": 2e-05, "loss": 0.6683, "step": 280 }, { "epoch": 8.75, "eval_loss": 0.6848174333572388, "eval_runtime": 43.544, "eval_samples_per_second": 4.593, "eval_steps_per_second": 0.574, "step": 280 }, { "epoch": 8.78125, "grad_norm": 0.8154265443661354, "learning_rate": 2e-05, "loss": 0.612, "step": 281 }, { "epoch": 8.78125, "eval_loss": 0.6864734888076782, "eval_runtime": 44.5814, "eval_samples_per_second": 4.486, "eval_steps_per_second": 0.561, "step": 281 }, { "epoch": 8.8125, "grad_norm": 0.8905054313305548, "learning_rate": 2e-05, "loss": 0.5992, "step": 282 }, { "epoch": 8.8125, "eval_loss": 0.6864038109779358, "eval_runtime": 44.0748, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.567, "step": 282 }, { "epoch": 8.84375, "grad_norm": 0.8492838619646935, "learning_rate": 2e-05, "loss": 0.5775, "step": 283 }, { "epoch": 8.84375, "eval_loss": 0.686205267906189, "eval_runtime": 44.1813, "eval_samples_per_second": 4.527, "eval_steps_per_second": 0.566, "step": 283 }, { "epoch": 8.875, "grad_norm": 0.8739982729224768, "learning_rate": 2e-05, "loss": 0.5447, "step": 284 }, { "epoch": 8.875, "eval_loss": 0.6865501403808594, "eval_runtime": 46.5428, "eval_samples_per_second": 4.297, "eval_steps_per_second": 0.537, "step": 284 }, { "epoch": 8.90625, "grad_norm": 0.9936570525936491, "learning_rate": 2e-05, "loss": 0.59, "step": 285 }, { "epoch": 8.90625, "eval_loss": 0.6856868267059326, "eval_runtime": 44.6352, "eval_samples_per_second": 4.481, "eval_steps_per_second": 0.56, "step": 285 }, { "epoch": 8.9375, "grad_norm": 0.9517307402112732, "learning_rate": 2e-05, "loss": 0.7253, "step": 286 }, { "epoch": 8.9375, "eval_loss": 0.6847086548805237, "eval_runtime": 47.1289, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.53, "step": 286 }, { "epoch": 8.96875, "grad_norm": 0.8541430299481336, "learning_rate": 2e-05, "loss": 0.6436, "step": 287 }, { "epoch": 8.96875, "eval_loss": 0.6847487092018127, "eval_runtime": 46.395, "eval_samples_per_second": 4.311, "eval_steps_per_second": 0.539, "step": 287 }, { "epoch": 9.0, "grad_norm": 0.9356185152979635, "learning_rate": 2e-05, "loss": 0.5919, "step": 288 }, { "epoch": 9.0, "eval_loss": 0.6830996870994568, "eval_runtime": 45.0389, "eval_samples_per_second": 4.441, "eval_steps_per_second": 0.555, "step": 288 }, { "epoch": 9.03125, "grad_norm": 0.895841912664687, "learning_rate": 2e-05, "loss": 0.6074, "step": 289 }, { "epoch": 9.03125, "eval_loss": 0.6805940866470337, "eval_runtime": 43.275, "eval_samples_per_second": 4.622, "eval_steps_per_second": 0.578, "step": 289 }, { "epoch": 9.0625, "grad_norm": 0.8181374187415763, "learning_rate": 2e-05, "loss": 0.6233, "step": 290 }, { "epoch": 9.0625, "eval_loss": 0.679899275302887, "eval_runtime": 43.4137, "eval_samples_per_second": 4.607, "eval_steps_per_second": 0.576, "step": 290 }, { "epoch": 9.09375, "grad_norm": 0.8491986564498026, "learning_rate": 2e-05, "loss": 0.6262, "step": 291 }, { "epoch": 9.09375, "eval_loss": 0.682360053062439, "eval_runtime": 45.2147, "eval_samples_per_second": 4.423, "eval_steps_per_second": 0.553, "step": 291 }, { "epoch": 9.125, "grad_norm": 0.9355368723165358, "learning_rate": 2e-05, "loss": 0.595, "step": 292 }, { "epoch": 9.125, "eval_loss": 0.6852359175682068, "eval_runtime": 44.335, "eval_samples_per_second": 4.511, "eval_steps_per_second": 0.564, "step": 292 }, { "epoch": 9.15625, "grad_norm": 0.9196086439363605, "learning_rate": 2e-05, "loss": 0.6534, "step": 293 }, { "epoch": 9.15625, "eval_loss": 0.6872662305831909, "eval_runtime": 46.5007, "eval_samples_per_second": 4.301, "eval_steps_per_second": 0.538, "step": 293 }, { "epoch": 9.1875, "grad_norm": 0.8393737542433595, "learning_rate": 2e-05, "loss": 0.5908, "step": 294 }, { "epoch": 9.1875, "eval_loss": 0.6902926564216614, "eval_runtime": 46.1133, "eval_samples_per_second": 4.337, "eval_steps_per_second": 0.542, "step": 294 }, { "epoch": 9.21875, "grad_norm": 1.00507877022181, "learning_rate": 2e-05, "loss": 0.536, "step": 295 }, { "epoch": 9.21875, "eval_loss": 0.6969813704490662, "eval_runtime": 45.209, "eval_samples_per_second": 4.424, "eval_steps_per_second": 0.553, "step": 295 }, { "epoch": 9.25, "grad_norm": 0.9241164807887086, "learning_rate": 2e-05, "loss": 0.5562, "step": 296 }, { "epoch": 9.25, "eval_loss": 0.7055781483650208, "eval_runtime": 44.1347, "eval_samples_per_second": 4.532, "eval_steps_per_second": 0.566, "step": 296 }, { "epoch": 9.28125, "grad_norm": 1.085449108925152, "learning_rate": 2e-05, "loss": 0.6582, "step": 297 }, { "epoch": 9.28125, "eval_loss": 0.7090529799461365, "eval_runtime": 46.0924, "eval_samples_per_second": 4.339, "eval_steps_per_second": 0.542, "step": 297 }, { "epoch": 9.3125, "grad_norm": 1.2857794830276748, "learning_rate": 2e-05, "loss": 0.5942, "step": 298 }, { "epoch": 9.3125, "eval_loss": 0.7092991471290588, "eval_runtime": 45.9455, "eval_samples_per_second": 4.353, "eval_steps_per_second": 0.544, "step": 298 }, { "epoch": 9.34375, "grad_norm": 1.1012657793973455, "learning_rate": 2e-05, "loss": 0.5681, "step": 299 }, { "epoch": 9.34375, "eval_loss": 0.7078263759613037, "eval_runtime": 44.3361, "eval_samples_per_second": 4.511, "eval_steps_per_second": 0.564, "step": 299 }, { "epoch": 9.375, "grad_norm": 1.0150133491916107, "learning_rate": 2e-05, "loss": 0.5829, "step": 300 }, { "epoch": 9.375, "eval_loss": 0.7039945721626282, "eval_runtime": 46.0368, "eval_samples_per_second": 4.344, "eval_steps_per_second": 0.543, "step": 300 }, { "epoch": 9.40625, "grad_norm": 1.0183449928898174, "learning_rate": 2e-05, "loss": 0.5622, "step": 301 }, { "epoch": 9.40625, "eval_loss": 0.6970013380050659, "eval_runtime": 44.3071, "eval_samples_per_second": 4.514, "eval_steps_per_second": 0.564, "step": 301 }, { "epoch": 9.4375, "grad_norm": 1.160561076731859, "learning_rate": 2e-05, "loss": 0.6207, "step": 302 }, { "epoch": 9.4375, "eval_loss": 0.6882898211479187, "eval_runtime": 44.3423, "eval_samples_per_second": 4.51, "eval_steps_per_second": 0.564, "step": 302 }, { "epoch": 9.46875, "grad_norm": 0.9775130871533282, "learning_rate": 2e-05, "loss": 0.6121, "step": 303 }, { "epoch": 9.46875, "eval_loss": 0.6842953562736511, "eval_runtime": 45.0998, "eval_samples_per_second": 4.435, "eval_steps_per_second": 0.554, "step": 303 }, { "epoch": 9.5, "grad_norm": 0.8440645832373606, "learning_rate": 2e-05, "loss": 0.6495, "step": 304 }, { "epoch": 9.5, "eval_loss": 0.6841378808021545, "eval_runtime": 44.4679, "eval_samples_per_second": 4.498, "eval_steps_per_second": 0.562, "step": 304 }, { "epoch": 9.53125, "grad_norm": 0.9112261594523882, "learning_rate": 2e-05, "loss": 0.6188, "step": 305 }, { "epoch": 9.53125, "eval_loss": 0.6845135688781738, "eval_runtime": 44.4427, "eval_samples_per_second": 4.5, "eval_steps_per_second": 0.563, "step": 305 }, { "epoch": 9.5625, "grad_norm": 1.0253409237396724, "learning_rate": 2e-05, "loss": 0.602, "step": 306 }, { "epoch": 9.5625, "eval_loss": 0.6839584112167358, "eval_runtime": 44.1975, "eval_samples_per_second": 4.525, "eval_steps_per_second": 0.566, "step": 306 }, { "epoch": 9.59375, "grad_norm": 1.0395385110757185, "learning_rate": 2e-05, "loss": 0.6007, "step": 307 }, { "epoch": 9.59375, "eval_loss": 0.6852008104324341, "eval_runtime": 44.4015, "eval_samples_per_second": 4.504, "eval_steps_per_second": 0.563, "step": 307 }, { "epoch": 9.625, "grad_norm": 0.9468230481893222, "learning_rate": 2e-05, "loss": 0.6376, "step": 308 }, { "epoch": 9.625, "eval_loss": 0.6902636885643005, "eval_runtime": 45.6849, "eval_samples_per_second": 4.378, "eval_steps_per_second": 0.547, "step": 308 }, { "epoch": 9.65625, "grad_norm": 0.9298141136824676, "learning_rate": 2e-05, "loss": 0.6094, "step": 309 }, { "epoch": 9.65625, "eval_loss": 0.6970698833465576, "eval_runtime": 44.2879, "eval_samples_per_second": 4.516, "eval_steps_per_second": 0.564, "step": 309 }, { "epoch": 9.6875, "grad_norm": 1.2537810836544294, "learning_rate": 2e-05, "loss": 0.6049, "step": 310 }, { "epoch": 9.6875, "eval_loss": 0.6991828083992004, "eval_runtime": 46.2429, "eval_samples_per_second": 4.325, "eval_steps_per_second": 0.541, "step": 310 }, { "epoch": 9.71875, "grad_norm": 1.082420692181638, "learning_rate": 2e-05, "loss": 0.5241, "step": 311 }, { "epoch": 9.71875, "eval_loss": 0.7002778649330139, "eval_runtime": 44.2468, "eval_samples_per_second": 4.52, "eval_steps_per_second": 0.565, "step": 311 }, { "epoch": 9.75, "grad_norm": 1.0383910110357883, "learning_rate": 2e-05, "loss": 0.6162, "step": 312 }, { "epoch": 9.75, "eval_loss": 0.7004844546318054, "eval_runtime": 44.357, "eval_samples_per_second": 4.509, "eval_steps_per_second": 0.564, "step": 312 }, { "epoch": 9.78125, "grad_norm": 0.9375392905585037, "learning_rate": 2e-05, "loss": 0.6082, "step": 313 }, { "epoch": 9.78125, "eval_loss": 0.6998957991600037, "eval_runtime": 44.3911, "eval_samples_per_second": 4.505, "eval_steps_per_second": 0.563, "step": 313 }, { "epoch": 9.8125, "grad_norm": 1.080227501802435, "learning_rate": 2e-05, "loss": 0.5826, "step": 314 }, { "epoch": 9.8125, "eval_loss": 0.698168158531189, "eval_runtime": 44.2481, "eval_samples_per_second": 4.52, "eval_steps_per_second": 0.565, "step": 314 }, { "epoch": 9.84375, "grad_norm": 0.9707388919250783, "learning_rate": 2e-05, "loss": 0.61, "step": 315 }, { "epoch": 9.84375, "eval_loss": 0.6951956152915955, "eval_runtime": 44.4353, "eval_samples_per_second": 4.501, "eval_steps_per_second": 0.563, "step": 315 }, { "epoch": 9.875, "grad_norm": 0.9491238644745222, "learning_rate": 2e-05, "loss": 0.5957, "step": 316 }, { "epoch": 9.875, "eval_loss": 0.6926063299179077, "eval_runtime": 45.2893, "eval_samples_per_second": 4.416, "eval_steps_per_second": 0.552, "step": 316 }, { "epoch": 9.90625, "grad_norm": 1.0530872213679219, "learning_rate": 2e-05, "loss": 0.5611, "step": 317 }, { "epoch": 9.90625, "eval_loss": 0.6899718642234802, "eval_runtime": 45.2963, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.552, "step": 317 }, { "epoch": 9.9375, "grad_norm": 1.0052684640770637, "learning_rate": 2e-05, "loss": 0.5838, "step": 318 }, { "epoch": 9.9375, "eval_loss": 0.6875657439231873, "eval_runtime": 45.4935, "eval_samples_per_second": 4.396, "eval_steps_per_second": 0.55, "step": 318 }, { "epoch": 9.96875, "grad_norm": 1.1010229534386275, "learning_rate": 2e-05, "loss": 0.6106, "step": 319 }, { "epoch": 9.96875, "eval_loss": 0.6842039227485657, "eval_runtime": 46.6347, "eval_samples_per_second": 4.289, "eval_steps_per_second": 0.536, "step": 319 }, { "epoch": 10.0, "grad_norm": 1.0125768255592298, "learning_rate": 2e-05, "loss": 0.5698, "step": 320 }, { "epoch": 10.0, "eval_loss": 0.6834940910339355, "eval_runtime": 45.1875, "eval_samples_per_second": 4.426, "eval_steps_per_second": 0.553, "step": 320 }, { "epoch": 10.0, "step": 320, "total_flos": 414794833330176.0, "train_loss": 0.12300790920853615, "train_runtime": 3683.354, "train_samples_per_second": 2.715, "train_steps_per_second": 0.087 } ], "logging_steps": 1.0, "max_steps": 320, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 414794833330176.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }