{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 128, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0078125, "grad_norm": 0.61328125, "learning_rate": 1e-05, "loss": 2.1689, "step": 1 }, { "epoch": 0.015625, "grad_norm": 0.58984375, "learning_rate": 9.921875e-06, "loss": 2.1485, "step": 2 }, { "epoch": 0.0234375, "grad_norm": 0.59375, "learning_rate": 9.84375e-06, "loss": 2.1564, "step": 3 }, { "epoch": 0.03125, "grad_norm": 0.5546875, "learning_rate": 9.765625e-06, "loss": 2.2022, "step": 4 }, { "epoch": 0.0390625, "grad_norm": 0.55078125, "learning_rate": 9.6875e-06, "loss": 2.0902, "step": 5 }, { "epoch": 0.046875, "grad_norm": 0.5859375, "learning_rate": 9.609375000000001e-06, "loss": 2.1296, "step": 6 }, { "epoch": 0.0546875, "grad_norm": 0.51171875, "learning_rate": 9.531250000000001e-06, "loss": 2.1615, "step": 7 }, { "epoch": 0.0625, "grad_norm": 0.91015625, "learning_rate": 9.453125000000001e-06, "loss": 2.2045, "step": 8 }, { "epoch": 0.0703125, "grad_norm": 0.474609375, "learning_rate": 9.375000000000001e-06, "loss": 2.0598, "step": 9 }, { "epoch": 0.078125, "grad_norm": 0.47265625, "learning_rate": 9.296875e-06, "loss": 2.0913, "step": 10 }, { "epoch": 0.0859375, "grad_norm": 0.462890625, "learning_rate": 9.21875e-06, "loss": 2.0043, "step": 11 }, { "epoch": 0.09375, "grad_norm": 0.4609375, "learning_rate": 9.140625e-06, "loss": 1.973, "step": 12 }, { "epoch": 0.1015625, "grad_norm": 0.4609375, "learning_rate": 9.0625e-06, "loss": 1.9907, "step": 13 }, { "epoch": 0.109375, "grad_norm": 0.478515625, "learning_rate": 8.984375000000002e-06, "loss": 2.0225, "step": 14 }, { "epoch": 0.1171875, "grad_norm": 0.455078125, "learning_rate": 8.906250000000001e-06, "loss": 1.868, "step": 15 }, { "epoch": 0.125, "grad_norm": 0.48046875, "learning_rate": 8.828125000000001e-06, "loss": 1.9023, "step": 16 }, { "epoch": 0.1328125, "grad_norm": 0.474609375, "learning_rate": 8.750000000000001e-06, "loss": 1.8461, "step": 17 }, { "epoch": 0.140625, "grad_norm": 0.46875, "learning_rate": 8.671875e-06, "loss": 1.8176, "step": 18 }, { "epoch": 0.1484375, "grad_norm": 0.474609375, "learning_rate": 8.59375e-06, "loss": 1.7922, "step": 19 }, { "epoch": 0.15625, "grad_norm": 0.4375, "learning_rate": 8.515625e-06, "loss": 1.7756, "step": 20 }, { "epoch": 0.1640625, "grad_norm": 0.421875, "learning_rate": 8.4375e-06, "loss": 1.7932, "step": 21 }, { "epoch": 0.171875, "grad_norm": 0.408203125, "learning_rate": 8.359375e-06, "loss": 1.702, "step": 22 }, { "epoch": 0.1796875, "grad_norm": 0.4375, "learning_rate": 8.281250000000001e-06, "loss": 1.7802, "step": 23 }, { "epoch": 0.1875, "grad_norm": 0.392578125, "learning_rate": 8.203125000000001e-06, "loss": 1.759, "step": 24 }, { "epoch": 0.1953125, "grad_norm": 0.357421875, "learning_rate": 8.125000000000001e-06, "loss": 1.5837, "step": 25 }, { "epoch": 0.203125, "grad_norm": 0.357421875, "learning_rate": 8.046875e-06, "loss": 1.6122, "step": 26 }, { "epoch": 0.2109375, "grad_norm": 0.392578125, "learning_rate": 7.96875e-06, "loss": 1.7369, "step": 27 }, { "epoch": 0.21875, "grad_norm": 0.341796875, "learning_rate": 7.890625e-06, "loss": 1.5856, "step": 28 }, { "epoch": 0.2265625, "grad_norm": 0.34375, "learning_rate": 7.8125e-06, "loss": 1.5203, "step": 29 }, { "epoch": 0.234375, "grad_norm": 0.322265625, "learning_rate": 7.734375e-06, "loss": 1.5575, "step": 30 }, { "epoch": 0.2421875, "grad_norm": 0.33203125, "learning_rate": 7.656250000000001e-06, "loss": 1.6101, "step": 31 }, { "epoch": 0.25, "grad_norm": 0.361328125, "learning_rate": 7.578125e-06, "loss": 1.6346, "step": 32 }, { "epoch": 0.2578125, "grad_norm": 0.486328125, "learning_rate": 7.500000000000001e-06, "loss": 1.5728, "step": 33 }, { "epoch": 0.265625, "grad_norm": 0.32421875, "learning_rate": 7.421875000000001e-06, "loss": 1.5461, "step": 34 }, { "epoch": 0.2734375, "grad_norm": 0.3046875, "learning_rate": 7.343750000000001e-06, "loss": 1.4986, "step": 35 }, { "epoch": 0.28125, "grad_norm": 0.33203125, "learning_rate": 7.265625e-06, "loss": 1.5123, "step": 36 }, { "epoch": 0.2890625, "grad_norm": 0.28515625, "learning_rate": 7.1875e-06, "loss": 1.4697, "step": 37 }, { "epoch": 0.296875, "grad_norm": 0.2890625, "learning_rate": 7.109375000000001e-06, "loss": 1.501, "step": 38 }, { "epoch": 0.3046875, "grad_norm": 0.296875, "learning_rate": 7.031250000000001e-06, "loss": 1.4536, "step": 39 }, { "epoch": 0.3125, "grad_norm": 0.376953125, "learning_rate": 6.9531250000000004e-06, "loss": 1.5039, "step": 40 }, { "epoch": 0.3203125, "grad_norm": 0.28125, "learning_rate": 6.875e-06, "loss": 1.4613, "step": 41 }, { "epoch": 0.328125, "grad_norm": 0.287109375, "learning_rate": 6.796875000000001e-06, "loss": 1.4792, "step": 42 }, { "epoch": 0.3359375, "grad_norm": 0.251953125, "learning_rate": 6.718750000000001e-06, "loss": 1.4381, "step": 43 }, { "epoch": 0.34375, "grad_norm": 0.26171875, "learning_rate": 6.6406250000000005e-06, "loss": 1.4575, "step": 44 }, { "epoch": 0.3515625, "grad_norm": 0.2890625, "learning_rate": 6.5625e-06, "loss": 1.4558, "step": 45 }, { "epoch": 0.359375, "grad_norm": 0.26171875, "learning_rate": 6.484375000000001e-06, "loss": 1.4537, "step": 46 }, { "epoch": 0.3671875, "grad_norm": 0.2421875, "learning_rate": 6.406250000000001e-06, "loss": 1.3925, "step": 47 }, { "epoch": 0.375, "grad_norm": 0.275390625, "learning_rate": 6.3281250000000005e-06, "loss": 1.5034, "step": 48 }, { "epoch": 0.3828125, "grad_norm": 0.27734375, "learning_rate": 6.25e-06, "loss": 1.4933, "step": 49 }, { "epoch": 0.390625, "grad_norm": 0.27734375, "learning_rate": 6.171875e-06, "loss": 1.4682, "step": 50 }, { "epoch": 0.3984375, "grad_norm": 0.2412109375, "learning_rate": 6.093750000000001e-06, "loss": 1.3798, "step": 51 }, { "epoch": 0.40625, "grad_norm": 0.2412109375, "learning_rate": 6.0156250000000005e-06, "loss": 1.4287, "step": 52 }, { "epoch": 0.4140625, "grad_norm": 0.259765625, "learning_rate": 5.9375e-06, "loss": 1.4116, "step": 53 }, { "epoch": 0.421875, "grad_norm": 0.27734375, "learning_rate": 5.859375e-06, "loss": 1.391, "step": 54 }, { "epoch": 0.4296875, "grad_norm": 0.24609375, "learning_rate": 5.781250000000001e-06, "loss": 1.366, "step": 55 }, { "epoch": 0.4375, "grad_norm": 0.23828125, "learning_rate": 5.7031250000000006e-06, "loss": 1.3947, "step": 56 }, { "epoch": 0.4453125, "grad_norm": 0.228515625, "learning_rate": 5.625e-06, "loss": 1.3593, "step": 57 }, { "epoch": 0.453125, "grad_norm": 0.251953125, "learning_rate": 5.546875e-06, "loss": 1.3858, "step": 58 }, { "epoch": 0.4609375, "grad_norm": 0.412109375, "learning_rate": 5.468750000000001e-06, "loss": 1.2658, "step": 59 }, { "epoch": 0.46875, "grad_norm": 0.2373046875, "learning_rate": 5.390625000000001e-06, "loss": 1.3146, "step": 60 }, { "epoch": 0.4765625, "grad_norm": 0.2333984375, "learning_rate": 5.3125e-06, "loss": 1.3277, "step": 61 }, { "epoch": 0.484375, "grad_norm": 0.2353515625, "learning_rate": 5.234375e-06, "loss": 1.3387, "step": 62 }, { "epoch": 0.4921875, "grad_norm": 0.236328125, "learning_rate": 5.156250000000001e-06, "loss": 1.2693, "step": 63 }, { "epoch": 0.5, "grad_norm": 0.283203125, "learning_rate": 5.078125000000001e-06, "loss": 1.4156, "step": 64 }, { "epoch": 0.5078125, "grad_norm": 0.22265625, "learning_rate": 5e-06, "loss": 1.3413, "step": 65 }, { "epoch": 0.515625, "grad_norm": 0.27734375, "learning_rate": 4.921875e-06, "loss": 1.2651, "step": 66 }, { "epoch": 0.5234375, "grad_norm": 0.28515625, "learning_rate": 4.84375e-06, "loss": 1.2676, "step": 67 }, { "epoch": 0.53125, "grad_norm": 0.28515625, "learning_rate": 4.765625000000001e-06, "loss": 1.3619, "step": 68 }, { "epoch": 0.5390625, "grad_norm": 0.265625, "learning_rate": 4.6875000000000004e-06, "loss": 1.293, "step": 69 }, { "epoch": 0.546875, "grad_norm": 0.294921875, "learning_rate": 4.609375e-06, "loss": 1.3701, "step": 70 }, { "epoch": 0.5546875, "grad_norm": 0.263671875, "learning_rate": 4.53125e-06, "loss": 1.3667, "step": 71 }, { "epoch": 0.5625, "grad_norm": 0.216796875, "learning_rate": 4.453125000000001e-06, "loss": 1.312, "step": 72 }, { "epoch": 0.5703125, "grad_norm": 0.2255859375, "learning_rate": 4.3750000000000005e-06, "loss": 1.2787, "step": 73 }, { "epoch": 0.578125, "grad_norm": 0.2021484375, "learning_rate": 4.296875e-06, "loss": 1.2332, "step": 74 }, { "epoch": 0.5859375, "grad_norm": 0.216796875, "learning_rate": 4.21875e-06, "loss": 1.2995, "step": 75 }, { "epoch": 0.59375, "grad_norm": 0.296875, "learning_rate": 4.140625000000001e-06, "loss": 1.2753, "step": 76 }, { "epoch": 0.6015625, "grad_norm": 0.2080078125, "learning_rate": 4.0625000000000005e-06, "loss": 1.2731, "step": 77 }, { "epoch": 0.609375, "grad_norm": 0.25390625, "learning_rate": 3.984375e-06, "loss": 1.2729, "step": 78 }, { "epoch": 0.6171875, "grad_norm": 0.224609375, "learning_rate": 3.90625e-06, "loss": 1.2847, "step": 79 }, { "epoch": 0.625, "grad_norm": 0.2353515625, "learning_rate": 3.828125000000001e-06, "loss": 1.2964, "step": 80 }, { "epoch": 0.6328125, "grad_norm": 0.2177734375, "learning_rate": 3.7500000000000005e-06, "loss": 1.3205, "step": 81 }, { "epoch": 0.640625, "grad_norm": 0.2119140625, "learning_rate": 3.6718750000000003e-06, "loss": 1.2755, "step": 82 }, { "epoch": 0.6484375, "grad_norm": 0.21875, "learning_rate": 3.59375e-06, "loss": 1.2444, "step": 83 }, { "epoch": 0.65625, "grad_norm": 0.21875, "learning_rate": 3.5156250000000003e-06, "loss": 1.2853, "step": 84 }, { "epoch": 0.6640625, "grad_norm": 0.337890625, "learning_rate": 3.4375e-06, "loss": 1.3118, "step": 85 }, { "epoch": 0.671875, "grad_norm": 0.310546875, "learning_rate": 3.3593750000000003e-06, "loss": 1.2616, "step": 86 }, { "epoch": 0.6796875, "grad_norm": 0.2099609375, "learning_rate": 3.28125e-06, "loss": 1.2478, "step": 87 }, { "epoch": 0.6875, "grad_norm": 0.34765625, "learning_rate": 3.2031250000000004e-06, "loss": 1.2478, "step": 88 }, { "epoch": 0.6953125, "grad_norm": 0.19921875, "learning_rate": 3.125e-06, "loss": 1.1987, "step": 89 }, { "epoch": 0.703125, "grad_norm": 0.224609375, "learning_rate": 3.0468750000000004e-06, "loss": 1.3421, "step": 90 }, { "epoch": 0.7109375, "grad_norm": 0.203125, "learning_rate": 2.96875e-06, "loss": 1.2491, "step": 91 }, { "epoch": 0.71875, "grad_norm": 0.205078125, "learning_rate": 2.8906250000000004e-06, "loss": 1.2191, "step": 92 }, { "epoch": 0.7265625, "grad_norm": 0.212890625, "learning_rate": 2.8125e-06, "loss": 1.2694, "step": 93 }, { "epoch": 0.734375, "grad_norm": 0.283203125, "learning_rate": 2.7343750000000004e-06, "loss": 1.3663, "step": 94 }, { "epoch": 0.7421875, "grad_norm": 0.27734375, "learning_rate": 2.65625e-06, "loss": 1.29, "step": 95 }, { "epoch": 0.75, "grad_norm": 0.240234375, "learning_rate": 2.5781250000000004e-06, "loss": 1.266, "step": 96 }, { "epoch": 0.7578125, "grad_norm": 0.2275390625, "learning_rate": 2.5e-06, "loss": 1.2643, "step": 97 }, { "epoch": 0.765625, "grad_norm": 0.2392578125, "learning_rate": 2.421875e-06, "loss": 1.2281, "step": 98 }, { "epoch": 0.7734375, "grad_norm": 0.1982421875, "learning_rate": 2.3437500000000002e-06, "loss": 1.1966, "step": 99 }, { "epoch": 0.78125, "grad_norm": 0.197265625, "learning_rate": 2.265625e-06, "loss": 1.2174, "step": 100 }, { "epoch": 0.7890625, "grad_norm": 0.2001953125, "learning_rate": 2.1875000000000002e-06, "loss": 1.1622, "step": 101 }, { "epoch": 0.796875, "grad_norm": 0.2041015625, "learning_rate": 2.109375e-06, "loss": 1.2519, "step": 102 }, { "epoch": 0.8046875, "grad_norm": 0.2392578125, "learning_rate": 2.0312500000000002e-06, "loss": 1.3168, "step": 103 }, { "epoch": 0.8125, "grad_norm": 0.298828125, "learning_rate": 1.953125e-06, "loss": 1.2386, "step": 104 }, { "epoch": 0.8203125, "grad_norm": 0.2421875, "learning_rate": 1.8750000000000003e-06, "loss": 1.2855, "step": 105 }, { "epoch": 0.828125, "grad_norm": 0.240234375, "learning_rate": 1.796875e-06, "loss": 1.2156, "step": 106 }, { "epoch": 0.8359375, "grad_norm": 0.255859375, "learning_rate": 1.71875e-06, "loss": 1.2288, "step": 107 }, { "epoch": 0.84375, "grad_norm": 0.2255859375, "learning_rate": 1.640625e-06, "loss": 1.2893, "step": 108 }, { "epoch": 0.8515625, "grad_norm": 0.2451171875, "learning_rate": 1.5625e-06, "loss": 1.2798, "step": 109 }, { "epoch": 0.859375, "grad_norm": 0.2451171875, "learning_rate": 1.484375e-06, "loss": 1.28, "step": 110 }, { "epoch": 0.8671875, "grad_norm": 0.208984375, "learning_rate": 1.40625e-06, "loss": 1.2544, "step": 111 }, { "epoch": 0.875, "grad_norm": 0.275390625, "learning_rate": 1.328125e-06, "loss": 1.2865, "step": 112 }, { "epoch": 0.8828125, "grad_norm": 0.2119140625, "learning_rate": 1.25e-06, "loss": 1.2482, "step": 113 }, { "epoch": 0.890625, "grad_norm": 0.197265625, "learning_rate": 1.1718750000000001e-06, "loss": 1.1816, "step": 114 }, { "epoch": 0.8984375, "grad_norm": 0.208984375, "learning_rate": 1.0937500000000001e-06, "loss": 1.2452, "step": 115 }, { "epoch": 0.90625, "grad_norm": 0.2041015625, "learning_rate": 1.0156250000000001e-06, "loss": 1.2285, "step": 116 }, { "epoch": 0.9140625, "grad_norm": 0.2177734375, "learning_rate": 9.375000000000001e-07, "loss": 1.2178, "step": 117 }, { "epoch": 0.921875, "grad_norm": 0.216796875, "learning_rate": 8.59375e-07, "loss": 1.2501, "step": 118 }, { "epoch": 0.9296875, "grad_norm": 0.2080078125, "learning_rate": 7.8125e-07, "loss": 1.1922, "step": 119 }, { "epoch": 0.9375, "grad_norm": 0.2138671875, "learning_rate": 7.03125e-07, "loss": 1.247, "step": 120 }, { "epoch": 0.9453125, "grad_norm": 0.2080078125, "learning_rate": 6.25e-07, "loss": 1.2454, "step": 121 }, { "epoch": 0.953125, "grad_norm": 0.2001953125, "learning_rate": 5.468750000000001e-07, "loss": 1.1875, "step": 122 }, { "epoch": 0.9609375, "grad_norm": 0.2255859375, "learning_rate": 4.6875000000000006e-07, "loss": 1.254, "step": 123 }, { "epoch": 0.96875, "grad_norm": 0.2197265625, "learning_rate": 3.90625e-07, "loss": 1.2924, "step": 124 }, { "epoch": 0.9765625, "grad_norm": 0.2119140625, "learning_rate": 3.125e-07, "loss": 1.2041, "step": 125 }, { "epoch": 0.984375, "grad_norm": 0.2392578125, "learning_rate": 2.3437500000000003e-07, "loss": 1.2446, "step": 126 }, { "epoch": 0.9921875, "grad_norm": 0.212890625, "learning_rate": 1.5625e-07, "loss": 1.2699, "step": 127 }, { "epoch": 1.0, "grad_norm": 0.2314453125, "learning_rate": 7.8125e-08, "loss": 1.2642, "step": 128 } ], "logging_steps": 1.0, "max_steps": 128, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.109983433055273e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }