{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8720930232558137, "eval_steps": 500, "global_step": 112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03488372093023256, "grad_norm": 43.01280212402344, "learning_rate": 5.0000000000000004e-08, "loss": 2.7496, "step": 1 }, { "epoch": 0.06976744186046512, "grad_norm": 44.429630279541016, "learning_rate": 1.0000000000000001e-07, "loss": 2.7809, "step": 2 }, { "epoch": 0.10465116279069768, "grad_norm": 42.754234313964844, "learning_rate": 1.5000000000000002e-07, "loss": 2.6968, "step": 3 }, { "epoch": 0.13953488372093023, "grad_norm": 45.27891159057617, "learning_rate": 2.0000000000000002e-07, "loss": 2.8076, "step": 4 }, { "epoch": 0.1744186046511628, "grad_norm": 43.920860290527344, "learning_rate": 2.5000000000000004e-07, "loss": 2.7255, "step": 5 }, { "epoch": 0.20930232558139536, "grad_norm": 44.61790466308594, "learning_rate": 3.0000000000000004e-07, "loss": 2.7178, "step": 6 }, { "epoch": 0.2441860465116279, "grad_norm": 44.28205871582031, "learning_rate": 3.5000000000000004e-07, "loss": 2.7193, "step": 7 }, { "epoch": 0.27906976744186046, "grad_norm": 45.1516227722168, "learning_rate": 4.0000000000000003e-07, "loss": 2.6846, "step": 8 }, { "epoch": 0.313953488372093, "grad_norm": 46.8723258972168, "learning_rate": 4.5000000000000003e-07, "loss": 2.67, "step": 9 }, { "epoch": 0.3488372093023256, "grad_norm": 48.33848571777344, "learning_rate": 5.000000000000001e-07, "loss": 2.6368, "step": 10 }, { "epoch": 0.38372093023255816, "grad_norm": 46.772193908691406, "learning_rate": 5.5e-07, "loss": 2.5562, "step": 11 }, { "epoch": 0.4186046511627907, "grad_norm": 48.12417984008789, "learning_rate": 6.000000000000001e-07, "loss": 2.5362, "step": 12 }, { "epoch": 0.45348837209302323, "grad_norm": 49.0787239074707, "learning_rate": 6.5e-07, "loss": 2.3981, "step": 13 }, { "epoch": 0.4883720930232558, "grad_norm": 52.367183685302734, "learning_rate": 7.000000000000001e-07, "loss": 2.3837, "step": 14 }, { "epoch": 0.5232558139534884, "grad_norm": 53.57649230957031, "learning_rate": 7.5e-07, "loss": 2.2885, "step": 15 }, { "epoch": 0.5581395348837209, "grad_norm": 56.58591079711914, "learning_rate": 8.000000000000001e-07, "loss": 2.1447, "step": 16 }, { "epoch": 0.5930232558139535, "grad_norm": 55.064735412597656, "learning_rate": 8.500000000000001e-07, "loss": 2.0242, "step": 17 }, { "epoch": 0.627906976744186, "grad_norm": 52.61149597167969, "learning_rate": 9.000000000000001e-07, "loss": 1.9263, "step": 18 }, { "epoch": 0.6627906976744186, "grad_norm": 44.34925079345703, "learning_rate": 9.500000000000001e-07, "loss": 1.7466, "step": 19 }, { "epoch": 0.6976744186046512, "grad_norm": 36.627296447753906, "learning_rate": 1.0000000000000002e-06, "loss": 1.6084, "step": 20 }, { "epoch": 0.7325581395348837, "grad_norm": 30.89563751220703, "learning_rate": 1.0500000000000001e-06, "loss": 1.4701, "step": 21 }, { "epoch": 0.7674418604651163, "grad_norm": 29.17167091369629, "learning_rate": 1.1e-06, "loss": 1.4218, "step": 22 }, { "epoch": 0.8023255813953488, "grad_norm": 28.237022399902344, "learning_rate": 1.1500000000000002e-06, "loss": 1.3014, "step": 23 }, { "epoch": 0.8372093023255814, "grad_norm": 28.778654098510742, "learning_rate": 1.2000000000000002e-06, "loss": 1.1857, "step": 24 }, { "epoch": 0.872093023255814, "grad_norm": 29.368289947509766, "learning_rate": 1.25e-06, "loss": 1.082, "step": 25 }, { "epoch": 0.9069767441860465, "grad_norm": 28.46448516845703, "learning_rate": 1.3e-06, "loss": 0.8716, "step": 26 }, { "epoch": 0.9418604651162791, "grad_norm": 24.584318161010742, "learning_rate": 1.3500000000000002e-06, "loss": 0.6848, "step": 27 }, { "epoch": 0.9767441860465116, "grad_norm": 21.166847229003906, "learning_rate": 1.4000000000000001e-06, "loss": 0.5438, "step": 28 }, { "epoch": 1.0, "grad_norm": 21.166847229003906, "learning_rate": 1.45e-06, "loss": 0.4084, "step": 29 }, { "epoch": 1.0348837209302326, "grad_norm": 24.098440170288086, "learning_rate": 1.5e-06, "loss": 0.3545, "step": 30 }, { "epoch": 1.069767441860465, "grad_norm": 12.6813325881958, "learning_rate": 1.5500000000000002e-06, "loss": 0.2673, "step": 31 }, { "epoch": 1.1046511627906976, "grad_norm": 7.6524128913879395, "learning_rate": 1.6000000000000001e-06, "loss": 0.2033, "step": 32 }, { "epoch": 1.1395348837209303, "grad_norm": 5.23082160949707, "learning_rate": 1.6500000000000003e-06, "loss": 0.1887, "step": 33 }, { "epoch": 1.1744186046511629, "grad_norm": 4.729929447174072, "learning_rate": 1.7000000000000002e-06, "loss": 0.1683, "step": 34 }, { "epoch": 1.2093023255813953, "grad_norm": 5.831579208374023, "learning_rate": 1.75e-06, "loss": 0.1643, "step": 35 }, { "epoch": 1.244186046511628, "grad_norm": 4.030057430267334, "learning_rate": 1.8000000000000001e-06, "loss": 0.1528, "step": 36 }, { "epoch": 1.2790697674418605, "grad_norm": 3.560523509979248, "learning_rate": 1.85e-06, "loss": 0.1422, "step": 37 }, { "epoch": 1.3139534883720931, "grad_norm": 3.3749780654907227, "learning_rate": 1.9000000000000002e-06, "loss": 0.1327, "step": 38 }, { "epoch": 1.3488372093023255, "grad_norm": 2.7184131145477295, "learning_rate": 1.9500000000000004e-06, "loss": 0.1199, "step": 39 }, { "epoch": 1.3837209302325582, "grad_norm": 2.8681583404541016, "learning_rate": 2.0000000000000003e-06, "loss": 0.1107, "step": 40 }, { "epoch": 1.4186046511627908, "grad_norm": 2.8731987476348877, "learning_rate": 2.05e-06, "loss": 0.1063, "step": 41 }, { "epoch": 1.4534883720930232, "grad_norm": 2.9522054195404053, "learning_rate": 2.1000000000000002e-06, "loss": 0.0989, "step": 42 }, { "epoch": 1.4883720930232558, "grad_norm": 2.689038038253784, "learning_rate": 2.15e-06, "loss": 0.0949, "step": 43 }, { "epoch": 1.5232558139534884, "grad_norm": 2.6328952312469482, "learning_rate": 2.2e-06, "loss": 0.1006, "step": 44 }, { "epoch": 1.558139534883721, "grad_norm": 2.7056033611297607, "learning_rate": 2.25e-06, "loss": 0.0929, "step": 45 }, { "epoch": 1.5930232558139537, "grad_norm": 2.568206310272217, "learning_rate": 2.3000000000000004e-06, "loss": 0.0881, "step": 46 }, { "epoch": 1.627906976744186, "grad_norm": 2.714211940765381, "learning_rate": 2.35e-06, "loss": 0.096, "step": 47 }, { "epoch": 1.6627906976744184, "grad_norm": 2.3650710582733154, "learning_rate": 2.4000000000000003e-06, "loss": 0.086, "step": 48 }, { "epoch": 1.697674418604651, "grad_norm": 2.1763498783111572, "learning_rate": 2.4500000000000003e-06, "loss": 0.0858, "step": 49 }, { "epoch": 1.7325581395348837, "grad_norm": 2.124727487564087, "learning_rate": 2.5e-06, "loss": 0.0843, "step": 50 }, { "epoch": 1.7674418604651163, "grad_norm": 1.641269564628601, "learning_rate": 2.55e-06, "loss": 0.0803, "step": 51 }, { "epoch": 1.802325581395349, "grad_norm": 1.5644842386245728, "learning_rate": 2.6e-06, "loss": 0.0779, "step": 52 }, { "epoch": 1.8372093023255816, "grad_norm": 1.1404681205749512, "learning_rate": 2.6500000000000005e-06, "loss": 0.0765, "step": 53 }, { "epoch": 1.872093023255814, "grad_norm": 0.8386123776435852, "learning_rate": 2.7000000000000004e-06, "loss": 0.0749, "step": 54 }, { "epoch": 1.9069767441860463, "grad_norm": 1.8363338708877563, "learning_rate": 2.7500000000000004e-06, "loss": 0.0737, "step": 55 }, { "epoch": 1.941860465116279, "grad_norm": 1.2105377912521362, "learning_rate": 2.8000000000000003e-06, "loss": 0.0762, "step": 56 }, { "epoch": 1.9767441860465116, "grad_norm": 1.5502218008041382, "learning_rate": 2.85e-06, "loss": 0.07, "step": 57 }, { "epoch": 2.0, "grad_norm": 1.0641449689865112, "learning_rate": 2.9e-06, "loss": 0.0679, "step": 58 }, { "epoch": 2.0348837209302326, "grad_norm": 0.9201306104660034, "learning_rate": 2.95e-06, "loss": 0.0677, "step": 59 }, { "epoch": 2.0697674418604652, "grad_norm": 0.5951386094093323, "learning_rate": 3e-06, "loss": 0.0654, "step": 60 }, { "epoch": 2.104651162790698, "grad_norm": 0.8307608962059021, "learning_rate": 3.05e-06, "loss": 0.0706, "step": 61 }, { "epoch": 2.13953488372093, "grad_norm": 0.7110892534255981, "learning_rate": 3.1000000000000004e-06, "loss": 0.0665, "step": 62 }, { "epoch": 2.1744186046511627, "grad_norm": 0.6766234040260315, "learning_rate": 3.1500000000000003e-06, "loss": 0.0618, "step": 63 }, { "epoch": 2.2093023255813953, "grad_norm": 0.3967410922050476, "learning_rate": 3.2000000000000003e-06, "loss": 0.0644, "step": 64 }, { "epoch": 2.244186046511628, "grad_norm": 0.3713420331478119, "learning_rate": 3.2500000000000002e-06, "loss": 0.0607, "step": 65 }, { "epoch": 2.2790697674418605, "grad_norm": 0.5613359212875366, "learning_rate": 3.3000000000000006e-06, "loss": 0.0623, "step": 66 }, { "epoch": 2.313953488372093, "grad_norm": 0.5458635687828064, "learning_rate": 3.3500000000000005e-06, "loss": 0.0563, "step": 67 }, { "epoch": 2.3488372093023258, "grad_norm": 0.4196176528930664, "learning_rate": 3.4000000000000005e-06, "loss": 0.0547, "step": 68 }, { "epoch": 2.383720930232558, "grad_norm": 0.5757117867469788, "learning_rate": 3.45e-06, "loss": 0.0551, "step": 69 }, { "epoch": 2.4186046511627906, "grad_norm": 0.45777687430381775, "learning_rate": 3.5e-06, "loss": 0.0575, "step": 70 }, { "epoch": 2.453488372093023, "grad_norm": 0.5204553008079529, "learning_rate": 3.5500000000000003e-06, "loss": 0.057, "step": 71 }, { "epoch": 2.488372093023256, "grad_norm": 0.6110821962356567, "learning_rate": 3.6000000000000003e-06, "loss": 0.0556, "step": 72 }, { "epoch": 2.5232558139534884, "grad_norm": 0.45246991515159607, "learning_rate": 3.65e-06, "loss": 0.0564, "step": 73 }, { "epoch": 2.558139534883721, "grad_norm": 0.441976398229599, "learning_rate": 3.7e-06, "loss": 0.0534, "step": 74 }, { "epoch": 2.5930232558139537, "grad_norm": 0.46637651324272156, "learning_rate": 3.7500000000000005e-06, "loss": 0.0498, "step": 75 }, { "epoch": 2.6279069767441863, "grad_norm": 0.482038289308548, "learning_rate": 3.8000000000000005e-06, "loss": 0.0521, "step": 76 }, { "epoch": 2.6627906976744184, "grad_norm": 0.5914385318756104, "learning_rate": 3.85e-06, "loss": 0.0511, "step": 77 }, { "epoch": 2.697674418604651, "grad_norm": 0.34532907605171204, "learning_rate": 3.900000000000001e-06, "loss": 0.0527, "step": 78 }, { "epoch": 2.7325581395348837, "grad_norm": 0.35315006971359253, "learning_rate": 3.95e-06, "loss": 0.0486, "step": 79 }, { "epoch": 2.7674418604651163, "grad_norm": 0.4521324634552002, "learning_rate": 4.000000000000001e-06, "loss": 0.0485, "step": 80 }, { "epoch": 2.802325581395349, "grad_norm": 0.49457868933677673, "learning_rate": 4.05e-06, "loss": 0.0525, "step": 81 }, { "epoch": 2.8372093023255816, "grad_norm": 0.7616601586341858, "learning_rate": 4.1e-06, "loss": 0.0528, "step": 82 }, { "epoch": 2.8720930232558137, "grad_norm": 0.4791123569011688, "learning_rate": 4.15e-06, "loss": 0.0512, "step": 83 }, { "epoch": 2.9069767441860463, "grad_norm": 0.38186997175216675, "learning_rate": 4.2000000000000004e-06, "loss": 0.0442, "step": 84 }, { "epoch": 2.941860465116279, "grad_norm": 0.32648414373397827, "learning_rate": 4.25e-06, "loss": 0.0432, "step": 85 }, { "epoch": 2.9767441860465116, "grad_norm": 0.410888671875, "learning_rate": 4.3e-06, "loss": 0.042, "step": 86 }, { "epoch": 3.0, "grad_norm": 0.4732547104358673, "learning_rate": 4.350000000000001e-06, "loss": 0.0483, "step": 87 }, { "epoch": 3.0348837209302326, "grad_norm": 0.6372231841087341, "learning_rate": 4.4e-06, "loss": 0.0382, "step": 88 }, { "epoch": 3.0697674418604652, "grad_norm": 0.393078088760376, "learning_rate": 4.450000000000001e-06, "loss": 0.0403, "step": 89 }, { "epoch": 3.104651162790698, "grad_norm": 0.4366248846054077, "learning_rate": 4.5e-06, "loss": 0.0367, "step": 90 }, { "epoch": 3.13953488372093, "grad_norm": 0.301724374294281, "learning_rate": 4.5500000000000005e-06, "loss": 0.0351, "step": 91 }, { "epoch": 3.1744186046511627, "grad_norm": 0.37665656208992004, "learning_rate": 4.600000000000001e-06, "loss": 0.03, "step": 92 }, { "epoch": 3.2093023255813953, "grad_norm": 0.581331729888916, "learning_rate": 4.65e-06, "loss": 0.0316, "step": 93 }, { "epoch": 3.244186046511628, "grad_norm": 0.44136878848075867, "learning_rate": 4.7e-06, "loss": 0.0312, "step": 94 }, { "epoch": 3.2790697674418605, "grad_norm": 0.7624006271362305, "learning_rate": 4.75e-06, "loss": 0.0323, "step": 95 }, { "epoch": 3.313953488372093, "grad_norm": 0.35294386744499207, "learning_rate": 4.800000000000001e-06, "loss": 0.0286, "step": 96 }, { "epoch": 3.3488372093023258, "grad_norm": 0.6240035891532898, "learning_rate": 4.85e-06, "loss": 0.0289, "step": 97 }, { "epoch": 3.383720930232558, "grad_norm": 0.40580570697784424, "learning_rate": 4.9000000000000005e-06, "loss": 0.0259, "step": 98 }, { "epoch": 3.4186046511627906, "grad_norm": 0.2971636652946472, "learning_rate": 4.95e-06, "loss": 0.0251, "step": 99 }, { "epoch": 3.453488372093023, "grad_norm": 0.3758476972579956, "learning_rate": 5e-06, "loss": 0.0244, "step": 100 }, { "epoch": 3.488372093023256, "grad_norm": 0.3845921754837036, "learning_rate": 4.997332437005932e-06, "loss": 0.0242, "step": 101 }, { "epoch": 3.5232558139534884, "grad_norm": 0.48894616961479187, "learning_rate": 4.989335440737587e-06, "loss": 0.0252, "step": 102 }, { "epoch": 3.558139534883721, "grad_norm": 0.36844325065612793, "learning_rate": 4.976026077188013e-06, "loss": 0.0197, "step": 103 }, { "epoch": 3.5930232558139537, "grad_norm": 0.2800253629684448, "learning_rate": 4.957432749209755e-06, "loss": 0.0187, "step": 104 }, { "epoch": 3.6279069767441863, "grad_norm": 0.34042924642562866, "learning_rate": 4.933595135901733e-06, "loss": 0.0182, "step": 105 }, { "epoch": 3.6627906976744184, "grad_norm": 0.33217477798461914, "learning_rate": 4.904564107932048e-06, "loss": 0.0166, "step": 106 }, { "epoch": 3.697674418604651, "grad_norm": 0.32867398858070374, "learning_rate": 4.870401618977415e-06, "loss": 0.017, "step": 107 }, { "epoch": 3.7325581395348837, "grad_norm": 0.24199356138706207, "learning_rate": 4.83118057351089e-06, "loss": 0.0138, "step": 108 }, { "epoch": 3.7674418604651163, "grad_norm": 0.3216392695903778, "learning_rate": 4.786984671220053e-06, "loss": 0.0171, "step": 109 }, { "epoch": 3.802325581395349, "grad_norm": 0.3574189841747284, "learning_rate": 4.737908228387656e-06, "loss": 0.0146, "step": 110 }, { "epoch": 3.8372093023255816, "grad_norm": 0.30931738018989563, "learning_rate": 4.684055976615924e-06, "loss": 0.0128, "step": 111 }, { "epoch": 3.8720930232558137, "grad_norm": 0.2074785828590393, "learning_rate": 4.625542839324036e-06, "loss": 0.0089, "step": 112 } ], "logging_steps": 1, "max_steps": 168, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 28, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.521111556499177e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }