{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8351648351648353, "eval_steps": 500, "global_step": 88, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04395604395604396, "grad_norm": 12.68799877166748, "learning_rate": 0.0, "loss": 2.3586, "step": 1 }, { "epoch": 0.08791208791208792, "grad_norm": 11.960691452026367, "learning_rate": 1.0000000000000001e-07, "loss": 2.4032, "step": 2 }, { "epoch": 0.13186813186813187, "grad_norm": 11.234224319458008, "learning_rate": 2.0000000000000002e-07, "loss": 2.3596, "step": 3 }, { "epoch": 0.17582417582417584, "grad_norm": 11.478860855102539, "learning_rate": 3.0000000000000004e-07, "loss": 2.359, "step": 4 }, { "epoch": 0.21978021978021978, "grad_norm": 11.855159759521484, "learning_rate": 4.0000000000000003e-07, "loss": 2.389, "step": 5 }, { "epoch": 0.26373626373626374, "grad_norm": 12.25625228881836, "learning_rate": 5.000000000000001e-07, "loss": 2.3624, "step": 6 }, { "epoch": 0.3076923076923077, "grad_norm": 11.3080415725708, "learning_rate": 6.000000000000001e-07, "loss": 2.3715, "step": 7 }, { "epoch": 0.3516483516483517, "grad_norm": 11.263334274291992, "learning_rate": 7.000000000000001e-07, "loss": 2.3607, "step": 8 }, { "epoch": 0.3956043956043956, "grad_norm": 10.836712837219238, "learning_rate": 8.000000000000001e-07, "loss": 2.3603, "step": 9 }, { "epoch": 0.43956043956043955, "grad_norm": 10.755544662475586, "learning_rate": 9.000000000000001e-07, "loss": 2.3942, "step": 10 }, { "epoch": 0.4835164835164835, "grad_norm": 10.037590026855469, "learning_rate": 1.0000000000000002e-06, "loss": 2.3521, "step": 11 }, { "epoch": 0.5274725274725275, "grad_norm": 9.6759614944458, "learning_rate": 1.1e-06, "loss": 2.3864, "step": 12 }, { "epoch": 0.5714285714285714, "grad_norm": 9.317275047302246, "learning_rate": 1.2000000000000002e-06, "loss": 2.3682, "step": 13 }, { "epoch": 0.6153846153846154, "grad_norm": 8.477849960327148, "learning_rate": 1.3e-06, "loss": 2.3351, "step": 14 }, { "epoch": 0.6593406593406593, "grad_norm": 7.95522928237915, "learning_rate": 1.4000000000000001e-06, "loss": 2.316, "step": 15 }, { "epoch": 0.7032967032967034, "grad_norm": 7.304463863372803, "learning_rate": 1.5e-06, "loss": 2.334, "step": 16 }, { "epoch": 0.7472527472527473, "grad_norm": 6.289999961853027, "learning_rate": 1.6000000000000001e-06, "loss": 2.318, "step": 17 }, { "epoch": 0.7912087912087912, "grad_norm": 6.027122497558594, "learning_rate": 1.7000000000000002e-06, "loss": 2.3296, "step": 18 }, { "epoch": 0.8351648351648352, "grad_norm": 5.818652629852295, "learning_rate": 1.8000000000000001e-06, "loss": 2.3141, "step": 19 }, { "epoch": 0.8791208791208791, "grad_norm": 4.624997138977051, "learning_rate": 1.9000000000000002e-06, "loss": 2.3109, "step": 20 }, { "epoch": 0.9230769230769231, "grad_norm": 3.5909717082977295, "learning_rate": 2.0000000000000003e-06, "loss": 2.2997, "step": 21 }, { "epoch": 0.967032967032967, "grad_norm": 3.4130618572235107, "learning_rate": 2.1000000000000002e-06, "loss": 2.3231, "step": 22 }, { "epoch": 1.0, "grad_norm": 2.9837982654571533, "learning_rate": 2.2e-06, "loss": 2.29, "step": 23 }, { "epoch": 1.043956043956044, "grad_norm": 4.985344409942627, "learning_rate": 2.3000000000000004e-06, "loss": 2.2736, "step": 24 }, { "epoch": 1.0879120879120878, "grad_norm": 3.360139846801758, "learning_rate": 2.4000000000000003e-06, "loss": 2.3187, "step": 25 }, { "epoch": 1.1318681318681318, "grad_norm": 2.8821568489074707, "learning_rate": 2.5e-06, "loss": 2.2779, "step": 26 }, { "epoch": 1.1758241758241759, "grad_norm": 2.9288103580474854, "learning_rate": 2.6e-06, "loss": 2.2737, "step": 27 }, { "epoch": 1.2197802197802199, "grad_norm": 2.41239595413208, "learning_rate": 2.7000000000000004e-06, "loss": 2.2987, "step": 28 }, { "epoch": 1.2637362637362637, "grad_norm": 3.1214678287506104, "learning_rate": 2.8000000000000003e-06, "loss": 2.2666, "step": 29 }, { "epoch": 1.3076923076923077, "grad_norm": 1.981587529182434, "learning_rate": 2.9e-06, "loss": 2.2825, "step": 30 }, { "epoch": 1.3516483516483517, "grad_norm": 1.6952852010726929, "learning_rate": 3e-06, "loss": 2.2697, "step": 31 }, { "epoch": 1.3956043956043955, "grad_norm": 1.8179521560668945, "learning_rate": 3.1000000000000004e-06, "loss": 2.2739, "step": 32 }, { "epoch": 1.4395604395604396, "grad_norm": 1.9612782001495361, "learning_rate": 3.2000000000000003e-06, "loss": 2.3062, "step": 33 }, { "epoch": 1.4835164835164836, "grad_norm": 1.7110782861709595, "learning_rate": 3.3000000000000006e-06, "loss": 2.2654, "step": 34 }, { "epoch": 1.5274725274725274, "grad_norm": 1.4163310527801514, "learning_rate": 3.4000000000000005e-06, "loss": 2.3029, "step": 35 }, { "epoch": 1.5714285714285714, "grad_norm": 1.357823371887207, "learning_rate": 3.5e-06, "loss": 2.2852, "step": 36 }, { "epoch": 1.6153846153846154, "grad_norm": 1.281903862953186, "learning_rate": 3.6000000000000003e-06, "loss": 2.2589, "step": 37 }, { "epoch": 1.6593406593406592, "grad_norm": 1.156381607055664, "learning_rate": 3.7e-06, "loss": 2.2403, "step": 38 }, { "epoch": 1.7032967032967035, "grad_norm": 1.1381186246871948, "learning_rate": 3.8000000000000005e-06, "loss": 2.2635, "step": 39 }, { "epoch": 1.7472527472527473, "grad_norm": 1.0350444316864014, "learning_rate": 3.900000000000001e-06, "loss": 2.2564, "step": 40 }, { "epoch": 1.791208791208791, "grad_norm": 0.9795923233032227, "learning_rate": 4.000000000000001e-06, "loss": 2.2672, "step": 41 }, { "epoch": 1.8351648351648353, "grad_norm": 0.9845089912414551, "learning_rate": 4.1e-06, "loss": 2.2522, "step": 42 }, { "epoch": 1.879120879120879, "grad_norm": 0.9356728196144104, "learning_rate": 4.2000000000000004e-06, "loss": 2.2596, "step": 43 }, { "epoch": 1.9230769230769231, "grad_norm": 0.9409797787666321, "learning_rate": 4.3e-06, "loss": 2.2585, "step": 44 }, { "epoch": 1.9670329670329672, "grad_norm": 0.8999571204185486, "learning_rate": 4.4e-06, "loss": 2.2821, "step": 45 }, { "epoch": 2.0, "grad_norm": 0.8957112431526184, "learning_rate": 4.5e-06, "loss": 2.2528, "step": 46 }, { "epoch": 2.043956043956044, "grad_norm": 0.9083137512207031, "learning_rate": 4.600000000000001e-06, "loss": 2.2385, "step": 47 }, { "epoch": 2.087912087912088, "grad_norm": 0.8808560371398926, "learning_rate": 4.7e-06, "loss": 2.2904, "step": 48 }, { "epoch": 2.131868131868132, "grad_norm": 1.0237456560134888, "learning_rate": 4.800000000000001e-06, "loss": 2.2509, "step": 49 }, { "epoch": 2.1758241758241756, "grad_norm": 0.9366745948791504, "learning_rate": 4.9000000000000005e-06, "loss": 2.2487, "step": 50 }, { "epoch": 2.21978021978022, "grad_norm": 0.9902853965759277, "learning_rate": 5e-06, "loss": 2.2746, "step": 51 }, { "epoch": 2.2637362637362637, "grad_norm": 1.0095207691192627, "learning_rate": 4.991461232516675e-06, "loss": 2.2422, "step": 52 }, { "epoch": 2.3076923076923075, "grad_norm": 0.8401025533676147, "learning_rate": 4.965903258506806e-06, "loss": 2.26, "step": 53 }, { "epoch": 2.3516483516483517, "grad_norm": 0.7961649298667908, "learning_rate": 4.923500664848327e-06, "loss": 2.2479, "step": 54 }, { "epoch": 2.3956043956043955, "grad_norm": 0.8089596629142761, "learning_rate": 4.864543104251587e-06, "loss": 2.2532, "step": 55 }, { "epoch": 2.4395604395604398, "grad_norm": 0.7256011962890625, "learning_rate": 4.789433316637644e-06, "loss": 2.2863, "step": 56 }, { "epoch": 2.4835164835164836, "grad_norm": 0.7093259692192078, "learning_rate": 4.698684378016223e-06, "loss": 2.2468, "step": 57 }, { "epoch": 2.5274725274725274, "grad_norm": 0.7478638887405396, "learning_rate": 4.592916195656322e-06, "loss": 2.2859, "step": 58 }, { "epoch": 2.571428571428571, "grad_norm": 0.6463800668716431, "learning_rate": 4.472851273490985e-06, "loss": 2.2686, "step": 59 }, { "epoch": 2.6153846153846154, "grad_norm": 0.8315393328666687, "learning_rate": 4.33930977668283e-06, "loss": 2.2441, "step": 60 }, { "epoch": 2.659340659340659, "grad_norm": 0.693785548210144, "learning_rate": 4.1932039290643534e-06, "loss": 2.2257, "step": 61 }, { "epoch": 2.7032967032967035, "grad_norm": 0.7205367684364319, "learning_rate": 4.0355317817241705e-06, "loss": 2.2502, "step": 62 }, { "epoch": 2.7472527472527473, "grad_norm": 0.7036604881286621, "learning_rate": 3.8673703953060685e-06, "loss": 2.2452, "step": 63 }, { "epoch": 2.791208791208791, "grad_norm": 0.5922514200210571, "learning_rate": 3.6898684825926845e-06, "loss": 2.2555, "step": 64 }, { "epoch": 2.8351648351648353, "grad_norm": 0.6727115511894226, "learning_rate": 3.5042385616324243e-06, "loss": 2.2399, "step": 65 }, { "epoch": 2.879120879120879, "grad_norm": 0.6253023147583008, "learning_rate": 3.3117486730117092e-06, "loss": 2.2496, "step": 66 }, { "epoch": 2.9230769230769234, "grad_norm": 0.6109470129013062, "learning_rate": 3.1137137178519983e-06, "loss": 2.2501, "step": 67 }, { "epoch": 2.967032967032967, "grad_norm": 0.5974487066268921, "learning_rate": 2.911486475701835e-06, "loss": 2.2736, "step": 68 }, { "epoch": 3.0, "grad_norm": 0.6350704431533813, "learning_rate": 2.7064483636808314e-06, "loss": 2.2437, "step": 69 }, { "epoch": 3.043956043956044, "grad_norm": 0.5624422430992126, "learning_rate": 2.5e-06, "loss": 2.2297, "step": 70 }, { "epoch": 3.087912087912088, "grad_norm": 0.563450813293457, "learning_rate": 2.2935516363191695e-06, "loss": 2.2828, "step": 71 }, { "epoch": 3.131868131868132, "grad_norm": 0.6647318601608276, "learning_rate": 2.088513524298165e-06, "loss": 2.2435, "step": 72 }, { "epoch": 3.1758241758241756, "grad_norm": 0.5552098751068115, "learning_rate": 1.8862862821480023e-06, "loss": 2.242, "step": 73 }, { "epoch": 3.21978021978022, "grad_norm": 0.5633946061134338, "learning_rate": 1.6882513269882916e-06, "loss": 2.2683, "step": 74 }, { "epoch": 3.2637362637362637, "grad_norm": 0.5919342041015625, "learning_rate": 1.495761438367577e-06, "loss": 2.235, "step": 75 }, { "epoch": 3.3076923076923075, "grad_norm": 0.5602885484695435, "learning_rate": 1.3101315174073162e-06, "loss": 2.2535, "step": 76 }, { "epoch": 3.3516483516483517, "grad_norm": 0.5972963571548462, "learning_rate": 1.1326296046939334e-06, "loss": 2.2415, "step": 77 }, { "epoch": 3.3956043956043955, "grad_norm": 0.5750962495803833, "learning_rate": 9.644682182758305e-07, "loss": 2.247, "step": 78 }, { "epoch": 3.4395604395604398, "grad_norm": 0.6113232970237732, "learning_rate": 8.067960709356479e-07, "loss": 2.2803, "step": 79 }, { "epoch": 3.4835164835164836, "grad_norm": 0.5419983863830566, "learning_rate": 6.60690223317171e-07, "loss": 2.2412, "step": 80 }, { "epoch": 3.5274725274725274, "grad_norm": 0.5377610325813293, "learning_rate": 5.271487265090163e-07, "loss": 2.2807, "step": 81 }, { "epoch": 3.571428571428571, "grad_norm": 0.5361002683639526, "learning_rate": 4.070838043436787e-07, "loss": 2.2632, "step": 82 }, { "epoch": 3.6153846153846154, "grad_norm": 0.6202074289321899, "learning_rate": 3.0131562198377763e-07, "loss": 2.2389, "step": 83 }, { "epoch": 3.659340659340659, "grad_norm": 0.631415843963623, "learning_rate": 2.1056668336235624e-07, "loss": 2.2203, "step": 84 }, { "epoch": 3.7032967032967035, "grad_norm": 0.598537802696228, "learning_rate": 1.3545689574841341e-07, "loss": 2.2453, "step": 85 }, { "epoch": 3.7472527472527473, "grad_norm": 0.5731582045555115, "learning_rate": 7.649933515167407e-08, "loss": 2.2406, "step": 86 }, { "epoch": 3.791208791208791, "grad_norm": 0.6371837854385376, "learning_rate": 3.4096741493194196e-08, "loss": 2.2511, "step": 87 }, { "epoch": 3.8351648351648353, "grad_norm": 0.6559035181999207, "learning_rate": 8.538767483325384e-09, "loss": 2.2357, "step": 88 } ], "logging_steps": 1, "max_steps": 88, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 11, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5419157285835375e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }