song9
/

CC-KuLLM3-LoRA

+{
+  "best_metric": 0.6711751222610474,
+  "best_model_checkpoint": "./checkpoints/KuLLM3_lora_clm_with_added_tokens_no_shorts-2batch-2epoch_1109/checkpoint-80000",
+  "epoch": 1.9412292834437408,
+  "eval_steps": 8000,
+  "global_step": 80000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01941229283443741,
+      "grad_norm": 179.34385681152344,
+      "learning_rate": 1.940805434255216e-06,
+      "loss": 1.3039,
+      "step": 800
+    },
+    {
+      "epoch": 0.03882458566887482,
+      "grad_norm": 72.6114730834961,
+      "learning_rate": 3.881610868510432e-06,
+      "loss": 0.9632,
+      "step": 1600
+    },
+    {
+      "epoch": 0.058236878503312224,
+      "grad_norm": 76.58552551269531,
+      "learning_rate": 5.822416302765648e-06,
+      "loss": 0.8919,
+      "step": 2400
+    },
+    {
+      "epoch": 0.07764917133774964,
+      "grad_norm": 121.52155303955078,
+      "learning_rate": 7.763221737020863e-06,
+      "loss": 0.868,
+      "step": 3200
+    },
+    {
+      "epoch": 0.09706146417218704,
+      "grad_norm": 61.98054885864258,
+      "learning_rate": 9.70402717127608e-06,
+      "loss": 0.8474,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11647375700662445,
+      "grad_norm": 48.987022399902344,
+      "learning_rate": 9.913409961685825e-06,
+      "loss": 0.8259,
+      "step": 4800
+    },
+    {
+      "epoch": 0.13588604984106184,
+      "grad_norm": 52.976566314697266,
+      "learning_rate": 9.81123882503193e-06,
+      "loss": 0.8224,
+      "step": 5600
+    },
+    {
+      "epoch": 0.15529834267549927,
+      "grad_norm": 74.86931610107422,
+      "learning_rate": 9.709067688378035e-06,
+      "loss": 0.8002,
+      "step": 6400
+    },
+    {
+      "epoch": 0.17471063550993668,
+      "grad_norm": 58.481258392333984,
+      "learning_rate": 9.606896551724138e-06,
+      "loss": 0.8021,
+      "step": 7200
+    },
+    {
+      "epoch": 0.19412292834437408,
+      "grad_norm": 82.62439727783203,
+      "learning_rate": 9.504725415070245e-06,
+      "loss": 0.8004,
+      "step": 8000
+    },
+    {
+      "epoch": 0.19412292834437408,
+      "eval_loss": 0.7944459915161133,
+      "eval_runtime": 1619.7602,
+      "eval_samples_per_second": 6.337,
+      "eval_steps_per_second": 6.337,
+      "step": 8000
+    },
+    {
+      "epoch": 0.2135352211788115,
+      "grad_norm": 58.08147430419922,
+      "learning_rate": 9.402554278416348e-06,
+      "loss": 0.7754,
+      "step": 8800
+    },
+    {
+      "epoch": 0.2329475140132489,
+      "grad_norm": 85.25300598144531,
+      "learning_rate": 9.300383141762453e-06,
+      "loss": 0.7728,
+      "step": 9600
+    },
+    {
+      "epoch": 0.2523598068476863,
+      "grad_norm": 139.68646240234375,
+      "learning_rate": 9.198212005108558e-06,
+      "loss": 0.768,
+      "step": 10400
+    },
+    {
+      "epoch": 0.2717720996821237,
+      "grad_norm": 62.37124252319336,
+      "learning_rate": 9.096040868454661e-06,
+      "loss": 0.7773,
+      "step": 11200
+    },
+    {
+      "epoch": 0.2911843925165611,
+      "grad_norm": 32.99665069580078,
+      "learning_rate": 8.993869731800768e-06,
+      "loss": 0.7657,
+      "step": 12000
+    },
+    {
+      "epoch": 0.31059668535099855,
+      "grad_norm": 117.61981964111328,
+      "learning_rate": 8.891698595146871e-06,
+      "loss": 0.7613,
+      "step": 12800
+    },
+    {
+      "epoch": 0.33000897818543595,
+      "grad_norm": 43.9254035949707,
+      "learning_rate": 8.789527458492976e-06,
+      "loss": 0.7573,
+      "step": 13600
+    },
+    {
+      "epoch": 0.34942127101987336,
+      "grad_norm": 31.05263328552246,
+      "learning_rate": 8.687356321839081e-06,
+      "loss": 0.7453,
+      "step": 14400
+    },
+    {
+      "epoch": 0.36883356385431076,
+      "grad_norm": 57.3399543762207,
+      "learning_rate": 8.585185185185186e-06,
+      "loss": 0.743,
+      "step": 15200
+    },
+    {
+      "epoch": 0.38824585668874817,
+      "grad_norm": 29.631826400756836,
+      "learning_rate": 8.483014048531291e-06,
+      "loss": 0.7445,
+      "step": 16000
+    },
+    {
+      "epoch": 0.38824585668874817,
+      "eval_loss": 0.750423789024353,
+      "eval_runtime": 1620.0402,
+      "eval_samples_per_second": 6.336,
+      "eval_steps_per_second": 6.336,
+      "step": 16000
+    },
+    {
+      "epoch": 0.4076581495231856,
+      "grad_norm": 29.647502899169922,
+      "learning_rate": 8.380842911877395e-06,
+      "loss": 0.7331,
+      "step": 16800
+    },
+    {
+      "epoch": 0.427070442357623,
+      "grad_norm": 19.794334411621094,
+      "learning_rate": 8.2786717752235e-06,
+      "loss": 0.7329,
+      "step": 17600
+    },
+    {
+      "epoch": 0.4464827351920604,
+      "grad_norm": 114.88660430908203,
+      "learning_rate": 8.176500638569605e-06,
+      "loss": 0.7334,
+      "step": 18400
+    },
+    {
+      "epoch": 0.4658950280264978,
+      "grad_norm": 29.109107971191406,
+      "learning_rate": 8.07432950191571e-06,
+      "loss": 0.7294,
+      "step": 19200
+    },
+    {
+      "epoch": 0.4853073208609352,
+      "grad_norm": 25.62792205810547,
+      "learning_rate": 7.972158365261815e-06,
+      "loss": 0.7287,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5047196136953725,
+      "grad_norm": 40.84642028808594,
+      "learning_rate": 7.86998722860792e-06,
+      "loss": 0.7157,
+      "step": 20800
+    },
+    {
+      "epoch": 0.52413190652981,
+      "grad_norm": 46.767059326171875,
+      "learning_rate": 7.767816091954023e-06,
+      "loss": 0.7284,
+      "step": 21600
+    },
+    {
+      "epoch": 0.5435441993642474,
+      "grad_norm": 24.68915557861328,
+      "learning_rate": 7.66564495530013e-06,
+      "loss": 0.7314,
+      "step": 22400
+    },
+    {
+      "epoch": 0.5629564921986848,
+      "grad_norm": 31.020458221435547,
+      "learning_rate": 7.563473818646233e-06,
+      "loss": 0.7157,
+      "step": 23200
+    },
+    {
+      "epoch": 0.5823687850331222,
+      "grad_norm": 29.391735076904297,
+      "learning_rate": 7.461302681992337e-06,
+      "loss": 0.7058,
+      "step": 24000
+    },
+    {
+      "epoch": 0.5823687850331222,
+      "eval_loss": 0.7259429693222046,
+      "eval_runtime": 1620.5504,
+      "eval_samples_per_second": 6.334,
+      "eval_steps_per_second": 6.334,
+      "step": 24000
+    },
+    {
+      "epoch": 0.6017810778675596,
+      "grad_norm": 45.48763656616211,
+      "learning_rate": 7.359131545338443e-06,
+      "loss": 0.7117,
+      "step": 24800
+    },
+    {
+      "epoch": 0.6211933707019971,
+      "grad_norm": 24.26145362854004,
+      "learning_rate": 7.256960408684547e-06,
+      "loss": 0.701,
+      "step": 25600
+    },
+    {
+      "epoch": 0.6406056635364344,
+      "grad_norm": 20.341548919677734,
+      "learning_rate": 7.154789272030652e-06,
+      "loss": 0.7227,
+      "step": 26400
+    },
+    {
+      "epoch": 0.6600179563708719,
+      "grad_norm": 55.4034423828125,
+      "learning_rate": 7.052618135376756e-06,
+      "loss": 0.7125,
+      "step": 27200
+    },
+    {
+      "epoch": 0.6794302492053093,
+      "grad_norm": 27.05145263671875,
+      "learning_rate": 6.950446998722862e-06,
+      "loss": 0.7053,
+      "step": 28000
+    },
+    {
+      "epoch": 0.6988425420397467,
+      "grad_norm": 30.649011611938477,
+      "learning_rate": 6.848275862068966e-06,
+      "loss": 0.6974,
+      "step": 28800
+    },
+    {
+      "epoch": 0.7182548348741841,
+      "grad_norm": 54.12739181518555,
+      "learning_rate": 6.746104725415071e-06,
+      "loss": 0.7018,
+      "step": 29600
+    },
+    {
+      "epoch": 0.7376671277086215,
+      "grad_norm": 22.7417049407959,
+      "learning_rate": 6.6439335887611755e-06,
+      "loss": 0.6966,
+      "step": 30400
+    },
+    {
+      "epoch": 0.7570794205430589,
+      "grad_norm": 50.62528991699219,
+      "learning_rate": 6.54176245210728e-06,
+      "loss": 0.6963,
+      "step": 31200
+    },
+    {
+      "epoch": 0.7764917133774963,
+      "grad_norm": 107.59382629394531,
+      "learning_rate": 6.4395913154533855e-06,
+      "loss": 0.7039,
+      "step": 32000
+    },
+    {
+      "epoch": 0.7764917133774963,
+      "eval_loss": 0.7099697589874268,
+      "eval_runtime": 1619.6536,
+      "eval_samples_per_second": 6.338,
+      "eval_steps_per_second": 6.338,
+      "step": 32000
+    },
+    {
+      "epoch": 0.7959040062119337,
+      "grad_norm": 24.90111541748047,
+      "learning_rate": 6.33742017879949e-06,
+      "loss": 0.7021,
+      "step": 32800
+    },
+    {
+      "epoch": 0.8153162990463712,
+      "grad_norm": 42.44001007080078,
+      "learning_rate": 6.235249042145595e-06,
+      "loss": 0.6929,
+      "step": 33600
+    },
+    {
+      "epoch": 0.8347285918808085,
+      "grad_norm": 29.183473587036133,
+      "learning_rate": 6.133077905491699e-06,
+      "loss": 0.6955,
+      "step": 34400
+    },
+    {
+      "epoch": 0.854140884715246,
+      "grad_norm": 38.95024490356445,
+      "learning_rate": 6.030906768837805e-06,
+      "loss": 0.6928,
+      "step": 35200
+    },
+    {
+      "epoch": 0.8735531775496833,
+      "grad_norm": 31.284547805786133,
+      "learning_rate": 5.928735632183909e-06,
+      "loss": 0.6919,
+      "step": 36000
+    },
+    {
+      "epoch": 0.8929654703841208,
+      "grad_norm": 41.60378646850586,
+      "learning_rate": 5.826564495530014e-06,
+      "loss": 0.6955,
+      "step": 36800
+    },
+    {
+      "epoch": 0.9123777632185581,
+      "grad_norm": 47.39144515991211,
+      "learning_rate": 5.724393358876118e-06,
+      "loss": 0.7038,
+      "step": 37600
+    },
+    {
+      "epoch": 0.9317900560529956,
+      "grad_norm": 45.3167610168457,
+      "learning_rate": 5.622222222222222e-06,
+      "loss": 0.6838,
+      "step": 38400
+    },
+    {
+      "epoch": 0.9512023488874329,
+      "grad_norm": 17.134929656982422,
+      "learning_rate": 5.520051085568328e-06,
+      "loss": 0.6933,
+      "step": 39200
+    },
+    {
+      "epoch": 0.9706146417218704,
+      "grad_norm": 32.11737060546875,
+      "learning_rate": 5.417879948914432e-06,
+      "loss": 0.6861,
+      "step": 40000
+    },
+    {
+      "epoch": 0.9706146417218704,
+      "eval_loss": 0.698592483997345,
+      "eval_runtime": 1620.335,
+      "eval_samples_per_second": 6.335,
+      "eval_steps_per_second": 6.335,
+      "step": 40000
+    },
+    {
+      "epoch": 0.9900269345563077,
+      "grad_norm": 36.55326843261719,
+      "learning_rate": 5.315708812260537e-06,
+      "loss": 0.6859,
+      "step": 40800
+    },
+    {
+      "epoch": 1.009439227390745,
+      "grad_norm": 23.36037254333496,
+      "learning_rate": 5.213537675606641e-06,
+      "loss": 0.6842,
+      "step": 41600
+    },
+    {
+      "epoch": 1.0288515202251827,
+      "grad_norm": 21.21329116821289,
+      "learning_rate": 5.111366538952746e-06,
+      "loss": 0.6687,
+      "step": 42400
+    },
+    {
+      "epoch": 1.04826381305962,
+      "grad_norm": 26.16172981262207,
+      "learning_rate": 5.009195402298851e-06,
+      "loss": 0.6859,
+      "step": 43200
+    },
+    {
+      "epoch": 1.0676761058940574,
+      "grad_norm": 34.661190032958984,
+      "learning_rate": 4.9070242656449555e-06,
+      "loss": 0.6667,
+      "step": 44000
+    },
+    {
+      "epoch": 1.0870883987284947,
+      "grad_norm": 47.726898193359375,
+      "learning_rate": 4.8048531289910605e-06,
+      "loss": 0.6628,
+      "step": 44800
+    },
+    {
+      "epoch": 1.1065006915629323,
+      "grad_norm": 26.898998260498047,
+      "learning_rate": 4.7026819923371655e-06,
+      "loss": 0.6715,
+      "step": 45600
+    },
+    {
+      "epoch": 1.1259129843973696,
+      "grad_norm": 31.409574508666992,
+      "learning_rate": 4.60051085568327e-06,
+      "loss": 0.6725,
+      "step": 46400
+    },
+    {
+      "epoch": 1.145325277231807,
+      "grad_norm": 29.432598114013672,
+      "learning_rate": 4.498339719029375e-06,
+      "loss": 0.6589,
+      "step": 47200
+    },
+    {
+      "epoch": 1.1647375700662446,
+      "grad_norm": 33.80415725708008,
+      "learning_rate": 4.396168582375479e-06,
+      "loss": 0.667,
+      "step": 48000
+    },
+    {
+      "epoch": 1.1647375700662446,
+      "eval_loss": 0.6902133226394653,
+      "eval_runtime": 1620.636,
+      "eval_samples_per_second": 6.334,
+      "eval_steps_per_second": 6.334,
+      "step": 48000
+    },
+    {
+      "epoch": 1.184149862900682,
+      "grad_norm": 34.50639724731445,
+      "learning_rate": 4.293997445721584e-06,
+      "loss": 0.658,
+      "step": 48800
+    },
+    {
+      "epoch": 1.2035621557351193,
+      "grad_norm": 18.016963958740234,
+      "learning_rate": 4.191826309067689e-06,
+      "loss": 0.6753,
+      "step": 49600
+    },
+    {
+      "epoch": 1.2229744485695566,
+      "grad_norm": 44.54015350341797,
+      "learning_rate": 4.089655172413794e-06,
+      "loss": 0.6644,
+      "step": 50400
+    },
+    {
+      "epoch": 1.242386741403994,
+      "grad_norm": 33.76991653442383,
+      "learning_rate": 3.987484035759898e-06,
+      "loss": 0.6583,
+      "step": 51200
+    },
+    {
+      "epoch": 1.2617990342384315,
+      "grad_norm": 36.015037536621094,
+      "learning_rate": 3.885312899106003e-06,
+      "loss": 0.6653,
+      "step": 52000
+    },
+    {
+      "epoch": 1.2812113270728689,
+      "grad_norm": 31.974958419799805,
+      "learning_rate": 3.7831417624521076e-06,
+      "loss": 0.6666,
+      "step": 52800
+    },
+    {
+      "epoch": 1.3006236199073062,
+      "grad_norm": 23.773361206054688,
+      "learning_rate": 3.6809706257982126e-06,
+      "loss": 0.6696,
+      "step": 53600
+    },
+    {
+      "epoch": 1.3200359127417438,
+      "grad_norm": 23.204721450805664,
+      "learning_rate": 3.578799489144317e-06,
+      "loss": 0.6641,
+      "step": 54400
+    },
+    {
+      "epoch": 1.3394482055761812,
+      "grad_norm": 35.860713958740234,
+      "learning_rate": 3.4766283524904217e-06,
+      "loss": 0.6572,
+      "step": 55200
+    },
+    {
+      "epoch": 1.3588604984106185,
+      "grad_norm": 31.097198486328125,
+      "learning_rate": 3.3744572158365263e-06,
+      "loss": 0.6562,
+      "step": 56000
+    },
+    {
+      "epoch": 1.3588604984106185,
+      "eval_loss": 0.6831924915313721,
+      "eval_runtime": 1619.9018,
+      "eval_samples_per_second": 6.337,
+      "eval_steps_per_second": 6.337,
+      "step": 56000
+    },
+    {
+      "epoch": 1.3782727912450559,
+      "grad_norm": 24.997142791748047,
+      "learning_rate": 3.272286079182631e-06,
+      "loss": 0.6618,
+      "step": 56800
+    },
+    {
+      "epoch": 1.3976850840794932,
+      "grad_norm": 30.415740966796875,
+      "learning_rate": 3.170114942528736e-06,
+      "loss": 0.6653,
+      "step": 57600
+    },
+    {
+      "epoch": 1.4170973769139308,
+      "grad_norm": 49.423728942871094,
+      "learning_rate": 3.0679438058748405e-06,
+      "loss": 0.6456,
+      "step": 58400
+    },
+    {
+      "epoch": 1.4365096697483681,
+      "grad_norm": 33.79461669921875,
+      "learning_rate": 2.9657726692209455e-06,
+      "loss": 0.6619,
+      "step": 59200
+    },
+    {
+      "epoch": 1.4559219625828055,
+      "grad_norm": 41.93768310546875,
+      "learning_rate": 2.86360153256705e-06,
+      "loss": 0.6592,
+      "step": 60000
+    },
+    {
+      "epoch": 1.475334255417243,
+      "grad_norm": 36.79194641113281,
+      "learning_rate": 2.761430395913155e-06,
+      "loss": 0.6457,
+      "step": 60800
+    },
+    {
+      "epoch": 1.4947465482516804,
+      "grad_norm": 39.22713088989258,
+      "learning_rate": 2.6592592592592592e-06,
+      "loss": 0.6484,
+      "step": 61600
+    },
+    {
+      "epoch": 1.5141588410861178,
+      "grad_norm": 57.07029724121094,
+      "learning_rate": 2.557088122605364e-06,
+      "loss": 0.6599,
+      "step": 62400
+    },
+    {
+      "epoch": 1.5335711339205553,
+      "grad_norm": 34.55560302734375,
+      "learning_rate": 2.4549169859514692e-06,
+      "loss": 0.6526,
+      "step": 63200
+    },
+    {
+      "epoch": 1.5529834267549925,
+      "grad_norm": 47.0441780090332,
+      "learning_rate": 2.3527458492975734e-06,
+      "loss": 0.6605,
+      "step": 64000
+    },
+    {
+      "epoch": 1.5529834267549925,
+      "eval_loss": 0.6782345175743103,
+      "eval_runtime": 1619.8932,
+      "eval_samples_per_second": 6.337,
+      "eval_steps_per_second": 6.337,
+      "step": 64000
+    },
+    {
+      "epoch": 1.57239571958943,
+      "grad_norm": 36.22556686401367,
+      "learning_rate": 2.2505747126436784e-06,
+      "loss": 0.6685,
+      "step": 64800
+    },
+    {
+      "epoch": 1.5918080124238674,
+      "grad_norm": 51.00742721557617,
+      "learning_rate": 2.148403575989783e-06,
+      "loss": 0.6507,
+      "step": 65600
+    },
+    {
+      "epoch": 1.6112203052583047,
+      "grad_norm": 36.90612030029297,
+      "learning_rate": 2.046232439335888e-06,
+      "loss": 0.6397,
+      "step": 66400
+    },
+    {
+      "epoch": 1.6306325980927423,
+      "grad_norm": 46.38152313232422,
+      "learning_rate": 1.9440613026819926e-06,
+      "loss": 0.65,
+      "step": 67200
+    },
+    {
+      "epoch": 1.6500448909271797,
+      "grad_norm": 41.90031051635742,
+      "learning_rate": 1.8418901660280971e-06,
+      "loss": 0.6554,
+      "step": 68000
+    },
+    {
+      "epoch": 1.669457183761617,
+      "grad_norm": 30.752347946166992,
+      "learning_rate": 1.739719029374202e-06,
+      "loss": 0.6513,
+      "step": 68800
+    },
+    {
+      "epoch": 1.6888694765960546,
+      "grad_norm": 34.301795959472656,
+      "learning_rate": 1.6375478927203067e-06,
+      "loss": 0.6467,
+      "step": 69600
+    },
+    {
+      "epoch": 1.7082817694304917,
+      "grad_norm": 46.13114929199219,
+      "learning_rate": 1.5353767560664113e-06,
+      "loss": 0.6582,
+      "step": 70400
+    },
+    {
+      "epoch": 1.7276940622649293,
+      "grad_norm": 30.393938064575195,
+      "learning_rate": 1.4332056194125161e-06,
+      "loss": 0.6535,
+      "step": 71200
+    },
+    {
+      "epoch": 1.7471063550993666,
+      "grad_norm": 37.74268341064453,
+      "learning_rate": 1.331034482758621e-06,
+      "loss": 0.6474,
+      "step": 72000
+    },
+    {
+      "epoch": 1.7471063550993666,
+      "eval_loss": 0.6739329695701599,
+      "eval_runtime": 1620.1308,
+      "eval_samples_per_second": 6.336,
+      "eval_steps_per_second": 6.336,
+      "step": 72000
+    },
+    {
+      "epoch": 1.766518647933804,
+      "grad_norm": 67.96341705322266,
+      "learning_rate": 1.2288633461047255e-06,
+      "loss": 0.6557,
+      "step": 72800
+    },
+    {
+      "epoch": 1.7859309407682415,
+      "grad_norm": 26.30314064025879,
+      "learning_rate": 1.1266922094508303e-06,
+      "loss": 0.6527,
+      "step": 73600
+    },
+    {
+      "epoch": 1.805343233602679,
+      "grad_norm": 46.09438705444336,
+      "learning_rate": 1.024521072796935e-06,
+      "loss": 0.6589,
+      "step": 74400
+    },
+    {
+      "epoch": 1.8247555264371162,
+      "grad_norm": 28.36471176147461,
+      "learning_rate": 9.223499361430396e-07,
+      "loss": 0.6506,
+      "step": 75200
+    },
+    {
+      "epoch": 1.8441678192715538,
+      "grad_norm": 50.00242614746094,
+      "learning_rate": 8.201787994891444e-07,
+      "loss": 0.6441,
+      "step": 76000
+    },
+    {
+      "epoch": 1.8635801121059912,
+      "grad_norm": 27.380474090576172,
+      "learning_rate": 7.18007662835249e-07,
+      "loss": 0.6662,
+      "step": 76800
+    },
+    {
+      "epoch": 1.8829924049404285,
+      "grad_norm": 43.15554428100586,
+      "learning_rate": 6.158365261813538e-07,
+      "loss": 0.6374,
+      "step": 77600
+    },
+    {
+      "epoch": 1.902404697774866,
+      "grad_norm": 41.739227294921875,
+      "learning_rate": 5.136653895274585e-07,
+      "loss": 0.6569,
+      "step": 78400
+    },
+    {
+      "epoch": 1.9218169906093032,
+      "grad_norm": 38.57143783569336,
+      "learning_rate": 4.1149425287356324e-07,
+      "loss": 0.6534,
+      "step": 79200
+    },
+    {
+      "epoch": 1.9412292834437408,
+      "grad_norm": 52.96192932128906,
+      "learning_rate": 3.09323116219668e-07,
+      "loss": 0.6537,
+      "step": 80000
+    },
+    {
+      "epoch": 1.9412292834437408,
+      "eval_loss": 0.6711751222610474,
+      "eval_runtime": 1619.9483,
+      "eval_samples_per_second": 6.337,
+      "eval_steps_per_second": 6.337,
+      "step": 80000
+    }
+  ],
+  "logging_steps": 800,
+  "max_steps": 82422,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 8000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0083501150921034e+19,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}