Boffl
/

BullingerLM-llama3.1-8B-instruct-add

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997049277072882,
+  "eval_steps": 500,
+  "global_step": 847,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011802891708468575,
+      "grad_norm": 2.2601828575134277,
+      "learning_rate": 5.294117647058824e-06,
+      "loss": 3.4593,
+      "step": 10
+    },
+    {
+      "epoch": 0.02360578341693715,
+      "grad_norm": 2.753499746322632,
+      "learning_rate": 1e-05,
+      "loss": 2.9512,
+      "step": 20
+    },
+    {
+      "epoch": 0.03540867512540572,
+      "grad_norm": 1.42862069606781,
+      "learning_rate": 1.588235294117647e-05,
+      "loss": 2.2602,
+      "step": 30
+    },
+    {
+      "epoch": 0.0472115668338743,
+      "grad_norm": 2.2324745655059814,
+      "learning_rate": 2.1764705882352943e-05,
+      "loss": 1.9542,
+      "step": 40
+    },
+    {
+      "epoch": 0.05901445854234287,
+      "grad_norm": 0.9986198544502258,
+      "learning_rate": 2.7647058823529416e-05,
+      "loss": 1.7491,
+      "step": 50
+    },
+    {
+      "epoch": 0.07081735025081144,
+      "grad_norm": 1.7015349864959717,
+      "learning_rate": 3.352941176470588e-05,
+      "loss": 1.7127,
+      "step": 60
+    },
+    {
+      "epoch": 0.08262024195928003,
+      "grad_norm": 1.410741925239563,
+      "learning_rate": 3.9411764705882356e-05,
+      "loss": 1.5587,
+      "step": 70
+    },
+    {
+      "epoch": 0.0944231336677486,
+      "grad_norm": 1.4795621633529663,
+      "learning_rate": 4.5294117647058826e-05,
+      "loss": 1.6199,
+      "step": 80
+    },
+    {
+      "epoch": 0.10622602537621717,
+      "grad_norm": 1.2762919664382935,
+      "learning_rate": 4.999915012051437e-05,
+      "loss": 1.5481,
+      "step": 90
+    },
+    {
+      "epoch": 0.11802891708468574,
+      "grad_norm": 1.1632708311080933,
+      "learning_rate": 4.996941040535653e-05,
+      "loss": 1.4648,
+      "step": 100
+    },
+    {
+      "epoch": 0.1298318087931543,
+      "grad_norm": 1.950888752937317,
+      "learning_rate": 4.989723448187131e-05,
+      "loss": 1.5273,
+      "step": 110
+    },
+    {
+      "epoch": 0.14163470050162288,
+      "grad_norm": 1.0861647129058838,
+      "learning_rate": 4.978274501505061e-05,
+      "loss": 1.5472,
+      "step": 120
+    },
+    {
+      "epoch": 0.15343759221009148,
+      "grad_norm": 1.347829818725586,
+      "learning_rate": 4.962613658293158e-05,
+      "loss": 1.5428,
+      "step": 130
+    },
+    {
+      "epoch": 0.16524048391856005,
+      "grad_norm": 1.014880657196045,
+      "learning_rate": 4.942767534590581e-05,
+      "loss": 1.3644,
+      "step": 140
+    },
+    {
+      "epoch": 0.17704337562702863,
+      "grad_norm": 1.7476609945297241,
+      "learning_rate": 4.918769859437232e-05,
+      "loss": 1.4653,
+      "step": 150
+    },
+    {
+      "epoch": 0.1888462673354972,
+      "grad_norm": 1.0597649812698364,
+      "learning_rate": 4.890661417550319e-05,
+      "loss": 1.5003,
+      "step": 160
+    },
+    {
+      "epoch": 0.20064915904396577,
+      "grad_norm": 1.098489761352539,
+      "learning_rate": 4.8584899800095864e-05,
+      "loss": 1.462,
+      "step": 170
+    },
+    {
+      "epoch": 0.21245205075243434,
+      "grad_norm": 0.8845277428627014,
+      "learning_rate": 4.822310223069039e-05,
+      "loss": 1.3805,
+      "step": 180
+    },
+    {
+      "epoch": 0.2242549424609029,
+      "grad_norm": 1.181378960609436,
+      "learning_rate": 4.782183635233124e-05,
+      "loss": 1.4678,
+      "step": 190
+    },
+    {
+      "epoch": 0.23605783416937148,
+      "grad_norm": 1.3490713834762573,
+      "learning_rate": 4.738178412755306e-05,
+      "loss": 1.378,
+      "step": 200
+    },
+    {
+      "epoch": 0.24786072587784008,
+      "grad_norm": 1.1515178680419922,
+      "learning_rate": 4.690369343736636e-05,
+      "loss": 1.4216,
+      "step": 210
+    },
+    {
+      "epoch": 0.2596636175863086,
+      "grad_norm": 1.3116216659545898,
+      "learning_rate": 4.6388376810212905e-05,
+      "loss": 1.3915,
+      "step": 220
+    },
+    {
+      "epoch": 0.2714665092947772,
+      "grad_norm": 1.6775559186935425,
+      "learning_rate": 4.583671004105096e-05,
+      "loss": 1.4104,
+      "step": 230
+    },
+    {
+      "epoch": 0.28326940100324577,
+      "grad_norm": 1.4933921098709106,
+      "learning_rate": 4.524963070291744e-05,
+      "loss": 1.411,
+      "step": 240
+    },
+    {
+      "epoch": 0.29507229271171437,
+      "grad_norm": 1.057346224784851,
+      "learning_rate": 4.4628136553496375e-05,
+      "loss": 1.3628,
+      "step": 250
+    },
+    {
+      "epoch": 0.30687518442018297,
+      "grad_norm": 1.389323353767395,
+      "learning_rate": 4.397328383940196e-05,
+      "loss": 1.331,
+      "step": 260
+    },
+    {
+      "epoch": 0.3186780761286515,
+      "grad_norm": 1.2937726974487305,
+      "learning_rate": 4.328618550105802e-05,
+      "loss": 1.338,
+      "step": 270
+    },
+    {
+      "epoch": 0.3304809678371201,
+      "grad_norm": 1.0137064456939697,
+      "learning_rate": 4.256800928122475e-05,
+      "loss": 1.4131,
+      "step": 280
+    },
+    {
+      "epoch": 0.34228385954558865,
+      "grad_norm": 1.2065379619598389,
+      "learning_rate": 4.181997574038741e-05,
+      "loss": 1.3584,
+      "step": 290
+    },
+    {
+      "epoch": 0.35408675125405725,
+      "grad_norm": 1.0482007265090942,
+      "learning_rate": 4.104335618237972e-05,
+      "loss": 1.3541,
+      "step": 300
+    },
+    {
+      "epoch": 0.3658896429625258,
+      "grad_norm": 1.1176925897598267,
+      "learning_rate": 4.0239470493767704e-05,
+      "loss": 1.359,
+      "step": 310
+    },
+    {
+      "epoch": 0.3776925346709944,
+      "grad_norm": 0.9922409653663635,
+      "learning_rate": 3.940968490066559e-05,
+      "loss": 1.261,
+      "step": 320
+    },
+    {
+      "epoch": 0.389495426379463,
+      "grad_norm": 1.4820419549942017,
+      "learning_rate": 3.855540964679658e-05,
+      "loss": 1.2903,
+      "step": 330
+    },
+    {
+      "epoch": 0.40129831808793154,
+      "grad_norm": 1.443935751914978,
+      "learning_rate": 3.767809659674433e-05,
+      "loss": 1.3593,
+      "step": 340
+    },
+    {
+      "epoch": 0.41310120979640014,
+      "grad_norm": 2.1749682426452637,
+      "learning_rate": 3.677923676846864e-05,
+      "loss": 1.3608,
+      "step": 350
+    },
+    {
+      "epoch": 0.4249041015048687,
+      "grad_norm": 1.4614121913909912,
+      "learning_rate": 3.586035779927896e-05,
+      "loss": 1.2742,
+      "step": 360
+    },
+    {
+      "epoch": 0.4367069932133373,
+      "grad_norm": 1.1197657585144043,
+      "learning_rate": 3.492302134957218e-05,
+      "loss": 1.3217,
+      "step": 370
+    },
+    {
+      "epoch": 0.4485098849218058,
+      "grad_norm": 1.1603928804397583,
+      "learning_rate": 3.396882044874736e-05,
+      "loss": 1.2824,
+      "step": 380
+    },
+    {
+      "epoch": 0.4603127766302744,
+      "grad_norm": 1.6083821058273315,
+      "learning_rate": 3.2999376787807864e-05,
+      "loss": 1.344,
+      "step": 390
+    },
+    {
+      "epoch": 0.47211566833874297,
+      "grad_norm": 1.56455397605896,
+      "learning_rate": 3.201633796325233e-05,
+      "loss": 1.3372,
+      "step": 400
+    },
+    {
+      "epoch": 0.48391856004721157,
+      "grad_norm": 1.4750654697418213,
+      "learning_rate": 3.1021374676938584e-05,
+      "loss": 1.33,
+      "step": 410
+    },
+    {
+      "epoch": 0.49572145175568016,
+      "grad_norm": 1.2510316371917725,
+      "learning_rate": 3.0016177896679255e-05,
+      "loss": 1.2919,
+      "step": 420
+    },
+    {
+      "epoch": 0.5075243434641488,
+      "grad_norm": 1.2658268213272095,
+      "learning_rate": 2.9002455982394944e-05,
+      "loss": 1.2649,
+      "step": 430
+    },
+    {
+      "epoch": 0.5193272351726173,
+      "grad_norm": 1.905948519706726,
+      "learning_rate": 2.798193178270889e-05,
+      "loss": 1.3047,
+      "step": 440
+    },
+    {
+      "epoch": 0.5311301268810859,
+      "grad_norm": 1.2382662296295166,
+      "learning_rate": 2.695633970691786e-05,
+      "loss": 1.2862,
+      "step": 450
+    },
+    {
+      "epoch": 0.5429330185895545,
+      "grad_norm": 1.2122917175292969,
+      "learning_rate": 2.592742277731513e-05,
+      "loss": 1.2843,
+      "step": 460
+    },
+    {
+      "epoch": 0.554735910298023,
+      "grad_norm": 1.3638920783996582,
+      "learning_rate": 2.489692966687566e-05,
+      "loss": 1.2795,
+      "step": 470
+    },
+    {
+      "epoch": 0.5665388020064915,
+      "grad_norm": 1.2353355884552002,
+      "learning_rate": 2.386661172733762e-05,
+      "loss": 1.1897,
+      "step": 480
+    },
+    {
+      "epoch": 0.5783416937149601,
+      "grad_norm": 1.440238118171692,
+      "learning_rate": 2.2838220012731365e-05,
+      "loss": 1.3352,
+      "step": 490
+    },
+    {
+      "epoch": 0.5901445854234287,
+      "grad_norm": 1.302875280380249,
+      "learning_rate": 2.1813502303414306e-05,
+      "loss": 1.2552,
+      "step": 500
+    },
+    {
+      "epoch": 0.6019474771318973,
+      "grad_norm": 1.3485292196273804,
+      "learning_rate": 2.0794200135669584e-05,
+      "loss": 1.2573,
+      "step": 510
+    },
+    {
+      "epoch": 0.6137503688403659,
+      "grad_norm": 2.1018223762512207,
+      "learning_rate": 1.9782045841916625e-05,
+      "loss": 1.2564,
+      "step": 520
+    },
+    {
+      "epoch": 0.6255532605488344,
+      "grad_norm": 1.6336476802825928,
+      "learning_rate": 1.877875960656394e-05,
+      "loss": 1.1512,
+      "step": 530
+    },
+    {
+      "epoch": 0.637356152257303,
+      "grad_norm": 1.4360566139221191,
+      "learning_rate": 1.7786046542507843e-05,
+      "loss": 1.2434,
+      "step": 540
+    },
+    {
+      "epoch": 0.6491590439657716,
+      "grad_norm": 1.1216990947723389,
+      "learning_rate": 1.680559379324558e-05,
+      "loss": 1.325,
+      "step": 550
+    },
+    {
+      "epoch": 0.6609619356742402,
+      "grad_norm": 1.6999801397323608,
+      "learning_rate": 1.583906766552799e-05,
+      "loss": 1.2197,
+      "step": 560
+    },
+    {
+      "epoch": 0.6727648273827088,
+      "grad_norm": 1.4907481670379639,
+      "learning_rate": 1.4888110797424782e-05,
+      "loss": 1.2821,
+      "step": 570
+    },
+    {
+      "epoch": 0.6845677190911773,
+      "grad_norm": 1.2150344848632812,
+      "learning_rate": 1.3954339366615334e-05,
+      "loss": 1.239,
+      "step": 580
+    },
+    {
+      "epoch": 0.6963706107996459,
+      "grad_norm": 1.6709622144699097,
+      "learning_rate": 1.303934034364983e-05,
+      "loss": 1.2403,
+      "step": 590
+    },
+    {
+      "epoch": 0.7081735025081145,
+      "grad_norm": 1.5160703659057617,
+      "learning_rate": 1.21446687948485e-05,
+      "loss": 1.2466,
+      "step": 600
+    },
+    {
+      "epoch": 0.7199763942165831,
+      "grad_norm": 1.2667752504348755,
+      "learning_rate": 1.1271845239423196e-05,
+      "loss": 1.1662,
+      "step": 610
+    },
+    {
+      "epoch": 0.7317792859250516,
+      "grad_norm": 1.685145616531372,
+      "learning_rate": 1.0422353065312573e-05,
+      "loss": 1.3161,
+      "step": 620
+    },
+    {
+      "epoch": 0.7435821776335202,
+      "grad_norm": 1.5131856203079224,
+      "learning_rate": 9.59763600812305e-06,
+      "loss": 1.2608,
+      "step": 630
+    },
+    {
+      "epoch": 0.7553850693419888,
+      "grad_norm": 1.2261701822280884,
+      "learning_rate": 8.79909569745987e-06,
+      "loss": 1.1507,
+      "step": 640
+    },
+    {
+      "epoch": 0.7671879610504574,
+      "grad_norm": 1.2804995775222778,
+      "learning_rate": 8.028089274818624e-06,
+      "loss": 1.3008,
+      "step": 650
+    },
+    {
+      "epoch": 0.778990852758926,
+      "grad_norm": 1.3678828477859497,
+      "learning_rate": 7.285927087085423e-06,
+      "loss": 1.272,
+      "step": 660
+    },
+    {
+      "epoch": 0.7907937444673945,
+      "grad_norm": 1.3345593214035034,
+      "learning_rate": 6.5738704595659065e-06,
+      "loss": 1.1615,
+      "step": 670
+    },
+    {
+      "epoch": 0.8025966361758631,
+      "grad_norm": 1.2585678100585938,
+      "learning_rate": 5.893129552327781e-06,
+      "loss": 1.1878,
+      "step": 680
+    },
+    {
+      "epoch": 0.8143995278843317,
+      "grad_norm": 1.3462913036346436,
+      "learning_rate": 5.244861303500026e-06,
+      "loss": 1.2436,
+      "step": 690
+    },
+    {
+      "epoch": 0.8262024195928003,
+      "grad_norm": 1.1118088960647583,
+      "learning_rate": 4.630167463024393e-06,
+      "loss": 1.0838,
+      "step": 700
+    },
+    {
+      "epoch": 0.8380053113012688,
+      "grad_norm": 1.7299799919128418,
+      "learning_rate": 4.050092720200638e-06,
+      "loss": 1.1495,
+      "step": 710
+    },
+    {
+      "epoch": 0.8498082030097374,
+      "grad_norm": 1.3773056268692017,
+      "learning_rate": 3.5056229282080077e-06,
+      "loss": 1.234,
+      "step": 720
+    },
+    {
+      "epoch": 0.861611094718206,
+      "grad_norm": 1.2820888757705688,
+      "learning_rate": 2.997683428620296e-06,
+      "loss": 1.1803,
+      "step": 730
+    },
+    {
+      "epoch": 0.8734139864266746,
+      "grad_norm": 1.3301385641098022,
+      "learning_rate": 2.527137478762037e-06,
+      "loss": 1.2197,
+      "step": 740
+    },
+    {
+      "epoch": 0.8852168781351432,
+      "grad_norm": 1.7628834247589111,
+      "learning_rate": 2.094784784578707e-06,
+      "loss": 1.2354,
+      "step": 750
+    },
+    {
+      "epoch": 0.8970197698436116,
+      "grad_norm": 1.2032676935195923,
+      "learning_rate": 1.7013601415141383e-06,
+      "loss": 1.1835,
+      "step": 760
+    },
+    {
+      "epoch": 0.9088226615520802,
+      "grad_norm": 1.5983058214187622,
+      "learning_rate": 1.3475321857052386e-06,
+      "loss": 1.1651,
+      "step": 770
+    },
+    {
+      "epoch": 0.9206255532605488,
+      "grad_norm": 1.0227899551391602,
+      "learning_rate": 1.03390225761624e-06,
+      "loss": 1.1662,
+      "step": 780
+    },
+    {
+      "epoch": 0.9324284449690174,
+      "grad_norm": 1.352665901184082,
+      "learning_rate": 7.610033800438344e-07,
+      "loss": 1.1798,
+      "step": 790
+    },
+    {
+      "epoch": 0.9442313366774859,
+      "grad_norm": 1.6476454734802246,
+      "learning_rate": 5.292993522301005e-07,
+      "loss": 1.2053,
+      "step": 800
+    },
+    {
+      "epoch": 0.9560342283859545,
+      "grad_norm": 1.2775633335113525,
+      "learning_rate": 3.3918396162275214e-07,
+      "loss": 1.2049,
+      "step": 810
+    },
+    {
+      "epoch": 0.9678371200944231,
+      "grad_norm": 1.4991925954818726,
+      "learning_rate": 1.9098031462242705e-07,
+      "loss": 1.2097,
+      "step": 820
+    },
+    {
+      "epoch": 0.9796400118028917,
+      "grad_norm": 1.3501712083816528,
+      "learning_rate": 8.494028745434368e-08,
+      "loss": 1.2085,
+      "step": 830
+    },
+    {
+      "epoch": 0.9914429035113603,
+      "grad_norm": 1.319488763809204,
+      "learning_rate": 2.124409809766692e-08,
+      "loss": 1.1854,
+      "step": 840
+    },
+    {
+      "epoch": 0.9997049277072882,
+      "step": 847,
+      "total_flos": 1.2532647345436754e+18,
+      "train_loss": 1.3799798170537847,
+      "train_runtime": 10524.3823,
+      "train_samples_per_second": 2.576,
+      "train_steps_per_second": 0.08
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 847,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2532647345436754e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}