{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 30.21004867553711, "learning_rate": 4.0000000000000003e-07, "loss": 3.7961, "step": 5 }, { "epoch": 0.04, "grad_norm": 27.698633193969727, "learning_rate": 8.000000000000001e-07, "loss": 3.7529, "step": 10 }, { "epoch": 0.06, "grad_norm": 17.014102935791016, "learning_rate": 1.2000000000000002e-06, "loss": 3.6348, "step": 15 }, { "epoch": 0.08, "grad_norm": 12.189393997192383, "learning_rate": 1.6000000000000001e-06, "loss": 3.3817, "step": 20 }, { "epoch": 0.1, "grad_norm": 9.264037132263184, "learning_rate": 2.0000000000000003e-06, "loss": 3.0922, "step": 25 }, { "epoch": 0.12, "grad_norm": 6.129642963409424, "learning_rate": 2.4000000000000003e-06, "loss": 2.8821, "step": 30 }, { "epoch": 0.14, "grad_norm": 4.392693996429443, "learning_rate": 2.8000000000000003e-06, "loss": 2.6029, "step": 35 }, { "epoch": 0.16, "grad_norm": 6.556579113006592, "learning_rate": 3.2000000000000003e-06, "loss": 2.4349, "step": 40 }, { "epoch": 0.18, "grad_norm": 3.495549440383911, "learning_rate": 3.6000000000000003e-06, "loss": 2.2218, "step": 45 }, { "epoch": 0.2, "grad_norm": 3.5002224445343018, "learning_rate": 4.000000000000001e-06, "loss": 2.0837, "step": 50 }, { "epoch": 0.22, "grad_norm": 3.8143324851989746, "learning_rate": 4.4e-06, "loss": 2.0036, "step": 55 }, { "epoch": 0.24, "grad_norm": 3.2101738452911377, "learning_rate": 4.800000000000001e-06, "loss": 1.9349, "step": 60 }, { "epoch": 0.26, "grad_norm": 2.8242392539978027, "learning_rate": 5.2e-06, "loss": 1.8964, "step": 65 }, { "epoch": 0.28, "grad_norm": 2.515568494796753, "learning_rate": 5.600000000000001e-06, "loss": 1.8728, "step": 70 }, { "epoch": 0.3, "grad_norm": 2.872358798980713, "learning_rate": 6e-06, "loss": 1.8441, "step": 75 }, { "epoch": 0.32, "grad_norm": 3.117579936981201, "learning_rate": 6.4000000000000006e-06, "loss": 1.8218, "step": 80 }, { "epoch": 0.34, "grad_norm": 2.6006624698638916, "learning_rate": 6.800000000000001e-06, "loss": 1.809, "step": 85 }, { "epoch": 0.36, "grad_norm": 2.7116575241088867, "learning_rate": 7.2000000000000005e-06, "loss": 1.7963, "step": 90 }, { "epoch": 0.38, "grad_norm": 2.297466278076172, "learning_rate": 7.600000000000001e-06, "loss": 1.7763, "step": 95 }, { "epoch": 0.4, "grad_norm": 2.5194053649902344, "learning_rate": 8.000000000000001e-06, "loss": 1.772, "step": 100 }, { "epoch": 0.42, "grad_norm": 2.4360179901123047, "learning_rate": 8.400000000000001e-06, "loss": 1.7559, "step": 105 }, { "epoch": 0.44, "grad_norm": 2.224982261657715, "learning_rate": 8.8e-06, "loss": 1.7485, "step": 110 }, { "epoch": 0.46, "grad_norm": 2.297065258026123, "learning_rate": 9.200000000000002e-06, "loss": 1.7436, "step": 115 }, { "epoch": 0.48, "grad_norm": 2.390683889389038, "learning_rate": 9.600000000000001e-06, "loss": 1.7416, "step": 120 }, { "epoch": 0.5, "grad_norm": 2.2673444747924805, "learning_rate": 1e-05, "loss": 1.7449, "step": 125 }, { "epoch": 0.52, "grad_norm": 2.167109251022339, "learning_rate": 1.04e-05, "loss": 1.7413, "step": 130 }, { "epoch": 0.54, "grad_norm": 2.0062167644500732, "learning_rate": 1.0800000000000002e-05, "loss": 1.7111, "step": 135 }, { "epoch": 0.56, "grad_norm": 2.1711270809173584, "learning_rate": 1.1200000000000001e-05, "loss": 1.7177, "step": 140 }, { "epoch": 0.58, "grad_norm": 2.027642011642456, "learning_rate": 1.16e-05, "loss": 1.7261, "step": 145 }, { "epoch": 0.6, "grad_norm": 1.953282356262207, "learning_rate": 1.2e-05, "loss": 1.7124, "step": 150 }, { "epoch": 0.62, "grad_norm": 2.0645740032196045, "learning_rate": 1.2400000000000002e-05, "loss": 1.7249, "step": 155 }, { "epoch": 0.64, "grad_norm": 2.904872179031372, "learning_rate": 1.2800000000000001e-05, "loss": 1.7226, "step": 160 }, { "epoch": 0.66, "grad_norm": 2.2629189491271973, "learning_rate": 1.3200000000000002e-05, "loss": 1.7156, "step": 165 }, { "epoch": 0.68, "grad_norm": 2.0626604557037354, "learning_rate": 1.3600000000000002e-05, "loss": 1.7285, "step": 170 }, { "epoch": 0.7, "grad_norm": 2.1120095252990723, "learning_rate": 1.4e-05, "loss": 1.729, "step": 175 }, { "epoch": 0.72, "grad_norm": 2.1083803176879883, "learning_rate": 1.4400000000000001e-05, "loss": 1.7186, "step": 180 }, { "epoch": 0.74, "grad_norm": 2.4473488330841064, "learning_rate": 1.48e-05, "loss": 1.7169, "step": 185 }, { "epoch": 0.76, "grad_norm": 3.1553521156311035, "learning_rate": 1.5200000000000002e-05, "loss": 1.7141, "step": 190 }, { "epoch": 0.78, "grad_norm": 1.8050827980041504, "learning_rate": 1.5600000000000003e-05, "loss": 1.7211, "step": 195 }, { "epoch": 0.8, "grad_norm": 1.81538987159729, "learning_rate": 1.6000000000000003e-05, "loss": 1.718, "step": 200 }, { "epoch": 0.82, "grad_norm": 1.8642960786819458, "learning_rate": 1.64e-05, "loss": 1.7249, "step": 205 }, { "epoch": 0.84, "grad_norm": 1.9567952156066895, "learning_rate": 1.6800000000000002e-05, "loss": 1.7257, "step": 210 }, { "epoch": 0.86, "grad_norm": 2.035290002822876, "learning_rate": 1.72e-05, "loss": 1.7328, "step": 215 }, { "epoch": 0.88, "grad_norm": 1.8088799715042114, "learning_rate": 1.76e-05, "loss": 1.7182, "step": 220 }, { "epoch": 0.9, "grad_norm": 1.9366027116775513, "learning_rate": 1.8e-05, "loss": 1.7202, "step": 225 }, { "epoch": 0.92, "grad_norm": 1.8107857704162598, "learning_rate": 1.8400000000000003e-05, "loss": 1.7163, "step": 230 }, { "epoch": 0.94, "grad_norm": 1.8145129680633545, "learning_rate": 1.88e-05, "loss": 1.7214, "step": 235 }, { "epoch": 0.96, "grad_norm": 2.3850910663604736, "learning_rate": 1.9200000000000003e-05, "loss": 1.7193, "step": 240 }, { "epoch": 0.98, "grad_norm": 2.1443727016448975, "learning_rate": 1.9600000000000002e-05, "loss": 1.7152, "step": 245 }, { "epoch": 1.0, "grad_norm": 1.813616156578064, "learning_rate": 2e-05, "loss": 1.7176, "step": 250 }, { "epoch": 1.02, "grad_norm": 1.8890033960342407, "learning_rate": 1.9999756307053947e-05, "loss": 1.7146, "step": 255 }, { "epoch": 1.04, "grad_norm": 1.7228977680206299, "learning_rate": 1.9999025240093045e-05, "loss": 1.709, "step": 260 }, { "epoch": 1.06, "grad_norm": 1.7775235176086426, "learning_rate": 1.9997806834748455e-05, "loss": 1.7132, "step": 265 }, { "epoch": 1.08, "grad_norm": 1.6786800622940063, "learning_rate": 1.9996101150403543e-05, "loss": 1.7058, "step": 270 }, { "epoch": 1.1, "grad_norm": 3.5919995307922363, "learning_rate": 1.999390827019096e-05, "loss": 1.7135, "step": 275 }, { "epoch": 1.12, "grad_norm": 1.6665600538253784, "learning_rate": 1.9991228300988586e-05, "loss": 1.7138, "step": 280 }, { "epoch": 1.1400000000000001, "grad_norm": 1.6498384475708008, "learning_rate": 1.9988061373414342e-05, "loss": 1.71, "step": 285 }, { "epoch": 1.16, "grad_norm": 1.7032837867736816, "learning_rate": 1.9984407641819812e-05, "loss": 1.7103, "step": 290 }, { "epoch": 1.18, "grad_norm": 1.7445223331451416, "learning_rate": 1.9980267284282718e-05, "loss": 1.7022, "step": 295 }, { "epoch": 1.2, "grad_norm": 1.8121885061264038, "learning_rate": 1.9975640502598243e-05, "loss": 1.7238, "step": 300 }, { "epoch": 1.22, "grad_norm": 2.3573601245880127, "learning_rate": 1.9970527522269204e-05, "loss": 1.7069, "step": 305 }, { "epoch": 1.24, "grad_norm": 1.790433406829834, "learning_rate": 1.9964928592495046e-05, "loss": 1.7151, "step": 310 }, { "epoch": 1.26, "grad_norm": 1.7757713794708252, "learning_rate": 1.9958843986159705e-05, "loss": 1.7044, "step": 315 }, { "epoch": 1.28, "grad_norm": 1.4678212404251099, "learning_rate": 1.9952273999818312e-05, "loss": 1.7023, "step": 320 }, { "epoch": 1.3, "grad_norm": 1.7063349485397339, "learning_rate": 1.9945218953682736e-05, "loss": 1.7132, "step": 325 }, { "epoch": 1.32, "grad_norm": 1.537674903869629, "learning_rate": 1.9937679191605964e-05, "loss": 1.7071, "step": 330 }, { "epoch": 1.34, "grad_norm": 1.6040879487991333, "learning_rate": 1.992965508106537e-05, "loss": 1.7004, "step": 335 }, { "epoch": 1.3599999999999999, "grad_norm": 1.7427316904067993, "learning_rate": 1.9921147013144782e-05, "loss": 1.7006, "step": 340 }, { "epoch": 1.38, "grad_norm": 2.1476292610168457, "learning_rate": 1.991215540251542e-05, "loss": 1.7059, "step": 345 }, { "epoch": 1.4, "grad_norm": 1.5894598960876465, "learning_rate": 1.9902680687415704e-05, "loss": 1.7066, "step": 350 }, { "epoch": 1.42, "grad_norm": 1.5307775735855103, "learning_rate": 1.9892723329629885e-05, "loss": 1.7104, "step": 355 }, { "epoch": 1.44, "grad_norm": 1.502901315689087, "learning_rate": 1.988228381446553e-05, "loss": 1.7045, "step": 360 }, { "epoch": 1.46, "grad_norm": 1.6482329368591309, "learning_rate": 1.987136265072988e-05, "loss": 1.7235, "step": 365 }, { "epoch": 1.48, "grad_norm": 1.5000663995742798, "learning_rate": 1.985996037070505e-05, "loss": 1.7026, "step": 370 }, { "epoch": 1.5, "grad_norm": 1.5986205339431763, "learning_rate": 1.9848077530122083e-05, "loss": 1.7108, "step": 375 }, { "epoch": 1.52, "grad_norm": 2.0925049781799316, "learning_rate": 1.983571470813386e-05, "loss": 1.7049, "step": 380 }, { "epoch": 1.54, "grad_norm": 1.4937970638275146, "learning_rate": 1.982287250728689e-05, "loss": 1.7075, "step": 385 }, { "epoch": 1.56, "grad_norm": 1.7145628929138184, "learning_rate": 1.9809551553491918e-05, "loss": 1.704, "step": 390 }, { "epoch": 1.58, "grad_norm": 1.4836574792861938, "learning_rate": 1.979575249599344e-05, "loss": 1.6981, "step": 395 }, { "epoch": 1.6, "grad_norm": 1.8676332235336304, "learning_rate": 1.9781476007338058e-05, "loss": 1.7138, "step": 400 }, { "epoch": 1.62, "grad_norm": 1.549514651298523, "learning_rate": 1.9766722783341682e-05, "loss": 1.7077, "step": 405 }, { "epoch": 1.6400000000000001, "grad_norm": 1.6212232112884521, "learning_rate": 1.9751493543055634e-05, "loss": 1.7103, "step": 410 }, { "epoch": 1.6600000000000001, "grad_norm": 1.6188278198242188, "learning_rate": 1.9735789028731603e-05, "loss": 1.7064, "step": 415 }, { "epoch": 1.6800000000000002, "grad_norm": 1.6380282640457153, "learning_rate": 1.9719610005785466e-05, "loss": 1.6988, "step": 420 }, { "epoch": 1.7, "grad_norm": 1.4315824508666992, "learning_rate": 1.9702957262759964e-05, "loss": 1.7076, "step": 425 }, { "epoch": 1.72, "grad_norm": 1.497166395187378, "learning_rate": 1.9685831611286312e-05, "loss": 1.6965, "step": 430 }, { "epoch": 1.74, "grad_norm": 1.5363743305206299, "learning_rate": 1.9668233886044597e-05, "loss": 1.7073, "step": 435 }, { "epoch": 1.76, "grad_norm": 1.4399864673614502, "learning_rate": 1.9650164944723116e-05, "loss": 1.7028, "step": 440 }, { "epoch": 1.78, "grad_norm": 1.5128633975982666, "learning_rate": 1.9631625667976584e-05, "loss": 1.7078, "step": 445 }, { "epoch": 1.8, "grad_norm": 1.4235734939575195, "learning_rate": 1.961261695938319e-05, "loss": 1.7093, "step": 450 }, { "epoch": 1.8199999999999998, "grad_norm": 1.4514182806015015, "learning_rate": 1.9593139745400575e-05, "loss": 1.7059, "step": 455 }, { "epoch": 1.8399999999999999, "grad_norm": 1.360684871673584, "learning_rate": 1.9573194975320672e-05, "loss": 1.7017, "step": 460 }, { "epoch": 1.8599999999999999, "grad_norm": 1.4784973859786987, "learning_rate": 1.9552783621223437e-05, "loss": 1.7054, "step": 465 }, { "epoch": 1.88, "grad_norm": 1.4710580110549927, "learning_rate": 1.9531906677929472e-05, "loss": 1.707, "step": 470 }, { "epoch": 1.9, "grad_norm": 1.3699547052383423, "learning_rate": 1.9510565162951538e-05, "loss": 1.7047, "step": 475 }, { "epoch": 1.92, "grad_norm": 1.395521879196167, "learning_rate": 1.9488760116444966e-05, "loss": 1.7121, "step": 480 }, { "epoch": 1.94, "grad_norm": 1.368019938468933, "learning_rate": 1.9466492601156964e-05, "loss": 1.7013, "step": 485 }, { "epoch": 1.96, "grad_norm": 1.7290316820144653, "learning_rate": 1.944376370237481e-05, "loss": 1.7042, "step": 490 }, { "epoch": 1.98, "grad_norm": 1.4156850576400757, "learning_rate": 1.942057452787297e-05, "loss": 1.7041, "step": 495 }, { "epoch": 2.0, "grad_norm": 1.374906063079834, "learning_rate": 1.9396926207859085e-05, "loss": 1.6999, "step": 500 }, { "epoch": 2.02, "grad_norm": 1.431320309638977, "learning_rate": 1.937281989491892e-05, "loss": 1.7057, "step": 505 }, { "epoch": 2.04, "grad_norm": 1.4097741842269897, "learning_rate": 1.9348256763960146e-05, "loss": 1.6996, "step": 510 }, { "epoch": 2.06, "grad_norm": 1.4540244340896606, "learning_rate": 1.9323238012155125e-05, "loss": 1.7065, "step": 515 }, { "epoch": 2.08, "grad_norm": 1.3888367414474487, "learning_rate": 1.9297764858882516e-05, "loss": 1.6991, "step": 520 }, { "epoch": 2.1, "grad_norm": 1.3523085117340088, "learning_rate": 1.9271838545667876e-05, "loss": 1.7039, "step": 525 }, { "epoch": 2.12, "grad_norm": 1.4312589168548584, "learning_rate": 1.9245460336123136e-05, "loss": 1.6876, "step": 530 }, { "epoch": 2.14, "grad_norm": 1.4729927778244019, "learning_rate": 1.9218631515885007e-05, "loss": 1.7074, "step": 535 }, { "epoch": 2.16, "grad_norm": 1.3183554410934448, "learning_rate": 1.9191353392552346e-05, "loss": 1.7018, "step": 540 }, { "epoch": 2.18, "grad_norm": 1.4427063465118408, "learning_rate": 1.9163627295622397e-05, "loss": 1.6896, "step": 545 }, { "epoch": 2.2, "grad_norm": 1.4542627334594727, "learning_rate": 1.913545457642601e-05, "loss": 1.6937, "step": 550 }, { "epoch": 2.22, "grad_norm": 1.3560303449630737, "learning_rate": 1.910683660806177e-05, "loss": 1.7066, "step": 555 }, { "epoch": 2.24, "grad_norm": 1.6534428596496582, "learning_rate": 1.907777478532909e-05, "loss": 1.6913, "step": 560 }, { "epoch": 2.26, "grad_norm": 1.334754467010498, "learning_rate": 1.9048270524660197e-05, "loss": 1.7045, "step": 565 }, { "epoch": 2.2800000000000002, "grad_norm": 1.367875576019287, "learning_rate": 1.901832526405114e-05, "loss": 1.6936, "step": 570 }, { "epoch": 2.3, "grad_norm": 1.3227001428604126, "learning_rate": 1.8987940462991673e-05, "loss": 1.6991, "step": 575 }, { "epoch": 2.32, "grad_norm": 1.2901933193206787, "learning_rate": 1.895711760239413e-05, "loss": 1.696, "step": 580 }, { "epoch": 2.34, "grad_norm": 1.33935546875, "learning_rate": 1.892585818452126e-05, "loss": 1.7008, "step": 585 }, { "epoch": 2.36, "grad_norm": 1.487889051437378, "learning_rate": 1.889416373291298e-05, "loss": 1.6883, "step": 590 }, { "epoch": 2.38, "grad_norm": 1.3563237190246582, "learning_rate": 1.8862035792312148e-05, "loss": 1.6967, "step": 595 }, { "epoch": 2.4, "grad_norm": 1.2794183492660522, "learning_rate": 1.8829475928589272e-05, "loss": 1.6922, "step": 600 }, { "epoch": 2.42, "grad_norm": 1.4755550622940063, "learning_rate": 1.879648572866617e-05, "loss": 1.7054, "step": 605 }, { "epoch": 2.44, "grad_norm": 1.307241678237915, "learning_rate": 1.8763066800438638e-05, "loss": 1.6897, "step": 610 }, { "epoch": 2.46, "grad_norm": 1.4721298217773438, "learning_rate": 1.8729220772698096e-05, "loss": 1.6969, "step": 615 }, { "epoch": 2.48, "grad_norm": 1.8397995233535767, "learning_rate": 1.869494929505219e-05, "loss": 1.697, "step": 620 }, { "epoch": 2.5, "grad_norm": 1.2730300426483154, "learning_rate": 1.866025403784439e-05, "loss": 1.6994, "step": 625 }, { "epoch": 2.52, "grad_norm": 1.252538800239563, "learning_rate": 1.8625136692072577e-05, "loss": 1.7046, "step": 630 }, { "epoch": 2.54, "grad_norm": 1.2064297199249268, "learning_rate": 1.8589598969306646e-05, "loss": 1.6935, "step": 635 }, { "epoch": 2.56, "grad_norm": 1.2591032981872559, "learning_rate": 1.855364260160507e-05, "loss": 1.7, "step": 640 }, { "epoch": 2.58, "grad_norm": 1.380873203277588, "learning_rate": 1.851726934143048e-05, "loss": 1.6952, "step": 645 }, { "epoch": 2.6, "grad_norm": 1.301558256149292, "learning_rate": 1.848048096156426e-05, "loss": 1.7029, "step": 650 }, { "epoch": 2.62, "grad_norm": 1.3414067029953003, "learning_rate": 1.8443279255020153e-05, "loss": 1.6973, "step": 655 }, { "epoch": 2.64, "grad_norm": 1.2605617046356201, "learning_rate": 1.8405666034956842e-05, "loss": 1.6883, "step": 660 }, { "epoch": 2.66, "grad_norm": 1.2734073400497437, "learning_rate": 1.836764313458962e-05, "loss": 1.6925, "step": 665 }, { "epoch": 2.68, "grad_norm": 1.3956694602966309, "learning_rate": 1.8329212407100996e-05, "loss": 1.7057, "step": 670 }, { "epoch": 2.7, "grad_norm": 1.463300108909607, "learning_rate": 1.8290375725550417e-05, "loss": 1.695, "step": 675 }, { "epoch": 2.7199999999999998, "grad_norm": 1.276495099067688, "learning_rate": 1.8251134982782952e-05, "loss": 1.7108, "step": 680 }, { "epoch": 2.74, "grad_norm": 1.205746054649353, "learning_rate": 1.821149209133704e-05, "loss": 1.6987, "step": 685 }, { "epoch": 2.76, "grad_norm": 1.5678800344467163, "learning_rate": 1.8171448983351284e-05, "loss": 1.6933, "step": 690 }, { "epoch": 2.7800000000000002, "grad_norm": 1.2740401029586792, "learning_rate": 1.8131007610470278e-05, "loss": 1.7092, "step": 695 }, { "epoch": 2.8, "grad_norm": 1.2071486711502075, "learning_rate": 1.8090169943749477e-05, "loss": 1.6895, "step": 700 }, { "epoch": 2.82, "grad_norm": 1.2892779111862183, "learning_rate": 1.804893797355914e-05, "loss": 1.7124, "step": 705 }, { "epoch": 2.84, "grad_norm": 1.339904546737671, "learning_rate": 1.8007313709487334e-05, "loss": 1.7018, "step": 710 }, { "epoch": 2.86, "grad_norm": 1.2110613584518433, "learning_rate": 1.7965299180241963e-05, "loss": 1.6975, "step": 715 }, { "epoch": 2.88, "grad_norm": 1.2822394371032715, "learning_rate": 1.792289643355191e-05, "loss": 1.7022, "step": 720 }, { "epoch": 2.9, "grad_norm": 1.3015908002853394, "learning_rate": 1.788010753606722e-05, "loss": 1.6998, "step": 725 }, { "epoch": 2.92, "grad_norm": 1.2471498250961304, "learning_rate": 1.78369345732584e-05, "loss": 1.6939, "step": 730 }, { "epoch": 2.94, "grad_norm": 1.5627294778823853, "learning_rate": 1.7793379649314743e-05, "loss": 1.6913, "step": 735 }, { "epoch": 2.96, "grad_norm": 1.384990930557251, "learning_rate": 1.7749444887041797e-05, "loss": 1.6877, "step": 740 }, { "epoch": 2.98, "grad_norm": 1.2845375537872314, "learning_rate": 1.7705132427757895e-05, "loss": 1.6985, "step": 745 }, { "epoch": 3.0, "grad_norm": 1.3291269540786743, "learning_rate": 1.766044443118978e-05, "loss": 1.6925, "step": 750 }, { "epoch": 3.02, "grad_norm": 1.144539475440979, "learning_rate": 1.761538307536737e-05, "loss": 1.6943, "step": 755 }, { "epoch": 3.04, "grad_norm": 1.256880521774292, "learning_rate": 1.7569950556517566e-05, "loss": 1.6949, "step": 760 }, { "epoch": 3.06, "grad_norm": 1.3574849367141724, "learning_rate": 1.7524149088957244e-05, "loss": 1.6912, "step": 765 }, { "epoch": 3.08, "grad_norm": 1.5494495630264282, "learning_rate": 1.747798090498532e-05, "loss": 1.6831, "step": 770 }, { "epoch": 3.1, "grad_norm": 1.711756944656372, "learning_rate": 1.7431448254773943e-05, "loss": 1.689, "step": 775 }, { "epoch": 3.12, "grad_norm": 1.3662241697311401, "learning_rate": 1.7384553406258842e-05, "loss": 1.6989, "step": 780 }, { "epoch": 3.14, "grad_norm": 1.2295734882354736, "learning_rate": 1.7337298645028764e-05, "loss": 1.6812, "step": 785 }, { "epoch": 3.16, "grad_norm": 1.1872354745864868, "learning_rate": 1.7289686274214116e-05, "loss": 1.6978, "step": 790 }, { "epoch": 3.18, "grad_norm": 1.2291423082351685, "learning_rate": 1.7241718614374678e-05, "loss": 1.6937, "step": 795 }, { "epoch": 3.2, "grad_norm": 1.1743723154067993, "learning_rate": 1.7193398003386514e-05, "loss": 1.6958, "step": 800 }, { "epoch": 3.22, "grad_norm": 1.2758196592330933, "learning_rate": 1.7144726796328034e-05, "loss": 1.6886, "step": 805 }, { "epoch": 3.24, "grad_norm": 1.2271584272384644, "learning_rate": 1.709570736536521e-05, "loss": 1.6896, "step": 810 }, { "epoch": 3.26, "grad_norm": 1.4545894861221313, "learning_rate": 1.7046342099635948e-05, "loss": 1.6934, "step": 815 }, { "epoch": 3.2800000000000002, "grad_norm": 1.218001365661621, "learning_rate": 1.6996633405133656e-05, "loss": 1.6984, "step": 820 }, { "epoch": 3.3, "grad_norm": 1.7679849863052368, "learning_rate": 1.6946583704589973e-05, "loss": 1.6965, "step": 825 }, { "epoch": 3.32, "grad_norm": 1.437711477279663, "learning_rate": 1.68961954373567e-05, "loss": 1.7012, "step": 830 }, { "epoch": 3.34, "grad_norm": 1.165284514427185, "learning_rate": 1.684547105928689e-05, "loss": 1.6817, "step": 835 }, { "epoch": 3.36, "grad_norm": 1.228750467300415, "learning_rate": 1.6794413042615168e-05, "loss": 1.6972, "step": 840 }, { "epoch": 3.38, "grad_norm": 1.3721333742141724, "learning_rate": 1.6743023875837233e-05, "loss": 1.6953, "step": 845 }, { "epoch": 3.4, "grad_norm": 1.221617341041565, "learning_rate": 1.6691306063588583e-05, "loss": 1.684, "step": 850 }, { "epoch": 3.42, "grad_norm": 1.4893969297409058, "learning_rate": 1.6639262126522417e-05, "loss": 1.6976, "step": 855 }, { "epoch": 3.44, "grad_norm": 1.3793864250183105, "learning_rate": 1.6586894601186804e-05, "loss": 1.6917, "step": 860 }, { "epoch": 3.46, "grad_norm": 1.2841719388961792, "learning_rate": 1.6534206039901057e-05, "loss": 1.688, "step": 865 }, { "epoch": 3.48, "grad_norm": 1.1975661516189575, "learning_rate": 1.6481199010631312e-05, "loss": 1.701, "step": 870 }, { "epoch": 3.5, "grad_norm": 1.4768658876419067, "learning_rate": 1.6427876096865394e-05, "loss": 1.6828, "step": 875 }, { "epoch": 3.52, "grad_norm": 1.1539041996002197, "learning_rate": 1.63742398974869e-05, "loss": 1.6807, "step": 880 }, { "epoch": 3.54, "grad_norm": 1.301548719406128, "learning_rate": 1.632029302664851e-05, "loss": 1.6826, "step": 885 }, { "epoch": 3.56, "grad_norm": 1.2049092054367065, "learning_rate": 1.6266038113644605e-05, "loss": 1.7009, "step": 890 }, { "epoch": 3.58, "grad_norm": 1.151050090789795, "learning_rate": 1.6211477802783105e-05, "loss": 1.6855, "step": 895 }, { "epoch": 3.6, "grad_norm": 1.1691709756851196, "learning_rate": 1.6156614753256583e-05, "loss": 1.692, "step": 900 }, { "epoch": 3.62, "grad_norm": 1.2301398515701294, "learning_rate": 1.610145163901268e-05, "loss": 1.6893, "step": 905 }, { "epoch": 3.64, "grad_norm": 1.1491998434066772, "learning_rate": 1.6045991148623752e-05, "loss": 1.6737, "step": 910 }, { "epoch": 3.66, "grad_norm": 1.6054513454437256, "learning_rate": 1.599023598515586e-05, "loss": 1.6843, "step": 915 }, { "epoch": 3.68, "grad_norm": 1.224255919456482, "learning_rate": 1.5934188866037017e-05, "loss": 1.6828, "step": 920 }, { "epoch": 3.7, "grad_norm": 1.2744394540786743, "learning_rate": 1.5877852522924733e-05, "loss": 1.6954, "step": 925 }, { "epoch": 3.7199999999999998, "grad_norm": 1.6310608386993408, "learning_rate": 1.5821229701572897e-05, "loss": 1.6992, "step": 930 }, { "epoch": 3.74, "grad_norm": 1.1329830884933472, "learning_rate": 1.5764323161697933e-05, "loss": 1.6858, "step": 935 }, { "epoch": 3.76, "grad_norm": 1.1588170528411865, "learning_rate": 1.570713567684432e-05, "loss": 1.6868, "step": 940 }, { "epoch": 3.7800000000000002, "grad_norm": 1.1529875993728638, "learning_rate": 1.564967003424938e-05, "loss": 1.688, "step": 945 }, { "epoch": 3.8, "grad_norm": 1.172899842262268, "learning_rate": 1.5591929034707468e-05, "loss": 1.6989, "step": 950 }, { "epoch": 3.82, "grad_norm": 1.1531150341033936, "learning_rate": 1.553391549243344e-05, "loss": 1.6896, "step": 955 }, { "epoch": 3.84, "grad_norm": 1.2427817583084106, "learning_rate": 1.5475632234925505e-05, "loss": 1.6944, "step": 960 }, { "epoch": 3.86, "grad_norm": 1.168960452079773, "learning_rate": 1.54170821028274e-05, "loss": 1.6789, "step": 965 }, { "epoch": 3.88, "grad_norm": 1.1754002571105957, "learning_rate": 1.5358267949789968e-05, "loss": 1.6926, "step": 970 }, { "epoch": 3.9, "grad_norm": 1.373303771018982, "learning_rate": 1.529919264233205e-05, "loss": 1.7003, "step": 975 }, { "epoch": 3.92, "grad_norm": 2.1144957542419434, "learning_rate": 1.5239859059700794e-05, "loss": 1.6954, "step": 980 }, { "epoch": 3.94, "grad_norm": 1.2079601287841797, "learning_rate": 1.5180270093731305e-05, "loss": 1.692, "step": 985 }, { "epoch": 3.96, "grad_norm": 1.2189019918441772, "learning_rate": 1.5120428648705716e-05, "loss": 1.6848, "step": 990 }, { "epoch": 3.98, "grad_norm": 1.1501588821411133, "learning_rate": 1.5060337641211637e-05, "loss": 1.7002, "step": 995 }, { "epoch": 4.0, "grad_norm": 1.1473348140716553, "learning_rate": 1.5000000000000002e-05, "loss": 1.6857, "step": 1000 }, { "epoch": 4.02, "grad_norm": 1.1277931928634644, "learning_rate": 1.493941866584231e-05, "loss": 1.6764, "step": 1005 }, { "epoch": 4.04, "grad_norm": 1.2237178087234497, "learning_rate": 1.4878596591387329e-05, "loss": 1.6793, "step": 1010 }, { "epoch": 4.06, "grad_norm": 1.3738579750061035, "learning_rate": 1.4817536741017153e-05, "loss": 1.6807, "step": 1015 }, { "epoch": 4.08, "grad_norm": 1.2409491539001465, "learning_rate": 1.4756242090702756e-05, "loss": 1.6794, "step": 1020 }, { "epoch": 4.1, "grad_norm": 1.2782703638076782, "learning_rate": 1.469471562785891e-05, "loss": 1.6749, "step": 1025 }, { "epoch": 4.12, "grad_norm": 1.2754226922988892, "learning_rate": 1.463296035119862e-05, "loss": 1.6734, "step": 1030 }, { "epoch": 4.14, "grad_norm": 1.2375459671020508, "learning_rate": 1.4570979270586944e-05, "loss": 1.6694, "step": 1035 }, { "epoch": 4.16, "grad_norm": 1.3407933712005615, "learning_rate": 1.4508775406894308e-05, "loss": 1.6766, "step": 1040 }, { "epoch": 4.18, "grad_norm": 1.4103169441223145, "learning_rate": 1.4446351791849276e-05, "loss": 1.6756, "step": 1045 }, { "epoch": 4.2, "grad_norm": 1.2496464252471924, "learning_rate": 1.4383711467890776e-05, "loss": 1.6804, "step": 1050 }, { "epoch": 4.22, "grad_norm": 1.2907923460006714, "learning_rate": 1.4320857488019826e-05, "loss": 1.6841, "step": 1055 }, { "epoch": 4.24, "grad_norm": 1.3250927925109863, "learning_rate": 1.4257792915650728e-05, "loss": 1.6815, "step": 1060 }, { "epoch": 4.26, "grad_norm": 1.2333563566207886, "learning_rate": 1.4194520824461773e-05, "loss": 1.6641, "step": 1065 }, { "epoch": 4.28, "grad_norm": 1.340520977973938, "learning_rate": 1.413104429824542e-05, "loss": 1.6791, "step": 1070 }, { "epoch": 4.3, "grad_norm": 1.379290223121643, "learning_rate": 1.4067366430758004e-05, "loss": 1.6995, "step": 1075 }, { "epoch": 4.32, "grad_norm": 1.1723048686981201, "learning_rate": 1.4003490325568953e-05, "loss": 1.6702, "step": 1080 }, { "epoch": 4.34, "grad_norm": 1.2853553295135498, "learning_rate": 1.3939419095909513e-05, "loss": 1.676, "step": 1085 }, { "epoch": 4.36, "grad_norm": 1.2285099029541016, "learning_rate": 1.3875155864521031e-05, "loss": 1.68, "step": 1090 }, { "epoch": 4.38, "grad_norm": 1.2212533950805664, "learning_rate": 1.3810703763502744e-05, "loss": 1.6758, "step": 1095 }, { "epoch": 4.4, "grad_norm": 1.2971107959747314, "learning_rate": 1.3746065934159123e-05, "loss": 1.6779, "step": 1100 }, { "epoch": 4.42, "grad_norm": 1.2474068403244019, "learning_rate": 1.3681245526846782e-05, "loss": 1.6857, "step": 1105 }, { "epoch": 4.44, "grad_norm": 1.330390453338623, "learning_rate": 1.3616245700820922e-05, "loss": 1.6797, "step": 1110 }, { "epoch": 4.46, "grad_norm": 1.258988857269287, "learning_rate": 1.3551069624081372e-05, "loss": 1.6794, "step": 1115 }, { "epoch": 4.48, "grad_norm": 1.2740941047668457, "learning_rate": 1.3485720473218153e-05, "loss": 1.6711, "step": 1120 }, { "epoch": 4.5, "grad_norm": 1.2095681428909302, "learning_rate": 1.342020143325669e-05, "loss": 1.6831, "step": 1125 }, { "epoch": 4.52, "grad_norm": 1.3167965412139893, "learning_rate": 1.3354515697502552e-05, "loss": 1.6789, "step": 1130 }, { "epoch": 4.54, "grad_norm": 1.2791593074798584, "learning_rate": 1.3288666467385834e-05, "loss": 1.6762, "step": 1135 }, { "epoch": 4.5600000000000005, "grad_norm": 1.2301644086837769, "learning_rate": 1.3222656952305113e-05, "loss": 1.6773, "step": 1140 }, { "epoch": 4.58, "grad_norm": 1.253196120262146, "learning_rate": 1.3156490369471026e-05, "loss": 1.6793, "step": 1145 }, { "epoch": 4.6, "grad_norm": 1.2421847581863403, "learning_rate": 1.3090169943749475e-05, "loss": 1.6898, "step": 1150 }, { "epoch": 4.62, "grad_norm": 1.237874984741211, "learning_rate": 1.3023698907504447e-05, "loss": 1.6703, "step": 1155 }, { "epoch": 4.64, "grad_norm": 1.2837252616882324, "learning_rate": 1.2957080500440469e-05, "loss": 1.6747, "step": 1160 }, { "epoch": 4.66, "grad_norm": 1.2899264097213745, "learning_rate": 1.2890317969444716e-05, "loss": 1.6826, "step": 1165 }, { "epoch": 4.68, "grad_norm": 1.166990041732788, "learning_rate": 1.2823414568428767e-05, "loss": 1.6687, "step": 1170 }, { "epoch": 4.7, "grad_norm": 1.1460736989974976, "learning_rate": 1.2756373558169992e-05, "loss": 1.6774, "step": 1175 }, { "epoch": 4.72, "grad_norm": 1.2305958271026611, "learning_rate": 1.2689198206152657e-05, "loss": 1.6846, "step": 1180 }, { "epoch": 4.74, "grad_norm": 1.1499146223068237, "learning_rate": 1.2621891786408648e-05, "loss": 1.6738, "step": 1185 }, { "epoch": 4.76, "grad_norm": 1.3568623065948486, "learning_rate": 1.2554457579357906e-05, "loss": 1.6868, "step": 1190 }, { "epoch": 4.78, "grad_norm": 1.7656750679016113, "learning_rate": 1.2486898871648552e-05, "loss": 1.6829, "step": 1195 }, { "epoch": 4.8, "grad_norm": 1.4720224142074585, "learning_rate": 1.2419218955996677e-05, "loss": 1.6739, "step": 1200 }, { "epoch": 4.82, "grad_norm": 1.2218985557556152, "learning_rate": 1.23514211310259e-05, "loss": 1.6699, "step": 1205 }, { "epoch": 4.84, "grad_norm": 1.2355974912643433, "learning_rate": 1.2283508701106559e-05, "loss": 1.669, "step": 1210 }, { "epoch": 4.86, "grad_norm": 1.2267754077911377, "learning_rate": 1.2215484976194675e-05, "loss": 1.6785, "step": 1215 }, { "epoch": 4.88, "grad_norm": 1.3336670398712158, "learning_rate": 1.2147353271670634e-05, "loss": 1.6811, "step": 1220 }, { "epoch": 4.9, "grad_norm": 1.1803137063980103, "learning_rate": 1.2079116908177592e-05, "loss": 1.6802, "step": 1225 }, { "epoch": 4.92, "grad_norm": 2.2649755477905273, "learning_rate": 1.2010779211459649e-05, "loss": 1.6807, "step": 1230 }, { "epoch": 4.9399999999999995, "grad_norm": 1.1885944604873657, "learning_rate": 1.194234351219972e-05, "loss": 1.6784, "step": 1235 }, { "epoch": 4.96, "grad_norm": 1.2544772624969482, "learning_rate": 1.187381314585725e-05, "loss": 1.6766, "step": 1240 }, { "epoch": 4.98, "grad_norm": 1.3752778768539429, "learning_rate": 1.1805191452505602e-05, "loss": 1.6865, "step": 1245 }, { "epoch": 5.0, "grad_norm": 1.2073532342910767, "learning_rate": 1.1736481776669307e-05, "loss": 1.6776, "step": 1250 }, { "epoch": 5.02, "grad_norm": 1.3473936319351196, "learning_rate": 1.1667687467161025e-05, "loss": 1.6418, "step": 1255 }, { "epoch": 5.04, "grad_norm": 1.5514543056488037, "learning_rate": 1.159881187691835e-05, "loss": 1.6321, "step": 1260 }, { "epoch": 5.06, "grad_norm": 1.534508228302002, "learning_rate": 1.1529858362840383e-05, "loss": 1.6177, "step": 1265 }, { "epoch": 5.08, "grad_norm": 1.8792202472686768, "learning_rate": 1.1460830285624119e-05, "loss": 1.6193, "step": 1270 }, { "epoch": 5.1, "grad_norm": 1.6183016300201416, "learning_rate": 1.1391731009600655e-05, "loss": 1.6298, "step": 1275 }, { "epoch": 5.12, "grad_norm": 1.7961289882659912, "learning_rate": 1.1322563902571227e-05, "loss": 1.6209, "step": 1280 }, { "epoch": 5.14, "grad_norm": 1.5732005834579468, "learning_rate": 1.1253332335643043e-05, "loss": 1.6191, "step": 1285 }, { "epoch": 5.16, "grad_norm": 1.7134405374526978, "learning_rate": 1.1184039683065014e-05, "loss": 1.6236, "step": 1290 }, { "epoch": 5.18, "grad_norm": 1.8280987739562988, "learning_rate": 1.1114689322063255e-05, "loss": 1.6228, "step": 1295 }, { "epoch": 5.2, "grad_norm": 1.6293045282363892, "learning_rate": 1.1045284632676535e-05, "loss": 1.6207, "step": 1300 }, { "epoch": 5.22, "grad_norm": 1.6155800819396973, "learning_rate": 1.0975828997591496e-05, "loss": 1.6233, "step": 1305 }, { "epoch": 5.24, "grad_norm": 1.6458238363265991, "learning_rate": 1.0906325801977804e-05, "loss": 1.6252, "step": 1310 }, { "epoch": 5.26, "grad_norm": 1.6302796602249146, "learning_rate": 1.083677843332316e-05, "loss": 1.629, "step": 1315 }, { "epoch": 5.28, "grad_norm": 1.6061460971832275, "learning_rate": 1.0767190281268187e-05, "loss": 1.6288, "step": 1320 }, { "epoch": 5.3, "grad_norm": 1.6623554229736328, "learning_rate": 1.0697564737441254e-05, "loss": 1.6336, "step": 1325 }, { "epoch": 5.32, "grad_norm": 1.5957063436508179, "learning_rate": 1.0627905195293135e-05, "loss": 1.6236, "step": 1330 }, { "epoch": 5.34, "grad_norm": 1.6672245264053345, "learning_rate": 1.055821504993164e-05, "loss": 1.6304, "step": 1335 }, { "epoch": 5.36, "grad_norm": 1.7344614267349243, "learning_rate": 1.0488497697956134e-05, "loss": 1.6276, "step": 1340 }, { "epoch": 5.38, "grad_norm": 1.7076168060302734, "learning_rate": 1.0418756537291996e-05, "loss": 1.637, "step": 1345 }, { "epoch": 5.4, "grad_norm": 1.7165381908416748, "learning_rate": 1.0348994967025012e-05, "loss": 1.6308, "step": 1350 }, { "epoch": 5.42, "grad_norm": 1.69627046585083, "learning_rate": 1.0279216387235691e-05, "loss": 1.6368, "step": 1355 }, { "epoch": 5.44, "grad_norm": 1.8979768753051758, "learning_rate": 1.0209424198833571e-05, "loss": 1.6414, "step": 1360 }, { "epoch": 5.46, "grad_norm": 1.769880771636963, "learning_rate": 1.0139621803391454e-05, "loss": 1.6368, "step": 1365 }, { "epoch": 5.48, "grad_norm": 1.628905177116394, "learning_rate": 1.0069812602979617e-05, "loss": 1.6459, "step": 1370 }, { "epoch": 5.5, "grad_norm": 1.581150770187378, "learning_rate": 1e-05, "loss": 1.6233, "step": 1375 }, { "epoch": 5.52, "grad_norm": 1.5890735387802124, "learning_rate": 9.930187397020385e-06, "loss": 1.6264, "step": 1380 }, { "epoch": 5.54, "grad_norm": 1.6208202838897705, "learning_rate": 9.860378196608549e-06, "loss": 1.6264, "step": 1385 }, { "epoch": 5.5600000000000005, "grad_norm": 1.627545952796936, "learning_rate": 9.790575801166432e-06, "loss": 1.6296, "step": 1390 }, { "epoch": 5.58, "grad_norm": 1.5746124982833862, "learning_rate": 9.720783612764314e-06, "loss": 1.6331, "step": 1395 }, { "epoch": 5.6, "grad_norm": 1.6535131931304932, "learning_rate": 9.651005032974994e-06, "loss": 1.629, "step": 1400 }, { "epoch": 5.62, "grad_norm": 1.559142827987671, "learning_rate": 9.581243462708007e-06, "loss": 1.6187, "step": 1405 }, { "epoch": 5.64, "grad_norm": 1.5918751955032349, "learning_rate": 9.511502302043867e-06, "loss": 1.635, "step": 1410 }, { "epoch": 5.66, "grad_norm": 1.6395398378372192, "learning_rate": 9.441784950068362e-06, "loss": 1.6323, "step": 1415 }, { "epoch": 5.68, "grad_norm": 1.6209341287612915, "learning_rate": 9.372094804706867e-06, "loss": 1.6342, "step": 1420 }, { "epoch": 5.7, "grad_norm": 1.7134851217269897, "learning_rate": 9.302435262558748e-06, "loss": 1.6312, "step": 1425 }, { "epoch": 5.72, "grad_norm": 2.302839994430542, "learning_rate": 9.232809718731815e-06, "loss": 1.6326, "step": 1430 }, { "epoch": 5.74, "grad_norm": 1.6598634719848633, "learning_rate": 9.163221566676847e-06, "loss": 1.6416, "step": 1435 }, { "epoch": 5.76, "grad_norm": 1.692478060722351, "learning_rate": 9.093674198022201e-06, "loss": 1.6366, "step": 1440 }, { "epoch": 5.78, "grad_norm": 1.6801451444625854, "learning_rate": 9.024171002408507e-06, "loss": 1.6338, "step": 1445 }, { "epoch": 5.8, "grad_norm": 1.7073163986206055, "learning_rate": 8.954715367323468e-06, "loss": 1.6325, "step": 1450 }, { "epoch": 5.82, "grad_norm": 1.6709482669830322, "learning_rate": 8.885310677936746e-06, "loss": 1.6296, "step": 1455 }, { "epoch": 5.84, "grad_norm": 1.6847496032714844, "learning_rate": 8.815960316934991e-06, "loss": 1.63, "step": 1460 }, { "epoch": 5.86, "grad_norm": 1.6837451457977295, "learning_rate": 8.746667664356957e-06, "loss": 1.6413, "step": 1465 }, { "epoch": 5.88, "grad_norm": 1.6769607067108154, "learning_rate": 8.677436097428775e-06, "loss": 1.6393, "step": 1470 }, { "epoch": 5.9, "grad_norm": 1.7436274290084839, "learning_rate": 8.60826899039935e-06, "loss": 1.632, "step": 1475 }, { "epoch": 5.92, "grad_norm": 1.6370117664337158, "learning_rate": 8.539169714375885e-06, "loss": 1.6294, "step": 1480 }, { "epoch": 5.9399999999999995, "grad_norm": 1.633933186531067, "learning_rate": 8.47014163715962e-06, "loss": 1.6318, "step": 1485 }, { "epoch": 5.96, "grad_norm": 1.7138166427612305, "learning_rate": 8.401188123081653e-06, "loss": 1.6333, "step": 1490 }, { "epoch": 5.98, "grad_norm": 1.6793943643569946, "learning_rate": 8.332312532838978e-06, "loss": 1.625, "step": 1495 }, { "epoch": 6.0, "grad_norm": 1.6901153326034546, "learning_rate": 8.263518223330698e-06, "loss": 1.6276, "step": 1500 }, { "epoch": 6.02, "grad_norm": 2.6299827098846436, "learning_rate": 8.194808547494401e-06, "loss": 1.4932, "step": 1505 }, { "epoch": 6.04, "grad_norm": 3.6387939453125, "learning_rate": 8.126186854142752e-06, "loss": 1.4582, "step": 1510 }, { "epoch": 6.06, "grad_norm": 2.7163808345794678, "learning_rate": 8.057656487800283e-06, "loss": 1.4205, "step": 1515 }, { "epoch": 6.08, "grad_norm": 3.661618947982788, "learning_rate": 7.989220788540356e-06, "loss": 1.4164, "step": 1520 }, { "epoch": 6.1, "grad_norm": 3.3539645671844482, "learning_rate": 7.92088309182241e-06, "loss": 1.4098, "step": 1525 }, { "epoch": 6.12, "grad_norm": 3.1834163665771484, "learning_rate": 7.852646728329368e-06, "loss": 1.4189, "step": 1530 }, { "epoch": 6.14, "grad_norm": 3.607142448425293, "learning_rate": 7.784515023805328e-06, "loss": 1.4078, "step": 1535 }, { "epoch": 6.16, "grad_norm": 3.505796194076538, "learning_rate": 7.716491298893443e-06, "loss": 1.4103, "step": 1540 }, { "epoch": 6.18, "grad_norm": 3.1681082248687744, "learning_rate": 7.6485788689741e-06, "loss": 1.4122, "step": 1545 }, { "epoch": 6.2, "grad_norm": 3.3361103534698486, "learning_rate": 7.580781044003324e-06, "loss": 1.4108, "step": 1550 }, { "epoch": 6.22, "grad_norm": 3.2785024642944336, "learning_rate": 7.513101128351454e-06, "loss": 1.4101, "step": 1555 }, { "epoch": 6.24, "grad_norm": 3.350015640258789, "learning_rate": 7.445542420642097e-06, "loss": 1.419, "step": 1560 }, { "epoch": 6.26, "grad_norm": 3.3478310108184814, "learning_rate": 7.378108213591355e-06, "loss": 1.4295, "step": 1565 }, { "epoch": 6.28, "grad_norm": 3.2591845989227295, "learning_rate": 7.310801793847344e-06, "loss": 1.4049, "step": 1570 }, { "epoch": 6.3, "grad_norm": 3.5392940044403076, "learning_rate": 7.243626441830009e-06, "loss": 1.4217, "step": 1575 }, { "epoch": 6.32, "grad_norm": 3.395463228225708, "learning_rate": 7.176585431571235e-06, "loss": 1.4155, "step": 1580 }, { "epoch": 6.34, "grad_norm": 3.304579496383667, "learning_rate": 7.109682030555283e-06, "loss": 1.4195, "step": 1585 }, { "epoch": 6.36, "grad_norm": 3.369657039642334, "learning_rate": 7.042919499559538e-06, "loss": 1.4129, "step": 1590 }, { "epoch": 6.38, "grad_norm": 3.4654033184051514, "learning_rate": 6.976301092495556e-06, "loss": 1.4068, "step": 1595 }, { "epoch": 6.4, "grad_norm": 3.3557608127593994, "learning_rate": 6.909830056250527e-06, "loss": 1.4095, "step": 1600 }, { "epoch": 6.42, "grad_norm": 3.554288625717163, "learning_rate": 6.843509630528977e-06, "loss": 1.4188, "step": 1605 }, { "epoch": 6.44, "grad_norm": 3.3676187992095947, "learning_rate": 6.777343047694891e-06, "loss": 1.4055, "step": 1610 }, { "epoch": 6.46, "grad_norm": 3.2888877391815186, "learning_rate": 6.711333532614168e-06, "loss": 1.418, "step": 1615 }, { "epoch": 6.48, "grad_norm": 3.4561023712158203, "learning_rate": 6.645484302497452e-06, "loss": 1.4232, "step": 1620 }, { "epoch": 6.5, "grad_norm": 3.327456474304199, "learning_rate": 6.579798566743314e-06, "loss": 1.4278, "step": 1625 }, { "epoch": 6.52, "grad_norm": 3.2338459491729736, "learning_rate": 6.5142795267818505e-06, "loss": 1.4156, "step": 1630 }, { "epoch": 6.54, "grad_norm": 3.3685622215270996, "learning_rate": 6.448930375918632e-06, "loss": 1.4114, "step": 1635 }, { "epoch": 6.5600000000000005, "grad_norm": 3.458646774291992, "learning_rate": 6.383754299179079e-06, "loss": 1.4156, "step": 1640 }, { "epoch": 6.58, "grad_norm": 3.4090073108673096, "learning_rate": 6.318754473153221e-06, "loss": 1.4046, "step": 1645 }, { "epoch": 6.6, "grad_norm": 3.409212112426758, "learning_rate": 6.25393406584088e-06, "loss": 1.4137, "step": 1650 }, { "epoch": 6.62, "grad_norm": 3.3928050994873047, "learning_rate": 6.18929623649726e-06, "loss": 1.4198, "step": 1655 }, { "epoch": 6.64, "grad_norm": 3.4656596183776855, "learning_rate": 6.124844135478971e-06, "loss": 1.4149, "step": 1660 }, { "epoch": 6.66, "grad_norm": 3.529662847518921, "learning_rate": 6.06058090409049e-06, "loss": 1.4195, "step": 1665 }, { "epoch": 6.68, "grad_norm": 3.4222183227539062, "learning_rate": 5.996509674431053e-06, "loss": 1.4084, "step": 1670 }, { "epoch": 6.7, "grad_norm": 3.577754020690918, "learning_rate": 5.932633569242e-06, "loss": 1.4155, "step": 1675 }, { "epoch": 6.72, "grad_norm": 3.3030552864074707, "learning_rate": 5.868955701754584e-06, "loss": 1.4193, "step": 1680 }, { "epoch": 6.74, "grad_norm": 3.3563826084136963, "learning_rate": 5.8054791755382286e-06, "loss": 1.4242, "step": 1685 }, { "epoch": 6.76, "grad_norm": 3.5344948768615723, "learning_rate": 5.742207084349274e-06, "loss": 1.4201, "step": 1690 }, { "epoch": 6.78, "grad_norm": 3.388164520263672, "learning_rate": 5.679142511980176e-06, "loss": 1.4106, "step": 1695 }, { "epoch": 6.8, "grad_norm": 3.513644218444824, "learning_rate": 5.616288532109225e-06, "loss": 1.4088, "step": 1700 }, { "epoch": 6.82, "grad_norm": 3.5908257961273193, "learning_rate": 5.553648208150728e-06, "loss": 1.414, "step": 1705 }, { "epoch": 6.84, "grad_norm": 3.547783374786377, "learning_rate": 5.491224593105695e-06, "loss": 1.4183, "step": 1710 }, { "epoch": 6.86, "grad_norm": 3.5997567176818848, "learning_rate": 5.429020729413062e-06, "loss": 1.4082, "step": 1715 }, { "epoch": 6.88, "grad_norm": 3.3622734546661377, "learning_rate": 5.367039648801386e-06, "loss": 1.4178, "step": 1720 }, { "epoch": 6.9, "grad_norm": 3.4612014293670654, "learning_rate": 5.305284372141095e-06, "loss": 1.3978, "step": 1725 }, { "epoch": 6.92, "grad_norm": 3.395056962966919, "learning_rate": 5.243757909297247e-06, "loss": 1.4018, "step": 1730 }, { "epoch": 6.9399999999999995, "grad_norm": 3.3911194801330566, "learning_rate": 5.1824632589828465e-06, "loss": 1.4298, "step": 1735 }, { "epoch": 6.96, "grad_norm": 3.307224750518799, "learning_rate": 5.121403408612672e-06, "loss": 1.4006, "step": 1740 }, { "epoch": 6.98, "grad_norm": 3.5052733421325684, "learning_rate": 5.060581334157693e-06, "loss": 1.4161, "step": 1745 }, { "epoch": 7.0, "grad_norm": 3.5450494289398193, "learning_rate": 5.000000000000003e-06, "loss": 1.3935, "step": 1750 }, { "epoch": 7.02, "grad_norm": 4.5564093589782715, "learning_rate": 4.939662358788364e-06, "loss": 1.074, "step": 1755 }, { "epoch": 7.04, "grad_norm": 7.566091060638428, "learning_rate": 4.879571351294287e-06, "loss": 1.0397, "step": 1760 }, { "epoch": 7.06, "grad_norm": 5.21124792098999, "learning_rate": 4.8197299062687e-06, "loss": 1.0191, "step": 1765 }, { "epoch": 7.08, "grad_norm": 5.251906394958496, "learning_rate": 4.76014094029921e-06, "loss": 1.0124, "step": 1770 }, { "epoch": 7.1, "grad_norm": 5.793355464935303, "learning_rate": 4.700807357667953e-06, "loss": 0.9893, "step": 1775 }, { "epoch": 7.12, "grad_norm": 5.138261318206787, "learning_rate": 4.641732050210032e-06, "loss": 0.9938, "step": 1780 }, { "epoch": 7.14, "grad_norm": 5.370635032653809, "learning_rate": 4.582917897172603e-06, "loss": 0.9988, "step": 1785 }, { "epoch": 7.16, "grad_norm": 5.199043273925781, "learning_rate": 4.524367765074499e-06, "loss": 0.9776, "step": 1790 }, { "epoch": 7.18, "grad_norm": 5.165049076080322, "learning_rate": 4.46608450756656e-06, "loss": 0.9923, "step": 1795 }, { "epoch": 7.2, "grad_norm": 5.3443779945373535, "learning_rate": 4.408070965292534e-06, "loss": 0.9809, "step": 1800 }, { "epoch": 7.22, "grad_norm": 5.287759780883789, "learning_rate": 4.350329965750622e-06, "loss": 0.986, "step": 1805 }, { "epoch": 7.24, "grad_norm": 5.187274932861328, "learning_rate": 4.292864323155684e-06, "loss": 1.002, "step": 1810 }, { "epoch": 7.26, "grad_norm": 5.28734016418457, "learning_rate": 4.235676838302069e-06, "loss": 0.9989, "step": 1815 }, { "epoch": 7.28, "grad_norm": 5.318136215209961, "learning_rate": 4.178770298427107e-06, "loss": 0.981, "step": 1820 }, { "epoch": 7.3, "grad_norm": 5.267811298370361, "learning_rate": 4.12214747707527e-06, "loss": 0.9907, "step": 1825 }, { "epoch": 7.32, "grad_norm": 5.451784610748291, "learning_rate": 4.065811133962987e-06, "loss": 0.9826, "step": 1830 }, { "epoch": 7.34, "grad_norm": 5.516955375671387, "learning_rate": 4.009764014844143e-06, "loss": 0.9877, "step": 1835 }, { "epoch": 7.36, "grad_norm": 5.202047348022461, "learning_rate": 3.954008851376252e-06, "loss": 0.9866, "step": 1840 }, { "epoch": 7.38, "grad_norm": 5.0173540115356445, "learning_rate": 3.898548360987325e-06, "loss": 0.9887, "step": 1845 }, { "epoch": 7.4, "grad_norm": 5.356690883636475, "learning_rate": 3.8433852467434175e-06, "loss": 0.9897, "step": 1850 }, { "epoch": 7.42, "grad_norm": 5.419712543487549, "learning_rate": 3.7885221972168974e-06, "loss": 0.988, "step": 1855 }, { "epoch": 7.44, "grad_norm": 5.474433898925781, "learning_rate": 3.7339618863553983e-06, "loss": 1.001, "step": 1860 }, { "epoch": 7.46, "grad_norm": 5.467788219451904, "learning_rate": 3.679706973351491e-06, "loss": 0.978, "step": 1865 }, { "epoch": 7.48, "grad_norm": 5.380366802215576, "learning_rate": 3.625760102513103e-06, "loss": 0.9791, "step": 1870 }, { "epoch": 7.5, "grad_norm": 5.486819744110107, "learning_rate": 3.5721239031346067e-06, "loss": 0.9987, "step": 1875 }, { "epoch": 7.52, "grad_norm": 5.298658847808838, "learning_rate": 3.5188009893686916e-06, "loss": 0.9926, "step": 1880 }, { "epoch": 7.54, "grad_norm": 5.251545429229736, "learning_rate": 3.4657939600989453e-06, "loss": 0.9863, "step": 1885 }, { "epoch": 7.5600000000000005, "grad_norm": 5.376091480255127, "learning_rate": 3.4131053988131947e-06, "loss": 0.9786, "step": 1890 }, { "epoch": 7.58, "grad_norm": 5.347581386566162, "learning_rate": 3.360737873477584e-06, "loss": 0.9853, "step": 1895 }, { "epoch": 7.6, "grad_norm": 5.291326522827148, "learning_rate": 3.308693936411421e-06, "loss": 0.9867, "step": 1900 }, { "epoch": 7.62, "grad_norm": 5.290416240692139, "learning_rate": 3.2569761241627694e-06, "loss": 0.9824, "step": 1905 }, { "epoch": 7.64, "grad_norm": 5.326883792877197, "learning_rate": 3.2055869573848374e-06, "loss": 0.9879, "step": 1910 }, { "epoch": 7.66, "grad_norm": 5.346325397491455, "learning_rate": 3.1545289407131128e-06, "loss": 0.9931, "step": 1915 }, { "epoch": 7.68, "grad_norm": 5.48749303817749, "learning_rate": 3.103804562643302e-06, "loss": 0.9901, "step": 1920 }, { "epoch": 7.7, "grad_norm": 5.999955654144287, "learning_rate": 3.0534162954100264e-06, "loss": 1.0058, "step": 1925 }, { "epoch": 7.72, "grad_norm": 5.255881309509277, "learning_rate": 3.003366594866345e-06, "loss": 0.9939, "step": 1930 }, { "epoch": 7.74, "grad_norm": 5.439182758331299, "learning_rate": 2.953657900364053e-06, "loss": 0.9915, "step": 1935 }, { "epoch": 7.76, "grad_norm": 5.336669445037842, "learning_rate": 2.9042926346347932e-06, "loss": 0.991, "step": 1940 }, { "epoch": 7.78, "grad_norm": 5.281376361846924, "learning_rate": 2.855273203671969e-06, "loss": 0.9847, "step": 1945 }, { "epoch": 7.8, "grad_norm": 5.2418413162231445, "learning_rate": 2.8066019966134907e-06, "loss": 0.9808, "step": 1950 }, { "epoch": 7.82, "grad_norm": 5.544259071350098, "learning_rate": 2.7582813856253276e-06, "loss": 0.9833, "step": 1955 }, { "epoch": 7.84, "grad_norm": 5.195487976074219, "learning_rate": 2.7103137257858867e-06, "loss": 0.9857, "step": 1960 }, { "epoch": 7.86, "grad_norm": 5.333943843841553, "learning_rate": 2.6627013549712355e-06, "loss": 0.9994, "step": 1965 }, { "epoch": 7.88, "grad_norm": 5.322220325469971, "learning_rate": 2.615446593741161e-06, "loss": 0.9742, "step": 1970 }, { "epoch": 7.9, "grad_norm": 5.36502742767334, "learning_rate": 2.5685517452260566e-06, "loss": 0.9775, "step": 1975 }, { "epoch": 7.92, "grad_norm": 5.417782783508301, "learning_rate": 2.522019095014683e-06, "loss": 0.9779, "step": 1980 }, { "epoch": 7.9399999999999995, "grad_norm": 5.225217819213867, "learning_rate": 2.4758509110427576e-06, "loss": 0.9829, "step": 1985 }, { "epoch": 7.96, "grad_norm": 5.371156215667725, "learning_rate": 2.4300494434824373e-06, "loss": 0.9835, "step": 1990 }, { "epoch": 7.98, "grad_norm": 5.273430824279785, "learning_rate": 2.3846169246326345e-06, "loss": 0.9869, "step": 1995 }, { "epoch": 8.0, "grad_norm": 5.407411575317383, "learning_rate": 2.339555568810221e-06, "loss": 0.983, "step": 2000 }, { "epoch": 8.02, "grad_norm": 5.448741436004639, "learning_rate": 2.2948675722421086e-06, "loss": 0.6566, "step": 2005 }, { "epoch": 8.04, "grad_norm": 6.663678169250488, "learning_rate": 2.2505551129582047e-06, "loss": 0.6327, "step": 2010 }, { "epoch": 8.06, "grad_norm": 6.39850378036499, "learning_rate": 2.206620350685257e-06, "loss": 0.6286, "step": 2015 }, { "epoch": 8.08, "grad_norm": 5.739835262298584, "learning_rate": 2.163065426741603e-06, "loss": 0.6437, "step": 2020 }, { "epoch": 8.1, "grad_norm": 5.893033981323242, "learning_rate": 2.119892463932781e-06, "loss": 0.6179, "step": 2025 }, { "epoch": 8.12, "grad_norm": 6.056233882904053, "learning_rate": 2.0771035664480944e-06, "loss": 0.6167, "step": 2030 }, { "epoch": 8.14, "grad_norm": 6.03655481338501, "learning_rate": 2.0347008197580376e-06, "loss": 0.6305, "step": 2035 }, { "epoch": 8.16, "grad_norm": 6.043553352355957, "learning_rate": 1.9926862905126663e-06, "loss": 0.6165, "step": 2040 }, { "epoch": 8.18, "grad_norm": 5.853250503540039, "learning_rate": 1.95106202644086e-06, "loss": 0.6307, "step": 2045 }, { "epoch": 8.2, "grad_norm": 5.7116780281066895, "learning_rate": 1.9098300562505266e-06, "loss": 0.6103, "step": 2050 }, { "epoch": 8.22, "grad_norm": 5.799511432647705, "learning_rate": 1.8689923895297247e-06, "loss": 0.6174, "step": 2055 }, { "epoch": 8.24, "grad_norm": 5.831727504730225, "learning_rate": 1.8285510166487154e-06, "loss": 0.6292, "step": 2060 }, { "epoch": 8.26, "grad_norm": 6.006520748138428, "learning_rate": 1.7885079086629598e-06, "loss": 0.6243, "step": 2065 }, { "epoch": 8.28, "grad_norm": 6.150590419769287, "learning_rate": 1.7488650172170496e-06, "loss": 0.616, "step": 2070 }, { "epoch": 8.3, "grad_norm": 6.151932239532471, "learning_rate": 1.709624274449584e-06, "loss": 0.632, "step": 2075 }, { "epoch": 8.32, "grad_norm": 5.84307336807251, "learning_rate": 1.6707875928990059e-06, "loss": 0.6216, "step": 2080 }, { "epoch": 8.34, "grad_norm": 6.241573810577393, "learning_rate": 1.6323568654103838e-06, "loss": 0.6106, "step": 2085 }, { "epoch": 8.36, "grad_norm": 6.045923233032227, "learning_rate": 1.5943339650431578e-06, "loss": 0.6349, "step": 2090 }, { "epoch": 8.38, "grad_norm": 6.1905598640441895, "learning_rate": 1.5567207449798517e-06, "loss": 0.6338, "step": 2095 }, { "epoch": 8.4, "grad_norm": 5.788094520568848, "learning_rate": 1.5195190384357405e-06, "loss": 0.6191, "step": 2100 }, { "epoch": 8.42, "grad_norm": 5.833618640899658, "learning_rate": 1.4827306585695234e-06, "loss": 0.6309, "step": 2105 }, { "epoch": 8.44, "grad_norm": 6.230561256408691, "learning_rate": 1.446357398394934e-06, "loss": 0.6558, "step": 2110 }, { "epoch": 8.46, "grad_norm": 6.213106632232666, "learning_rate": 1.4104010306933558e-06, "loss": 0.6292, "step": 2115 }, { "epoch": 8.48, "grad_norm": 5.9851975440979, "learning_rate": 1.3748633079274254e-06, "loss": 0.6263, "step": 2120 }, { "epoch": 8.5, "grad_norm": 5.79006290435791, "learning_rate": 1.339745962155613e-06, "loss": 0.6192, "step": 2125 }, { "epoch": 8.52, "grad_norm": 5.929935932159424, "learning_rate": 1.30505070494781e-06, "loss": 0.6353, "step": 2130 }, { "epoch": 8.54, "grad_norm": 6.356067180633545, "learning_rate": 1.2707792273019049e-06, "loss": 0.624, "step": 2135 }, { "epoch": 8.56, "grad_norm": 6.018552303314209, "learning_rate": 1.2369331995613664e-06, "loss": 0.6158, "step": 2140 }, { "epoch": 8.58, "grad_norm": 6.076599597930908, "learning_rate": 1.2035142713338366e-06, "loss": 0.6242, "step": 2145 }, { "epoch": 8.6, "grad_norm": 5.8761701583862305, "learning_rate": 1.1705240714107301e-06, "loss": 0.616, "step": 2150 }, { "epoch": 8.62, "grad_norm": 5.857326984405518, "learning_rate": 1.1379642076878528e-06, "loss": 0.6187, "step": 2155 }, { "epoch": 8.64, "grad_norm": 6.0746636390686035, "learning_rate": 1.1058362670870248e-06, "loss": 0.6359, "step": 2160 }, { "epoch": 8.66, "grad_norm": 6.17335844039917, "learning_rate": 1.0741418154787443e-06, "loss": 0.6234, "step": 2165 }, { "epoch": 8.68, "grad_norm": 6.193073749542236, "learning_rate": 1.042882397605871e-06, "loss": 0.6144, "step": 2170 }, { "epoch": 8.7, "grad_norm": 5.796996593475342, "learning_rate": 1.012059537008332e-06, "loss": 0.6187, "step": 2175 }, { "epoch": 8.72, "grad_norm": 6.0771164894104, "learning_rate": 9.816747359488632e-07, "loss": 0.6352, "step": 2180 }, { "epoch": 8.74, "grad_norm": 5.912444114685059, "learning_rate": 9.517294753398066e-07, "loss": 0.6201, "step": 2185 }, { "epoch": 8.76, "grad_norm": 5.874046325683594, "learning_rate": 9.222252146709143e-07, "loss": 0.6308, "step": 2190 }, { "epoch": 8.78, "grad_norm": 6.162435054779053, "learning_rate": 8.931633919382299e-07, "loss": 0.6282, "step": 2195 }, { "epoch": 8.8, "grad_norm": 6.2155914306640625, "learning_rate": 8.645454235739903e-07, "loss": 0.6188, "step": 2200 }, { "epoch": 8.82, "grad_norm": 6.127189636230469, "learning_rate": 8.363727043776037e-07, "loss": 0.6267, "step": 2205 }, { "epoch": 8.84, "grad_norm": 6.407212734222412, "learning_rate": 8.086466074476562e-07, "loss": 0.6268, "step": 2210 }, { "epoch": 8.86, "grad_norm": 6.001618385314941, "learning_rate": 7.81368484114996e-07, "loss": 0.6274, "step": 2215 }, { "epoch": 8.88, "grad_norm": 6.074345111846924, "learning_rate": 7.545396638768698e-07, "loss": 0.6183, "step": 2220 }, { "epoch": 8.9, "grad_norm": 6.457765579223633, "learning_rate": 7.281614543321269e-07, "loss": 0.6112, "step": 2225 }, { "epoch": 8.92, "grad_norm": 5.76880407333374, "learning_rate": 7.022351411174866e-07, "loss": 0.6086, "step": 2230 }, { "epoch": 8.94, "grad_norm": 6.315970420837402, "learning_rate": 6.767619878448783e-07, "loss": 0.6381, "step": 2235 }, { "epoch": 8.96, "grad_norm": 5.965588569641113, "learning_rate": 6.517432360398556e-07, "loss": 0.6116, "step": 2240 }, { "epoch": 8.98, "grad_norm": 5.967585563659668, "learning_rate": 6.271801050810856e-07, "loss": 0.643, "step": 2245 }, { "epoch": 9.0, "grad_norm": 6.2677226066589355, "learning_rate": 6.030737921409169e-07, "loss": 0.62, "step": 2250 }, { "epoch": 9.02, "grad_norm": 5.486161231994629, "learning_rate": 5.794254721270331e-07, "loss": 0.464, "step": 2255 }, { "epoch": 9.04, "grad_norm": 5.577027797698975, "learning_rate": 5.562362976251901e-07, "loss": 0.4631, "step": 2260 }, { "epoch": 9.06, "grad_norm": 5.71274995803833, "learning_rate": 5.335073988430373e-07, "loss": 0.4612, "step": 2265 }, { "epoch": 9.08, "grad_norm": 5.8267083168029785, "learning_rate": 5.112398835550348e-07, "loss": 0.4524, "step": 2270 }, { "epoch": 9.1, "grad_norm": 5.931846618652344, "learning_rate": 4.894348370484648e-07, "loss": 0.4674, "step": 2275 }, { "epoch": 9.12, "grad_norm": 5.749095916748047, "learning_rate": 4.6809332207053083e-07, "loss": 0.4544, "step": 2280 }, { "epoch": 9.14, "grad_norm": 6.270796298980713, "learning_rate": 4.4721637877656377e-07, "loss": 0.4581, "step": 2285 }, { "epoch": 9.16, "grad_norm": 5.706393718719482, "learning_rate": 4.268050246793276e-07, "loss": 0.4534, "step": 2290 }, { "epoch": 9.18, "grad_norm": 5.894087791442871, "learning_rate": 4.068602545994249e-07, "loss": 0.456, "step": 2295 }, { "epoch": 9.2, "grad_norm": 5.531440734863281, "learning_rate": 3.8738304061681107e-07, "loss": 0.4561, "step": 2300 }, { "epoch": 9.22, "grad_norm": 5.7872314453125, "learning_rate": 3.68374332023419e-07, "loss": 0.4683, "step": 2305 }, { "epoch": 9.24, "grad_norm": 5.619744777679443, "learning_rate": 3.498350552768859e-07, "loss": 0.4468, "step": 2310 }, { "epoch": 9.26, "grad_norm": 5.65980863571167, "learning_rate": 3.3176611395540625e-07, "loss": 0.4558, "step": 2315 }, { "epoch": 9.28, "grad_norm": 5.706433296203613, "learning_rate": 3.1416838871368925e-07, "loss": 0.451, "step": 2320 }, { "epoch": 9.3, "grad_norm": 5.564632892608643, "learning_rate": 2.970427372400353e-07, "loss": 0.4597, "step": 2325 }, { "epoch": 9.32, "grad_norm": 5.819154262542725, "learning_rate": 2.8038999421453827e-07, "loss": 0.4589, "step": 2330 }, { "epoch": 9.34, "grad_norm": 5.930497169494629, "learning_rate": 2.6421097126839714e-07, "loss": 0.4668, "step": 2335 }, { "epoch": 9.36, "grad_norm": 5.8165059089660645, "learning_rate": 2.4850645694436736e-07, "loss": 0.4481, "step": 2340 }, { "epoch": 9.38, "grad_norm": 5.936621189117432, "learning_rate": 2.332772166583208e-07, "loss": 0.4527, "step": 2345 }, { "epoch": 9.4, "grad_norm": 5.787903308868408, "learning_rate": 2.1852399266194312e-07, "loss": 0.4645, "step": 2350 }, { "epoch": 9.42, "grad_norm": 6.234328269958496, "learning_rate": 2.0424750400655947e-07, "loss": 0.4722, "step": 2355 }, { "epoch": 9.44, "grad_norm": 5.86660623550415, "learning_rate": 1.9044844650808468e-07, "loss": 0.4692, "step": 2360 }, { "epoch": 9.46, "grad_norm": 6.366551876068115, "learning_rate": 1.7712749271311392e-07, "loss": 0.4828, "step": 2365 }, { "epoch": 9.48, "grad_norm": 5.825527191162109, "learning_rate": 1.6428529186614195e-07, "loss": 0.4536, "step": 2370 }, { "epoch": 9.5, "grad_norm": 5.744025230407715, "learning_rate": 1.519224698779198e-07, "loss": 0.4635, "step": 2375 }, { "epoch": 9.52, "grad_norm": 6.037289142608643, "learning_rate": 1.400396292949513e-07, "loss": 0.4679, "step": 2380 }, { "epoch": 9.54, "grad_norm": 5.628565311431885, "learning_rate": 1.2863734927012094e-07, "loss": 0.48, "step": 2385 }, { "epoch": 9.56, "grad_norm": 6.118548393249512, "learning_rate": 1.1771618553447217e-07, "loss": 0.4501, "step": 2390 }, { "epoch": 9.58, "grad_norm": 5.726400375366211, "learning_rate": 1.0727667037011668e-07, "loss": 0.4647, "step": 2395 }, { "epoch": 9.6, "grad_norm": 5.941714286804199, "learning_rate": 9.731931258429638e-08, "loss": 0.4756, "step": 2400 }, { "epoch": 9.62, "grad_norm": 5.818594455718994, "learning_rate": 8.784459748458318e-08, "loss": 0.4471, "step": 2405 }, { "epoch": 9.64, "grad_norm": 5.669724464416504, "learning_rate": 7.885298685522235e-08, "loss": 0.4629, "step": 2410 }, { "epoch": 9.66, "grad_norm": 6.057743549346924, "learning_rate": 7.034491893463059e-08, "loss": 0.4573, "step": 2415 }, { "epoch": 9.68, "grad_norm": 5.836614608764648, "learning_rate": 6.232080839403631e-08, "loss": 0.4519, "step": 2420 }, { "epoch": 9.7, "grad_norm": 5.849831581115723, "learning_rate": 5.4781046317267103e-08, "loss": 0.4679, "step": 2425 }, { "epoch": 9.72, "grad_norm": 5.624325275421143, "learning_rate": 4.772600018168816e-08, "loss": 0.4458, "step": 2430 }, { "epoch": 9.74, "grad_norm": 5.884892463684082, "learning_rate": 4.115601384029666e-08, "loss": 0.4679, "step": 2435 }, { "epoch": 9.76, "grad_norm": 5.915401935577393, "learning_rate": 3.50714075049563e-08, "loss": 0.4603, "step": 2440 }, { "epoch": 9.78, "grad_norm": 5.965377330780029, "learning_rate": 2.947247773079753e-08, "loss": 0.4726, "step": 2445 }, { "epoch": 9.8, "grad_norm": 6.21063232421875, "learning_rate": 2.4359497401758026e-08, "loss": 0.4775, "step": 2450 }, { "epoch": 9.82, "grad_norm": 5.652218341827393, "learning_rate": 1.973271571728441e-08, "loss": 0.435, "step": 2455 }, { "epoch": 9.84, "grad_norm": 5.780144214630127, "learning_rate": 1.5592358180189782e-08, "loss": 0.4834, "step": 2460 }, { "epoch": 9.86, "grad_norm": 5.7750563621521, "learning_rate": 1.1938626585660252e-08, "loss": 0.463, "step": 2465 }, { "epoch": 9.88, "grad_norm": 5.517821788787842, "learning_rate": 8.771699011416169e-09, "loss": 0.4486, "step": 2470 }, { "epoch": 9.9, "grad_norm": 5.808730602264404, "learning_rate": 6.091729809042379e-09, "loss": 0.468, "step": 2475 }, { "epoch": 9.92, "grad_norm": 5.867649555206299, "learning_rate": 3.898849596456477e-09, "loss": 0.4439, "step": 2480 }, { "epoch": 9.94, "grad_norm": 6.076911926269531, "learning_rate": 2.193165251545004e-09, "loss": 0.455, "step": 2485 }, { "epoch": 9.96, "grad_norm": 5.9915971755981445, "learning_rate": 9.74759906957612e-10, "loss": 0.4597, "step": 2490 }, { "epoch": 9.98, "grad_norm": 5.625442028045654, "learning_rate": 2.436929460525317e-10, "loss": 0.4575, "step": 2495 }, { "epoch": 10.0, "grad_norm": 5.879223346710205, "learning_rate": 0.0, "loss": 0.4807, "step": 2500 }, { "epoch": 10.0, "step": 2500, "total_flos": 2.880737320506491e+17, "train_loss": 1.3902658111572266, "train_runtime": 2102.4841, "train_samples_per_second": 19.025, "train_steps_per_second": 1.189 } ], "logging_steps": 5, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.880737320506491e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }