{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.0, "eval_steps": 200, "global_step": 2925, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03076923076923077, "grad_norm": 24.726886749267578, "learning_rate": 1.8e-06, "loss": 11.4221, "step": 10 }, { "epoch": 0.06153846153846154, "grad_norm": 17.795185089111328, "learning_rate": 3.8e-06, "loss": 10.4941, "step": 20 }, { "epoch": 0.09230769230769231, "grad_norm": 14.8720703125, "learning_rate": 5.8e-06, "loss": 10.943, "step": 30 }, { "epoch": 0.12307692307692308, "grad_norm": 26.370025634765625, "learning_rate": 7.8e-06, "loss": 9.7676, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 25.46526527404785, "learning_rate": 9.800000000000001e-06, "loss": 8.0724, "step": 50 }, { "epoch": 0.18461538461538463, "grad_norm": 31.001745223999023, "learning_rate": 1.18e-05, "loss": 6.9668, "step": 60 }, { "epoch": 0.2153846153846154, "grad_norm": 9.090025901794434, "learning_rate": 1.3800000000000002e-05, "loss": 5.0298, "step": 70 }, { "epoch": 0.24615384615384617, "grad_norm": 4.529256820678711, "learning_rate": 1.58e-05, "loss": 3.5286, "step": 80 }, { "epoch": 0.27692307692307694, "grad_norm": 2.912689685821533, "learning_rate": 1.78e-05, "loss": 2.9783, "step": 90 }, { "epoch": 0.3076923076923077, "grad_norm": 2.04130220413208, "learning_rate": 1.9800000000000004e-05, "loss": 2.5693, "step": 100 }, { "epoch": 0.3384615384615385, "grad_norm": 1.7645025253295898, "learning_rate": 2.18e-05, "loss": 2.3032, "step": 110 }, { "epoch": 0.36923076923076925, "grad_norm": 1.4153923988342285, "learning_rate": 2.38e-05, "loss": 2.1182, "step": 120 }, { "epoch": 0.4, "grad_norm": 2.1566500663757324, "learning_rate": 2.58e-05, "loss": 2.2848, "step": 130 }, { "epoch": 0.4307692307692308, "grad_norm": 1.7678470611572266, "learning_rate": 2.7800000000000005e-05, "loss": 2.2928, "step": 140 }, { "epoch": 0.46153846153846156, "grad_norm": 1.717806100845337, "learning_rate": 2.98e-05, "loss": 2.0866, "step": 150 }, { "epoch": 0.49230769230769234, "grad_norm": 1.8488136529922485, "learning_rate": 3.18e-05, "loss": 2.2323, "step": 160 }, { "epoch": 0.5230769230769231, "grad_norm": 1.8968263864517212, "learning_rate": 3.38e-05, "loss": 2.2183, "step": 170 }, { "epoch": 0.5538461538461539, "grad_norm": 1.398100733757019, "learning_rate": 3.58e-05, "loss": 1.8395, "step": 180 }, { "epoch": 0.5846153846153846, "grad_norm": 1.5084631443023682, "learning_rate": 3.7800000000000004e-05, "loss": 1.9694, "step": 190 }, { "epoch": 0.6153846153846154, "grad_norm": 1.1019172668457031, "learning_rate": 3.9800000000000005e-05, "loss": 1.9702, "step": 200 }, { "epoch": 0.6153846153846154, "eval_gen_len": 186.9088, "eval_loss": 1.8701356649398804, "eval_rouge1": 0.5735, "eval_rouge2": 0.2231, "eval_rougeL": 0.4346, "eval_runtime": 33.3119, "eval_samples_per_second": 8.225, "eval_steps_per_second": 2.071, "step": 200 }, { "epoch": 0.6461538461538462, "grad_norm": 1.5155857801437378, "learning_rate": 4.18e-05, "loss": 2.0705, "step": 210 }, { "epoch": 0.676923076923077, "grad_norm": 1.3338611125946045, "learning_rate": 4.38e-05, "loss": 2.051, "step": 220 }, { "epoch": 0.7076923076923077, "grad_norm": 1.2012193202972412, "learning_rate": 4.58e-05, "loss": 1.9834, "step": 230 }, { "epoch": 0.7384615384615385, "grad_norm": 1.527007818222046, "learning_rate": 4.78e-05, "loss": 2.0451, "step": 240 }, { "epoch": 0.7692307692307693, "grad_norm": 1.2146987915039062, "learning_rate": 4.9800000000000004e-05, "loss": 1.974, "step": 250 }, { "epoch": 0.8, "grad_norm": 1.7576699256896973, "learning_rate": 5.1800000000000005e-05, "loss": 2.0347, "step": 260 }, { "epoch": 0.8307692307692308, "grad_norm": 1.17750084400177, "learning_rate": 5.380000000000001e-05, "loss": 1.9763, "step": 270 }, { "epoch": 0.8615384615384616, "grad_norm": 1.3267815113067627, "learning_rate": 5.580000000000001e-05, "loss": 1.842, "step": 280 }, { "epoch": 0.8923076923076924, "grad_norm": 1.0520875453948975, "learning_rate": 5.7799999999999995e-05, "loss": 2.0525, "step": 290 }, { "epoch": 0.9230769230769231, "grad_norm": 1.4701600074768066, "learning_rate": 5.9800000000000003e-05, "loss": 1.7418, "step": 300 }, { "epoch": 0.9538461538461539, "grad_norm": 1.062267780303955, "learning_rate": 6.18e-05, "loss": 1.9685, "step": 310 }, { "epoch": 0.9846153846153847, "grad_norm": 1.2724727392196655, "learning_rate": 6.38e-05, "loss": 1.7972, "step": 320 }, { "epoch": 1.0153846153846153, "grad_norm": 1.2834393978118896, "learning_rate": 6.58e-05, "loss": 1.8395, "step": 330 }, { "epoch": 1.0461538461538462, "grad_norm": 0.9657095074653625, "learning_rate": 6.780000000000001e-05, "loss": 1.836, "step": 340 }, { "epoch": 1.0769230769230769, "grad_norm": 1.0390011072158813, "learning_rate": 6.98e-05, "loss": 1.9328, "step": 350 }, { "epoch": 1.1076923076923078, "grad_norm": 1.2896322011947632, "learning_rate": 7.18e-05, "loss": 1.8227, "step": 360 }, { "epoch": 1.1384615384615384, "grad_norm": 1.53290593624115, "learning_rate": 7.38e-05, "loss": 1.9214, "step": 370 }, { "epoch": 1.1692307692307693, "grad_norm": 1.0145893096923828, "learning_rate": 7.58e-05, "loss": 1.8295, "step": 380 }, { "epoch": 1.2, "grad_norm": 1.2127511501312256, "learning_rate": 7.780000000000001e-05, "loss": 1.7774, "step": 390 }, { "epoch": 1.2307692307692308, "grad_norm": 1.1971853971481323, "learning_rate": 7.98e-05, "loss": 1.9926, "step": 400 }, { "epoch": 1.2307692307692308, "eval_gen_len": 187.0146, "eval_loss": 1.7412512302398682, "eval_rouge1": 0.5961, "eval_rouge2": 0.2459, "eval_rougeL": 0.4577, "eval_runtime": 33.1213, "eval_samples_per_second": 8.273, "eval_steps_per_second": 2.083, "step": 400 }, { "epoch": 1.2615384615384615, "grad_norm": 1.107351303100586, "learning_rate": 8.18e-05, "loss": 1.8515, "step": 410 }, { "epoch": 1.2923076923076924, "grad_norm": 1.1407504081726074, "learning_rate": 8.38e-05, "loss": 1.7011, "step": 420 }, { "epoch": 1.323076923076923, "grad_norm": 1.418338656425476, "learning_rate": 8.58e-05, "loss": 1.6384, "step": 430 }, { "epoch": 1.353846153846154, "grad_norm": 1.3715286254882812, "learning_rate": 8.78e-05, "loss": 1.8502, "step": 440 }, { "epoch": 1.3846153846153846, "grad_norm": 0.9774390459060669, "learning_rate": 8.98e-05, "loss": 1.8264, "step": 450 }, { "epoch": 1.4153846153846155, "grad_norm": 1.4778176546096802, "learning_rate": 9.180000000000001e-05, "loss": 1.694, "step": 460 }, { "epoch": 1.4461538461538461, "grad_norm": 1.2721563577651978, "learning_rate": 9.38e-05, "loss": 1.8213, "step": 470 }, { "epoch": 1.476923076923077, "grad_norm": 0.94813472032547, "learning_rate": 9.58e-05, "loss": 1.6636, "step": 480 }, { "epoch": 1.5076923076923077, "grad_norm": 1.0905983448028564, "learning_rate": 9.78e-05, "loss": 1.7712, "step": 490 }, { "epoch": 1.5384615384615383, "grad_norm": 1.1593286991119385, "learning_rate": 9.98e-05, "loss": 1.808, "step": 500 }, { "epoch": 1.5692307692307692, "grad_norm": 1.0593713521957397, "learning_rate": 9.967272727272727e-05, "loss": 1.82, "step": 510 }, { "epoch": 1.6, "grad_norm": 0.941973865032196, "learning_rate": 9.930909090909092e-05, "loss": 1.7341, "step": 520 }, { "epoch": 1.6307692307692307, "grad_norm": 0.80891352891922, "learning_rate": 9.894545454545455e-05, "loss": 1.6166, "step": 530 }, { "epoch": 1.6615384615384614, "grad_norm": 1.0325396060943604, "learning_rate": 9.858181818181819e-05, "loss": 1.8333, "step": 540 }, { "epoch": 1.6923076923076923, "grad_norm": 1.3042590618133545, "learning_rate": 9.821818181818182e-05, "loss": 1.6287, "step": 550 }, { "epoch": 1.7230769230769232, "grad_norm": 1.475900650024414, "learning_rate": 9.785454545454545e-05, "loss": 1.6019, "step": 560 }, { "epoch": 1.7538461538461538, "grad_norm": 1.1589939594268799, "learning_rate": 9.74909090909091e-05, "loss": 1.6904, "step": 570 }, { "epoch": 1.7846153846153845, "grad_norm": 1.2714788913726807, "learning_rate": 9.712727272727274e-05, "loss": 1.7928, "step": 580 }, { "epoch": 1.8153846153846154, "grad_norm": 1.2037074565887451, "learning_rate": 9.676363636363637e-05, "loss": 1.8325, "step": 590 }, { "epoch": 1.8461538461538463, "grad_norm": 1.1115801334381104, "learning_rate": 9.64e-05, "loss": 1.7673, "step": 600 }, { "epoch": 1.8461538461538463, "eval_gen_len": 187.0146, "eval_loss": 1.69492506980896, "eval_rouge1": 0.6004, "eval_rouge2": 0.2505, "eval_rougeL": 0.4658, "eval_runtime": 33.2205, "eval_samples_per_second": 8.248, "eval_steps_per_second": 2.077, "step": 600 }, { "epoch": 1.876923076923077, "grad_norm": 1.2008461952209473, "learning_rate": 9.603636363636364e-05, "loss": 1.7674, "step": 610 }, { "epoch": 1.9076923076923076, "grad_norm": 1.1482900381088257, "learning_rate": 9.567272727272729e-05, "loss": 1.7932, "step": 620 }, { "epoch": 1.9384615384615385, "grad_norm": 1.0144352912902832, "learning_rate": 9.530909090909092e-05, "loss": 1.6315, "step": 630 }, { "epoch": 1.9692307692307693, "grad_norm": 0.9276631474494934, "learning_rate": 9.494545454545455e-05, "loss": 1.8373, "step": 640 }, { "epoch": 2.0, "grad_norm": 1.0593888759613037, "learning_rate": 9.458181818181819e-05, "loss": 1.7967, "step": 650 }, { "epoch": 2.0307692307692307, "grad_norm": 1.1406164169311523, "learning_rate": 9.421818181818183e-05, "loss": 1.7668, "step": 660 }, { "epoch": 2.0615384615384613, "grad_norm": 0.9809508919715881, "learning_rate": 9.385454545454546e-05, "loss": 1.6602, "step": 670 }, { "epoch": 2.0923076923076924, "grad_norm": 1.1698426008224487, "learning_rate": 9.349090909090909e-05, "loss": 1.6775, "step": 680 }, { "epoch": 2.123076923076923, "grad_norm": 1.1539372205734253, "learning_rate": 9.312727272727274e-05, "loss": 1.533, "step": 690 }, { "epoch": 2.1538461538461537, "grad_norm": 1.0783981084823608, "learning_rate": 9.276363636363637e-05, "loss": 1.5243, "step": 700 }, { "epoch": 2.184615384615385, "grad_norm": 1.262705683708191, "learning_rate": 9.240000000000001e-05, "loss": 1.6625, "step": 710 }, { "epoch": 2.2153846153846155, "grad_norm": 1.1545718908309937, "learning_rate": 9.203636363636364e-05, "loss": 1.9172, "step": 720 }, { "epoch": 2.246153846153846, "grad_norm": 0.9896947741508484, "learning_rate": 9.167272727272728e-05, "loss": 1.5449, "step": 730 }, { "epoch": 2.276923076923077, "grad_norm": 1.063262701034546, "learning_rate": 9.130909090909091e-05, "loss": 1.5318, "step": 740 }, { "epoch": 2.3076923076923075, "grad_norm": 1.0846728086471558, "learning_rate": 9.094545454545454e-05, "loss": 1.5875, "step": 750 }, { "epoch": 2.3384615384615386, "grad_norm": 1.0437549352645874, "learning_rate": 9.058181818181819e-05, "loss": 1.5724, "step": 760 }, { "epoch": 2.3692307692307693, "grad_norm": 1.050115942955017, "learning_rate": 9.021818181818183e-05, "loss": 1.662, "step": 770 }, { "epoch": 2.4, "grad_norm": 0.9663347601890564, "learning_rate": 8.985454545454546e-05, "loss": 1.6284, "step": 780 }, { "epoch": 2.430769230769231, "grad_norm": 1.1656932830810547, "learning_rate": 8.949090909090909e-05, "loss": 1.5995, "step": 790 }, { "epoch": 2.4615384615384617, "grad_norm": 1.073716402053833, "learning_rate": 8.912727272727273e-05, "loss": 1.4811, "step": 800 }, { "epoch": 2.4615384615384617, "eval_gen_len": 187.0146, "eval_loss": 1.6769312620162964, "eval_rouge1": 0.6042, "eval_rouge2": 0.2561, "eval_rougeL": 0.4686, "eval_runtime": 33.5273, "eval_samples_per_second": 8.172, "eval_steps_per_second": 2.058, "step": 800 }, { "epoch": 2.4923076923076923, "grad_norm": 0.9764583110809326, "learning_rate": 8.876363636363638e-05, "loss": 1.5478, "step": 810 }, { "epoch": 2.523076923076923, "grad_norm": 0.9336417317390442, "learning_rate": 8.840000000000001e-05, "loss": 1.5138, "step": 820 }, { "epoch": 2.5538461538461537, "grad_norm": 0.9714758992195129, "learning_rate": 8.803636363636364e-05, "loss": 1.5506, "step": 830 }, { "epoch": 2.5846153846153848, "grad_norm": 0.9208464622497559, "learning_rate": 8.767272727272727e-05, "loss": 1.4944, "step": 840 }, { "epoch": 2.6153846153846154, "grad_norm": 1.0252026319503784, "learning_rate": 8.730909090909092e-05, "loss": 1.6991, "step": 850 }, { "epoch": 2.646153846153846, "grad_norm": 1.0464015007019043, "learning_rate": 8.694545454545455e-05, "loss": 1.679, "step": 860 }, { "epoch": 2.676923076923077, "grad_norm": 1.3673149347305298, "learning_rate": 8.658181818181818e-05, "loss": 1.5021, "step": 870 }, { "epoch": 2.707692307692308, "grad_norm": 1.1350778341293335, "learning_rate": 8.621818181818181e-05, "loss": 1.5898, "step": 880 }, { "epoch": 2.7384615384615385, "grad_norm": 0.9916401505470276, "learning_rate": 8.585454545454546e-05, "loss": 1.6542, "step": 890 }, { "epoch": 2.769230769230769, "grad_norm": 0.9967766404151917, "learning_rate": 8.54909090909091e-05, "loss": 1.7056, "step": 900 }, { "epoch": 2.8, "grad_norm": 1.2031991481781006, "learning_rate": 8.512727272727273e-05, "loss": 1.6856, "step": 910 }, { "epoch": 2.830769230769231, "grad_norm": 1.0159794092178345, "learning_rate": 8.476363636363636e-05, "loss": 1.5293, "step": 920 }, { "epoch": 2.8615384615384616, "grad_norm": 1.3572866916656494, "learning_rate": 8.44e-05, "loss": 1.6191, "step": 930 }, { "epoch": 2.8923076923076922, "grad_norm": 1.2567291259765625, "learning_rate": 8.403636363636364e-05, "loss": 1.7504, "step": 940 }, { "epoch": 2.9230769230769234, "grad_norm": 1.2280553579330444, "learning_rate": 8.367272727272728e-05, "loss": 1.6523, "step": 950 }, { "epoch": 2.953846153846154, "grad_norm": 1.0409953594207764, "learning_rate": 8.330909090909091e-05, "loss": 1.5903, "step": 960 }, { "epoch": 2.9846153846153847, "grad_norm": 1.10386061668396, "learning_rate": 8.294545454545455e-05, "loss": 1.5235, "step": 970 }, { "epoch": 3.0153846153846153, "grad_norm": 1.0341882705688477, "learning_rate": 8.258181818181818e-05, "loss": 1.6025, "step": 980 }, { "epoch": 3.046153846153846, "grad_norm": 1.3020343780517578, "learning_rate": 8.221818181818183e-05, "loss": 1.4696, "step": 990 }, { "epoch": 3.076923076923077, "grad_norm": 1.104643702507019, "learning_rate": 8.185454545454546e-05, "loss": 1.4009, "step": 1000 }, { "epoch": 3.076923076923077, "eval_gen_len": 187.0146, "eval_loss": 1.6721168756484985, "eval_rouge1": 0.6044, "eval_rouge2": 0.2558, "eval_rougeL": 0.4692, "eval_runtime": 34.4765, "eval_samples_per_second": 7.947, "eval_steps_per_second": 2.001, "step": 1000 }, { "epoch": 3.1076923076923078, "grad_norm": 1.1327263116836548, "learning_rate": 8.14909090909091e-05, "loss": 1.5893, "step": 1010 }, { "epoch": 3.1384615384615384, "grad_norm": 1.168095350265503, "learning_rate": 8.112727272727273e-05, "loss": 1.4248, "step": 1020 }, { "epoch": 3.169230769230769, "grad_norm": 0.9978489279747009, "learning_rate": 8.076363636363636e-05, "loss": 1.5407, "step": 1030 }, { "epoch": 3.2, "grad_norm": 1.0370062589645386, "learning_rate": 8.04e-05, "loss": 1.4867, "step": 1040 }, { "epoch": 3.230769230769231, "grad_norm": 0.9647369384765625, "learning_rate": 8.003636363636365e-05, "loss": 1.4806, "step": 1050 }, { "epoch": 3.2615384615384615, "grad_norm": 1.3316948413848877, "learning_rate": 7.967272727272728e-05, "loss": 1.4612, "step": 1060 }, { "epoch": 3.292307692307692, "grad_norm": 1.37971830368042, "learning_rate": 7.93090909090909e-05, "loss": 1.5745, "step": 1070 }, { "epoch": 3.3230769230769233, "grad_norm": 1.1220242977142334, "learning_rate": 7.894545454545455e-05, "loss": 1.3228, "step": 1080 }, { "epoch": 3.353846153846154, "grad_norm": 1.0595531463623047, "learning_rate": 7.85818181818182e-05, "loss": 1.4618, "step": 1090 }, { "epoch": 3.3846153846153846, "grad_norm": 1.3739666938781738, "learning_rate": 7.821818181818182e-05, "loss": 1.4973, "step": 1100 }, { "epoch": 3.4153846153846152, "grad_norm": 1.2643866539001465, "learning_rate": 7.785454545454545e-05, "loss": 1.533, "step": 1110 }, { "epoch": 3.4461538461538463, "grad_norm": 1.232230544090271, "learning_rate": 7.74909090909091e-05, "loss": 1.4867, "step": 1120 }, { "epoch": 3.476923076923077, "grad_norm": 0.9712868332862854, "learning_rate": 7.712727272727273e-05, "loss": 1.4916, "step": 1130 }, { "epoch": 3.5076923076923077, "grad_norm": 1.3414063453674316, "learning_rate": 7.676363636363637e-05, "loss": 1.5992, "step": 1140 }, { "epoch": 3.5384615384615383, "grad_norm": 1.0222588777542114, "learning_rate": 7.64e-05, "loss": 1.5378, "step": 1150 }, { "epoch": 3.569230769230769, "grad_norm": 1.1905276775360107, "learning_rate": 7.603636363636364e-05, "loss": 1.4324, "step": 1160 }, { "epoch": 3.6, "grad_norm": 1.2846956253051758, "learning_rate": 7.567272727272727e-05, "loss": 1.4927, "step": 1170 }, { "epoch": 3.6307692307692307, "grad_norm": 1.165310025215149, "learning_rate": 7.530909090909092e-05, "loss": 1.4277, "step": 1180 }, { "epoch": 3.6615384615384614, "grad_norm": 1.063883900642395, "learning_rate": 7.494545454545455e-05, "loss": 1.5226, "step": 1190 }, { "epoch": 3.6923076923076925, "grad_norm": 1.3430577516555786, "learning_rate": 7.458181818181819e-05, "loss": 1.5315, "step": 1200 }, { "epoch": 3.6923076923076925, "eval_gen_len": 187.0146, "eval_loss": 1.6592342853546143, "eval_rouge1": 0.6057, "eval_rouge2": 0.2572, "eval_rougeL": 0.4709, "eval_runtime": 33.3732, "eval_samples_per_second": 8.21, "eval_steps_per_second": 2.068, "step": 1200 }, { "epoch": 3.723076923076923, "grad_norm": 1.2028673887252808, "learning_rate": 7.421818181818182e-05, "loss": 1.5135, "step": 1210 }, { "epoch": 3.753846153846154, "grad_norm": 0.9091282486915588, "learning_rate": 7.385454545454545e-05, "loss": 1.3876, "step": 1220 }, { "epoch": 3.7846153846153845, "grad_norm": 0.9549902677536011, "learning_rate": 7.34909090909091e-05, "loss": 1.7408, "step": 1230 }, { "epoch": 3.815384615384615, "grad_norm": 1.109423279762268, "learning_rate": 7.312727272727274e-05, "loss": 1.3897, "step": 1240 }, { "epoch": 3.8461538461538463, "grad_norm": 1.1412984132766724, "learning_rate": 7.276363636363637e-05, "loss": 1.5034, "step": 1250 }, { "epoch": 3.876923076923077, "grad_norm": 1.176283359527588, "learning_rate": 7.24e-05, "loss": 1.5426, "step": 1260 }, { "epoch": 3.9076923076923076, "grad_norm": 1.3558485507965088, "learning_rate": 7.203636363636364e-05, "loss": 1.5287, "step": 1270 }, { "epoch": 3.9384615384615387, "grad_norm": 1.23688805103302, "learning_rate": 7.167272727272729e-05, "loss": 1.5794, "step": 1280 }, { "epoch": 3.9692307692307693, "grad_norm": 1.248528242111206, "learning_rate": 7.130909090909092e-05, "loss": 1.5166, "step": 1290 }, { "epoch": 4.0, "grad_norm": 0.8396208882331848, "learning_rate": 7.094545454545455e-05, "loss": 1.4648, "step": 1300 }, { "epoch": 4.030769230769231, "grad_norm": 1.0756162405014038, "learning_rate": 7.058181818181819e-05, "loss": 1.4147, "step": 1310 }, { "epoch": 4.061538461538461, "grad_norm": 1.1731723546981812, "learning_rate": 7.021818181818182e-05, "loss": 1.3732, "step": 1320 }, { "epoch": 4.092307692307692, "grad_norm": 1.2993149757385254, "learning_rate": 6.985454545454546e-05, "loss": 1.4118, "step": 1330 }, { "epoch": 4.123076923076923, "grad_norm": 1.1648204326629639, "learning_rate": 6.949090909090909e-05, "loss": 1.4258, "step": 1340 }, { "epoch": 4.153846153846154, "grad_norm": 1.1242045164108276, "learning_rate": 6.912727272727274e-05, "loss": 1.3598, "step": 1350 }, { "epoch": 4.184615384615385, "grad_norm": 1.53397536277771, "learning_rate": 6.876363636363637e-05, "loss": 1.3431, "step": 1360 }, { "epoch": 4.2153846153846155, "grad_norm": 1.3859331607818604, "learning_rate": 6.840000000000001e-05, "loss": 1.5014, "step": 1370 }, { "epoch": 4.246153846153846, "grad_norm": 1.0821737051010132, "learning_rate": 6.803636363636364e-05, "loss": 1.2638, "step": 1380 }, { "epoch": 4.276923076923077, "grad_norm": 1.2648742198944092, "learning_rate": 6.767272727272728e-05, "loss": 1.4567, "step": 1390 }, { "epoch": 4.3076923076923075, "grad_norm": 1.1357372999191284, "learning_rate": 6.730909090909091e-05, "loss": 1.4706, "step": 1400 }, { "epoch": 4.3076923076923075, "eval_gen_len": 187.0146, "eval_loss": 1.6691502332687378, "eval_rouge1": 0.6061, "eval_rouge2": 0.2593, "eval_rougeL": 0.4719, "eval_runtime": 33.439, "eval_samples_per_second": 8.194, "eval_steps_per_second": 2.063, "step": 1400 }, { "epoch": 4.338461538461538, "grad_norm": 1.4324264526367188, "learning_rate": 6.694545454545454e-05, "loss": 1.4032, "step": 1410 }, { "epoch": 4.36923076923077, "grad_norm": 1.3646095991134644, "learning_rate": 6.658181818181819e-05, "loss": 1.3943, "step": 1420 }, { "epoch": 4.4, "grad_norm": 0.9991398453712463, "learning_rate": 6.621818181818183e-05, "loss": 1.5292, "step": 1430 }, { "epoch": 4.430769230769231, "grad_norm": 1.1873986721038818, "learning_rate": 6.585454545454546e-05, "loss": 1.4813, "step": 1440 }, { "epoch": 4.461538461538462, "grad_norm": 1.0080267190933228, "learning_rate": 6.549090909090909e-05, "loss": 1.4951, "step": 1450 }, { "epoch": 4.492307692307692, "grad_norm": 1.5542734861373901, "learning_rate": 6.512727272727272e-05, "loss": 1.5603, "step": 1460 }, { "epoch": 4.523076923076923, "grad_norm": 1.2610498666763306, "learning_rate": 6.476363636363638e-05, "loss": 1.3286, "step": 1470 }, { "epoch": 4.553846153846154, "grad_norm": 1.0882760286331177, "learning_rate": 6.440000000000001e-05, "loss": 1.309, "step": 1480 }, { "epoch": 4.584615384615384, "grad_norm": 1.1589834690093994, "learning_rate": 6.403636363636364e-05, "loss": 1.414, "step": 1490 }, { "epoch": 4.615384615384615, "grad_norm": 1.3731642961502075, "learning_rate": 6.367272727272727e-05, "loss": 1.4529, "step": 1500 }, { "epoch": 4.6461538461538465, "grad_norm": 1.3053221702575684, "learning_rate": 6.330909090909091e-05, "loss": 1.4623, "step": 1510 }, { "epoch": 4.676923076923077, "grad_norm": 1.2154396772384644, "learning_rate": 6.294545454545455e-05, "loss": 1.4766, "step": 1520 }, { "epoch": 4.707692307692308, "grad_norm": 1.0947812795639038, "learning_rate": 6.258181818181818e-05, "loss": 1.3212, "step": 1530 }, { "epoch": 4.7384615384615385, "grad_norm": 1.005462646484375, "learning_rate": 6.221818181818181e-05, "loss": 1.3956, "step": 1540 }, { "epoch": 4.769230769230769, "grad_norm": 1.196108341217041, "learning_rate": 6.185454545454546e-05, "loss": 1.4048, "step": 1550 }, { "epoch": 4.8, "grad_norm": 1.355747103691101, "learning_rate": 6.14909090909091e-05, "loss": 1.3474, "step": 1560 }, { "epoch": 4.8307692307692305, "grad_norm": 1.177310585975647, "learning_rate": 6.112727272727273e-05, "loss": 1.3038, "step": 1570 }, { "epoch": 4.861538461538462, "grad_norm": 1.273474097251892, "learning_rate": 6.076363636363637e-05, "loss": 1.3858, "step": 1580 }, { "epoch": 4.892307692307693, "grad_norm": 1.2601613998413086, "learning_rate": 6.04e-05, "loss": 1.2742, "step": 1590 }, { "epoch": 4.923076923076923, "grad_norm": 1.052040696144104, "learning_rate": 6.0036363636363634e-05, "loss": 1.5551, "step": 1600 }, { "epoch": 4.923076923076923, "eval_gen_len": 187.0146, "eval_loss": 1.6678508520126343, "eval_rouge1": 0.6061, "eval_rouge2": 0.2582, "eval_rougeL": 0.4724, "eval_runtime": 33.608, "eval_samples_per_second": 8.153, "eval_steps_per_second": 2.053, "step": 1600 }, { "epoch": 4.953846153846154, "grad_norm": 1.4437050819396973, "learning_rate": 5.967272727272728e-05, "loss": 1.3613, "step": 1610 }, { "epoch": 4.984615384615385, "grad_norm": 1.398398518562317, "learning_rate": 5.9309090909090915e-05, "loss": 1.3861, "step": 1620 }, { "epoch": 5.015384615384615, "grad_norm": 1.1901689767837524, "learning_rate": 5.894545454545455e-05, "loss": 1.4525, "step": 1630 }, { "epoch": 5.046153846153846, "grad_norm": 1.2631349563598633, "learning_rate": 5.858181818181818e-05, "loss": 1.3698, "step": 1640 }, { "epoch": 5.076923076923077, "grad_norm": 1.3628337383270264, "learning_rate": 5.821818181818182e-05, "loss": 1.1917, "step": 1650 }, { "epoch": 5.107692307692307, "grad_norm": 1.2330440282821655, "learning_rate": 5.785454545454546e-05, "loss": 1.3692, "step": 1660 }, { "epoch": 5.138461538461539, "grad_norm": 1.069877028465271, "learning_rate": 5.74909090909091e-05, "loss": 1.2439, "step": 1670 }, { "epoch": 5.1692307692307695, "grad_norm": 1.1694751977920532, "learning_rate": 5.712727272727273e-05, "loss": 1.1194, "step": 1680 }, { "epoch": 5.2, "grad_norm": 1.424668312072754, "learning_rate": 5.6763636363636365e-05, "loss": 1.456, "step": 1690 }, { "epoch": 5.230769230769231, "grad_norm": 1.1166226863861084, "learning_rate": 5.6399999999999995e-05, "loss": 1.3011, "step": 1700 }, { "epoch": 5.2615384615384615, "grad_norm": 1.196712851524353, "learning_rate": 5.6036363636363646e-05, "loss": 1.3272, "step": 1710 }, { "epoch": 5.292307692307692, "grad_norm": 1.105592131614685, "learning_rate": 5.5672727272727276e-05, "loss": 1.3831, "step": 1720 }, { "epoch": 5.323076923076923, "grad_norm": 1.3789408206939697, "learning_rate": 5.530909090909091e-05, "loss": 1.3924, "step": 1730 }, { "epoch": 5.3538461538461535, "grad_norm": 1.1058343648910522, "learning_rate": 5.494545454545454e-05, "loss": 1.1278, "step": 1740 }, { "epoch": 5.384615384615385, "grad_norm": 1.0470004081726074, "learning_rate": 5.458181818181819e-05, "loss": 1.2675, "step": 1750 }, { "epoch": 5.415384615384616, "grad_norm": 1.5735996961593628, "learning_rate": 5.421818181818182e-05, "loss": 1.368, "step": 1760 }, { "epoch": 5.446153846153846, "grad_norm": 1.053110122680664, "learning_rate": 5.385454545454546e-05, "loss": 1.4409, "step": 1770 }, { "epoch": 5.476923076923077, "grad_norm": 1.2032136917114258, "learning_rate": 5.349090909090909e-05, "loss": 1.3919, "step": 1780 }, { "epoch": 5.507692307692308, "grad_norm": 1.3398489952087402, "learning_rate": 5.3127272727272726e-05, "loss": 1.3891, "step": 1790 }, { "epoch": 5.538461538461538, "grad_norm": 1.1674134731292725, "learning_rate": 5.276363636363637e-05, "loss": 1.4167, "step": 1800 }, { "epoch": 5.538461538461538, "eval_gen_len": 187.0146, "eval_loss": 1.6820106506347656, "eval_rouge1": 0.6051, "eval_rouge2": 0.256, "eval_rougeL": 0.4705, "eval_runtime": 34.3835, "eval_samples_per_second": 7.969, "eval_steps_per_second": 2.007, "step": 1800 }, { "epoch": 5.569230769230769, "grad_norm": 1.3142492771148682, "learning_rate": 5.2400000000000007e-05, "loss": 1.2869, "step": 1810 }, { "epoch": 5.6, "grad_norm": 1.3149932622909546, "learning_rate": 5.2036363636363637e-05, "loss": 1.3653, "step": 1820 }, { "epoch": 5.63076923076923, "grad_norm": 1.1139847040176392, "learning_rate": 5.167272727272727e-05, "loss": 1.3807, "step": 1830 }, { "epoch": 5.661538461538462, "grad_norm": 1.302495002746582, "learning_rate": 5.130909090909091e-05, "loss": 1.3826, "step": 1840 }, { "epoch": 5.6923076923076925, "grad_norm": 1.2617419958114624, "learning_rate": 5.0945454545454554e-05, "loss": 1.283, "step": 1850 }, { "epoch": 5.723076923076923, "grad_norm": 1.252189040184021, "learning_rate": 5.0581818181818184e-05, "loss": 1.3425, "step": 1860 }, { "epoch": 5.753846153846154, "grad_norm": 1.0823620557785034, "learning_rate": 5.021818181818182e-05, "loss": 1.4646, "step": 1870 }, { "epoch": 5.7846153846153845, "grad_norm": 1.39573335647583, "learning_rate": 4.985454545454546e-05, "loss": 1.2957, "step": 1880 }, { "epoch": 5.815384615384615, "grad_norm": 1.217499852180481, "learning_rate": 4.9490909090909094e-05, "loss": 1.3232, "step": 1890 }, { "epoch": 5.846153846153846, "grad_norm": 1.3049825429916382, "learning_rate": 4.912727272727273e-05, "loss": 1.333, "step": 1900 }, { "epoch": 5.876923076923077, "grad_norm": 1.265807032585144, "learning_rate": 4.876363636363637e-05, "loss": 1.2635, "step": 1910 }, { "epoch": 5.907692307692308, "grad_norm": 1.4045813083648682, "learning_rate": 4.8400000000000004e-05, "loss": 1.3453, "step": 1920 }, { "epoch": 5.938461538461539, "grad_norm": 1.4151256084442139, "learning_rate": 4.803636363636364e-05, "loss": 1.2274, "step": 1930 }, { "epoch": 5.969230769230769, "grad_norm": 1.30918288230896, "learning_rate": 4.767272727272728e-05, "loss": 1.3835, "step": 1940 }, { "epoch": 6.0, "grad_norm": 1.1394106149673462, "learning_rate": 4.7309090909090914e-05, "loss": 1.4773, "step": 1950 }, { "epoch": 6.030769230769231, "grad_norm": 1.0991594791412354, "learning_rate": 4.694545454545455e-05, "loss": 1.1885, "step": 1960 }, { "epoch": 6.061538461538461, "grad_norm": 1.2676807641983032, "learning_rate": 4.658181818181818e-05, "loss": 1.2931, "step": 1970 }, { "epoch": 6.092307692307692, "grad_norm": 1.2844555377960205, "learning_rate": 4.6218181818181825e-05, "loss": 1.2758, "step": 1980 }, { "epoch": 6.123076923076923, "grad_norm": 1.2364481687545776, "learning_rate": 4.5854545454545455e-05, "loss": 1.1871, "step": 1990 }, { "epoch": 6.153846153846154, "grad_norm": 1.4142909049987793, "learning_rate": 4.54909090909091e-05, "loss": 1.2976, "step": 2000 }, { "epoch": 6.153846153846154, "eval_gen_len": 187.0146, "eval_loss": 1.6960315704345703, "eval_rouge1": 0.6074, "eval_rouge2": 0.2564, "eval_rougeL": 0.4698, "eval_runtime": 33.4283, "eval_samples_per_second": 8.197, "eval_steps_per_second": 2.064, "step": 2000 }, { "epoch": 6.184615384615385, "grad_norm": 1.234192132949829, "learning_rate": 4.512727272727273e-05, "loss": 1.3096, "step": 2010 }, { "epoch": 6.2153846153846155, "grad_norm": 1.1222543716430664, "learning_rate": 4.4763636363636365e-05, "loss": 1.2937, "step": 2020 }, { "epoch": 6.246153846153846, "grad_norm": 1.0864425897598267, "learning_rate": 4.44e-05, "loss": 1.278, "step": 2030 }, { "epoch": 6.276923076923077, "grad_norm": 1.3431516885757446, "learning_rate": 4.403636363636364e-05, "loss": 1.2601, "step": 2040 }, { "epoch": 6.3076923076923075, "grad_norm": 1.2604031562805176, "learning_rate": 4.3672727272727275e-05, "loss": 1.3587, "step": 2050 }, { "epoch": 6.338461538461538, "grad_norm": 1.4237326383590698, "learning_rate": 4.330909090909091e-05, "loss": 1.3936, "step": 2060 }, { "epoch": 6.36923076923077, "grad_norm": 1.2190274000167847, "learning_rate": 4.294545454545455e-05, "loss": 1.2805, "step": 2070 }, { "epoch": 6.4, "grad_norm": 1.3023786544799805, "learning_rate": 4.2581818181818186e-05, "loss": 1.2676, "step": 2080 }, { "epoch": 6.430769230769231, "grad_norm": 1.2170274257659912, "learning_rate": 4.2218181818181816e-05, "loss": 1.3696, "step": 2090 }, { "epoch": 6.461538461538462, "grad_norm": 1.613784670829773, "learning_rate": 4.185454545454546e-05, "loss": 1.28, "step": 2100 }, { "epoch": 6.492307692307692, "grad_norm": 1.2165530920028687, "learning_rate": 4.149090909090909e-05, "loss": 1.2933, "step": 2110 }, { "epoch": 6.523076923076923, "grad_norm": 1.2213079929351807, "learning_rate": 4.112727272727273e-05, "loss": 1.2062, "step": 2120 }, { "epoch": 6.553846153846154, "grad_norm": 1.5889174938201904, "learning_rate": 4.076363636363636e-05, "loss": 1.2481, "step": 2130 }, { "epoch": 6.584615384615384, "grad_norm": 1.2638423442840576, "learning_rate": 4.0400000000000006e-05, "loss": 1.29, "step": 2140 }, { "epoch": 6.615384615384615, "grad_norm": 1.0796576738357544, "learning_rate": 4.0036363636363636e-05, "loss": 1.1189, "step": 2150 }, { "epoch": 6.6461538461538465, "grad_norm": 1.5910948514938354, "learning_rate": 3.967272727272727e-05, "loss": 1.2767, "step": 2160 }, { "epoch": 6.676923076923077, "grad_norm": 1.3346668481826782, "learning_rate": 3.930909090909091e-05, "loss": 1.2212, "step": 2170 }, { "epoch": 6.707692307692308, "grad_norm": 1.1277836561203003, "learning_rate": 3.8945454545454547e-05, "loss": 1.2392, "step": 2180 }, { "epoch": 6.7384615384615385, "grad_norm": 1.1212108135223389, "learning_rate": 3.858181818181818e-05, "loss": 1.3223, "step": 2190 }, { "epoch": 6.769230769230769, "grad_norm": 1.2175902128219604, "learning_rate": 3.821818181818182e-05, "loss": 1.3124, "step": 2200 }, { "epoch": 6.769230769230769, "eval_gen_len": 187.0146, "eval_loss": 1.6920864582061768, "eval_rouge1": 0.606, "eval_rouge2": 0.2554, "eval_rougeL": 0.4694, "eval_runtime": 34.3853, "eval_samples_per_second": 7.969, "eval_steps_per_second": 2.007, "step": 2200 }, { "epoch": 6.8, "grad_norm": 1.163404107093811, "learning_rate": 3.785454545454546e-05, "loss": 1.4557, "step": 2210 }, { "epoch": 6.8307692307692305, "grad_norm": 1.5031542778015137, "learning_rate": 3.7490909090909094e-05, "loss": 1.1773, "step": 2220 }, { "epoch": 6.861538461538462, "grad_norm": 1.0126748085021973, "learning_rate": 3.712727272727273e-05, "loss": 1.3922, "step": 2230 }, { "epoch": 6.892307692307693, "grad_norm": 1.0135952234268188, "learning_rate": 3.676363636363637e-05, "loss": 1.2028, "step": 2240 }, { "epoch": 6.923076923076923, "grad_norm": 1.16098153591156, "learning_rate": 3.6400000000000004e-05, "loss": 1.4111, "step": 2250 }, { "epoch": 6.953846153846154, "grad_norm": 1.488234519958496, "learning_rate": 3.603636363636364e-05, "loss": 1.3131, "step": 2260 }, { "epoch": 6.984615384615385, "grad_norm": 1.129989743232727, "learning_rate": 3.567272727272728e-05, "loss": 1.1701, "step": 2270 }, { "epoch": 7.015384615384615, "grad_norm": 1.218468189239502, "learning_rate": 3.530909090909091e-05, "loss": 1.2604, "step": 2280 }, { "epoch": 7.046153846153846, "grad_norm": 1.2339926958084106, "learning_rate": 3.494545454545455e-05, "loss": 1.0932, "step": 2290 }, { "epoch": 7.076923076923077, "grad_norm": 1.4972765445709229, "learning_rate": 3.458181818181818e-05, "loss": 1.1137, "step": 2300 }, { "epoch": 7.107692307692307, "grad_norm": 1.1884584426879883, "learning_rate": 3.4218181818181824e-05, "loss": 1.1522, "step": 2310 }, { "epoch": 7.138461538461539, "grad_norm": 1.4934840202331543, "learning_rate": 3.3854545454545454e-05, "loss": 1.3121, "step": 2320 }, { "epoch": 7.1692307692307695, "grad_norm": 1.1432678699493408, "learning_rate": 3.34909090909091e-05, "loss": 1.1549, "step": 2330 }, { "epoch": 7.2, "grad_norm": 1.1708807945251465, "learning_rate": 3.312727272727273e-05, "loss": 1.1692, "step": 2340 }, { "epoch": 7.230769230769231, "grad_norm": 1.3824517726898193, "learning_rate": 3.2763636363636365e-05, "loss": 1.208, "step": 2350 }, { "epoch": 7.2615384615384615, "grad_norm": 1.1225407123565674, "learning_rate": 3.24e-05, "loss": 1.1542, "step": 2360 }, { "epoch": 7.292307692307692, "grad_norm": 1.2445507049560547, "learning_rate": 3.203636363636364e-05, "loss": 1.2265, "step": 2370 }, { "epoch": 7.323076923076923, "grad_norm": 1.256062626838684, "learning_rate": 3.1672727272727275e-05, "loss": 1.1822, "step": 2380 }, { "epoch": 7.3538461538461535, "grad_norm": 1.3986501693725586, "learning_rate": 3.130909090909091e-05, "loss": 1.262, "step": 2390 }, { "epoch": 7.384615384615385, "grad_norm": 1.1086236238479614, "learning_rate": 3.094545454545455e-05, "loss": 1.2275, "step": 2400 }, { "epoch": 7.384615384615385, "eval_gen_len": 187.0146, "eval_loss": 1.6998823881149292, "eval_rouge1": 0.6055, "eval_rouge2": 0.2541, "eval_rougeL": 0.4684, "eval_runtime": 33.5145, "eval_samples_per_second": 8.176, "eval_steps_per_second": 2.059, "step": 2400 }, { "epoch": 7.415384615384616, "grad_norm": 1.5682780742645264, "learning_rate": 3.0581818181818185e-05, "loss": 1.3442, "step": 2410 }, { "epoch": 7.446153846153846, "grad_norm": 1.034818410873413, "learning_rate": 3.021818181818182e-05, "loss": 1.2158, "step": 2420 }, { "epoch": 7.476923076923077, "grad_norm": 1.2816352844238281, "learning_rate": 2.985454545454546e-05, "loss": 1.1646, "step": 2430 }, { "epoch": 7.507692307692308, "grad_norm": 1.252765417098999, "learning_rate": 2.9490909090909092e-05, "loss": 1.1985, "step": 2440 }, { "epoch": 7.538461538461538, "grad_norm": 1.4074809551239014, "learning_rate": 2.9127272727272732e-05, "loss": 1.3245, "step": 2450 }, { "epoch": 7.569230769230769, "grad_norm": 1.3757801055908203, "learning_rate": 2.8763636363636366e-05, "loss": 1.2856, "step": 2460 }, { "epoch": 7.6, "grad_norm": 1.355635643005371, "learning_rate": 2.84e-05, "loss": 1.229, "step": 2470 }, { "epoch": 7.63076923076923, "grad_norm": 1.185659646987915, "learning_rate": 2.803636363636364e-05, "loss": 1.2444, "step": 2480 }, { "epoch": 7.661538461538462, "grad_norm": 1.4726060628890991, "learning_rate": 2.7672727272727273e-05, "loss": 1.2877, "step": 2490 }, { "epoch": 7.6923076923076925, "grad_norm": 1.525718092918396, "learning_rate": 2.7309090909090913e-05, "loss": 1.2993, "step": 2500 }, { "epoch": 7.723076923076923, "grad_norm": 1.2432451248168945, "learning_rate": 2.6945454545454546e-05, "loss": 1.2803, "step": 2510 }, { "epoch": 7.753846153846154, "grad_norm": 1.2237839698791504, "learning_rate": 2.6581818181818186e-05, "loss": 1.2785, "step": 2520 }, { "epoch": 7.7846153846153845, "grad_norm": 1.5360924005508423, "learning_rate": 2.621818181818182e-05, "loss": 1.3164, "step": 2530 }, { "epoch": 7.815384615384615, "grad_norm": 1.4242442846298218, "learning_rate": 2.5854545454545453e-05, "loss": 1.1864, "step": 2540 }, { "epoch": 7.846153846153846, "grad_norm": 1.0664770603179932, "learning_rate": 2.5490909090909093e-05, "loss": 1.2769, "step": 2550 }, { "epoch": 7.876923076923077, "grad_norm": 1.3427962064743042, "learning_rate": 2.5127272727272727e-05, "loss": 1.183, "step": 2560 }, { "epoch": 7.907692307692308, "grad_norm": 1.2692787647247314, "learning_rate": 2.4763636363636363e-05, "loss": 1.3775, "step": 2570 }, { "epoch": 7.938461538461539, "grad_norm": 1.3220490217208862, "learning_rate": 2.44e-05, "loss": 1.1557, "step": 2580 }, { "epoch": 7.969230769230769, "grad_norm": 1.31517493724823, "learning_rate": 2.4036363636363637e-05, "loss": 1.3144, "step": 2590 }, { "epoch": 8.0, "grad_norm": 1.145848274230957, "learning_rate": 2.3672727272727274e-05, "loss": 1.4194, "step": 2600 }, { "epoch": 8.0, "eval_gen_len": 187.0146, "eval_loss": 1.6980103254318237, "eval_rouge1": 0.6066, "eval_rouge2": 0.2565, "eval_rougeL": 0.4699, "eval_runtime": 33.5275, "eval_samples_per_second": 8.172, "eval_steps_per_second": 2.058, "step": 2600 }, { "epoch": 8.03076923076923, "grad_norm": 1.3528770208358765, "learning_rate": 2.330909090909091e-05, "loss": 1.2504, "step": 2610 }, { "epoch": 8.061538461538461, "grad_norm": 1.1651642322540283, "learning_rate": 2.2945454545454547e-05, "loss": 1.0993, "step": 2620 }, { "epoch": 8.092307692307692, "grad_norm": 1.1845202445983887, "learning_rate": 2.258181818181818e-05, "loss": 1.1356, "step": 2630 }, { "epoch": 8.123076923076923, "grad_norm": 1.2000699043273926, "learning_rate": 2.2218181818181817e-05, "loss": 1.206, "step": 2640 }, { "epoch": 8.153846153846153, "grad_norm": 1.449044108390808, "learning_rate": 2.1854545454545454e-05, "loss": 1.2059, "step": 2650 }, { "epoch": 8.184615384615384, "grad_norm": 1.2176152467727661, "learning_rate": 2.149090909090909e-05, "loss": 1.1849, "step": 2660 }, { "epoch": 8.215384615384615, "grad_norm": 1.4765113592147827, "learning_rate": 2.1127272727272728e-05, "loss": 1.3351, "step": 2670 }, { "epoch": 8.246153846153845, "grad_norm": 1.5038341283798218, "learning_rate": 2.0763636363636364e-05, "loss": 1.2766, "step": 2680 }, { "epoch": 8.276923076923078, "grad_norm": 1.3483731746673584, "learning_rate": 2.04e-05, "loss": 1.1067, "step": 2690 }, { "epoch": 8.307692307692308, "grad_norm": 1.025032639503479, "learning_rate": 2.0036363636363638e-05, "loss": 1.2155, "step": 2700 }, { "epoch": 8.338461538461539, "grad_norm": 1.3824971914291382, "learning_rate": 1.9672727272727275e-05, "loss": 1.1767, "step": 2710 }, { "epoch": 8.36923076923077, "grad_norm": 1.2280523777008057, "learning_rate": 1.930909090909091e-05, "loss": 1.2881, "step": 2720 }, { "epoch": 8.4, "grad_norm": 1.2223644256591797, "learning_rate": 1.8945454545454548e-05, "loss": 1.1898, "step": 2730 }, { "epoch": 8.430769230769231, "grad_norm": 1.349334955215454, "learning_rate": 1.8581818181818185e-05, "loss": 1.2984, "step": 2740 }, { "epoch": 8.461538461538462, "grad_norm": 1.2894556522369385, "learning_rate": 1.8218181818181822e-05, "loss": 1.3458, "step": 2750 }, { "epoch": 8.492307692307692, "grad_norm": 1.6086795330047607, "learning_rate": 1.7854545454545455e-05, "loss": 1.2394, "step": 2760 }, { "epoch": 8.523076923076923, "grad_norm": 1.4955778121948242, "learning_rate": 1.7490909090909092e-05, "loss": 1.1506, "step": 2770 }, { "epoch": 8.553846153846154, "grad_norm": 1.4156781435012817, "learning_rate": 1.712727272727273e-05, "loss": 1.1746, "step": 2780 }, { "epoch": 8.584615384615384, "grad_norm": 1.2073826789855957, "learning_rate": 1.6763636363636365e-05, "loss": 1.1301, "step": 2790 }, { "epoch": 8.615384615384615, "grad_norm": 1.574342966079712, "learning_rate": 1.6400000000000002e-05, "loss": 1.3976, "step": 2800 }, { "epoch": 8.615384615384615, "eval_gen_len": 187.0146, "eval_loss": 1.7040081024169922, "eval_rouge1": 0.6049, "eval_rouge2": 0.2551, "eval_rougeL": 0.469, "eval_runtime": 34.1499, "eval_samples_per_second": 8.023, "eval_steps_per_second": 2.021, "step": 2800 }, { "epoch": 8.646153846153846, "grad_norm": 1.207323670387268, "learning_rate": 1.603636363636364e-05, "loss": 1.2123, "step": 2810 }, { "epoch": 8.676923076923076, "grad_norm": 1.346170425415039, "learning_rate": 1.5672727272727272e-05, "loss": 1.1284, "step": 2820 }, { "epoch": 8.707692307692307, "grad_norm": 1.3920952081680298, "learning_rate": 1.530909090909091e-05, "loss": 1.1961, "step": 2830 }, { "epoch": 8.73846153846154, "grad_norm": 1.4912207126617432, "learning_rate": 1.4945454545454546e-05, "loss": 1.2558, "step": 2840 }, { "epoch": 8.76923076923077, "grad_norm": 0.9997207522392273, "learning_rate": 1.4581818181818183e-05, "loss": 1.1842, "step": 2850 }, { "epoch": 8.8, "grad_norm": 1.207138180732727, "learning_rate": 1.421818181818182e-05, "loss": 1.2588, "step": 2860 }, { "epoch": 8.830769230769231, "grad_norm": 1.398917555809021, "learning_rate": 1.3854545454545456e-05, "loss": 1.2445, "step": 2870 }, { "epoch": 8.861538461538462, "grad_norm": 1.3793071508407593, "learning_rate": 1.3490909090909093e-05, "loss": 1.164, "step": 2880 }, { "epoch": 8.892307692307693, "grad_norm": 1.2650920152664185, "learning_rate": 1.3127272727272726e-05, "loss": 1.2235, "step": 2890 }, { "epoch": 8.923076923076923, "grad_norm": 1.3319740295410156, "learning_rate": 1.2763636363636363e-05, "loss": 1.1818, "step": 2900 }, { "epoch": 8.953846153846154, "grad_norm": 1.395668387413025, "learning_rate": 1.24e-05, "loss": 1.2237, "step": 2910 }, { "epoch": 8.984615384615385, "grad_norm": 1.1730422973632812, "learning_rate": 1.2036363636363637e-05, "loss": 1.0141, "step": 2920 } ], "logging_steps": 10, "max_steps": 3250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7124807319552000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }